<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comput. Neurosci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Computational Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comput. Neurosci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1662-5188</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fncom.2026.1626315</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Deep learning based approach for Behavior classification in diagnoses of Autism Spectrum Disorder using naturalistic videos</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Jabbar</surname> <given-names>Usama</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/2946093"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Waseem Iqbal</surname> <given-names>Muhammad</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Nechifor</surname> <given-names>Alexandru</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3063172"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Abaker</surname> <given-names>Mohammed</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Khairalseed</surname> <given-names>Mohammed Ahmed</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Antohi</surname> <given-names>Valentin Marian</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/1804298"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Fortea</surname> <given-names>Costinela</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/2643780"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Stefanescu</surname> <given-names>Catalin Aurelian</given-names></name>
<xref ref-type="aff" rid="aff8"><sup>8</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Computer Science, Superior University</institution>, <city>Lahore</city>, <country country="pk">Pakistan</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Software Engineering, Superior University</institution>, <city>Lahore</city>, <country country="pk">Pakistan</country></aff>
<aff id="aff3"><label>3</label><institution>Department of Individual Sports and Physiotherapy, Faculty of Physical Education and Sport, Dunarea de Jos University</institution>, <city>Gala&#x00163;i</city>, <country country="ro">Romania</country></aff>
<aff id="aff4"><label>4</label><institution>Applied College, King Khalid University</institution>, <city>Muhayil</city>, <country country="sa">Saudi Arabia</country></aff>
<aff id="aff5"><label>5</label><institution>Faculty of Education, University of Gadarif</institution>, <city>Gadarif</city>, <country country="sd">Sudan</country></aff>
<aff id="aff6"><label>6</label><institution>Department of Business Administration, Dunarea de Jos University of Galati</institution>, <city>Gala&#x021B;i</city>, <country country="ro">Romania</country></aff>
<aff id="aff7"><label>7</label><institution>Department of Finance, Accounting and Economic Theory, Transilvania University of Brasov</institution>, <city>Brasov</city>, <country country="ro">Romania</country></aff>
<aff id="aff8"><label>8</label><institution>Department of Sports Games and Physical Education, Faculty of Physical Education and Sport, Dunarea de Jos University of Galati</institution>, <city>Gala&#x021B;i</city>, <country country="ro">Romania</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Alexandru Nechifor, <email xlink:href="mailto:alexandrunechiformed@yahoo.com">alexandrunechiformed@yahoo.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-18">
<day>18</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>20</volume>
<elocation-id>1626315</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>09</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>19</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Jabbar, Waseem Iqbal, Nechifor, Abaker, Khairalseed, Antohi, Fortea and Stefanescu.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Jabbar, Waseem Iqbal, Nechifor, Abaker, Khairalseed, Antohi, Fortea and Stefanescu</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-18">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Autism Spectrum Disorder (ASD) is a neurodevelopmental disorder that is marked by a lack of communication skills in social situations and repetitive and stereotypical behaviors. The most widespread form of diagnosing ASD among children is based on psychological screening tests along with monitoring of behavioral patterns, especially repetitive behaviors. Some of these behaviors include hand-flapping, head banging and spinning, which are common among ASD children. In our research, we examine abnormal behavioral patterns that may reflect ASD through videos of children engaged in everyday activities in unstructured settings. A publicly available multiclass Self-Stimulatory Behavior Dataset (SSBD) is used to classify autistic behavior. Before training the model, the dataset is thoroughly pre-processed (region-of-interest (ROI) detection and image cropping to eliminate irrelevant background objects). Moreover, data-augmentation methods are used to reduce overfitting and increase training efficiency and generalization effectiveness. In order to obtain spatiotemporal details successfully, a number of deep learning models are tested, such as the studied CNN-GRU model, 3D-CNN &#x0002B; LSTM, MobileNet, VGG16, and EfficientNet-B7. The findings of the experiment prove that the proposed CNN-GRU model is superior to all competing methods. The model with k-fold cross-validation provides a steady accuracy of 0.9284 &#x000B1; 0.0039&#x02013;0.9294 &#x000B1; 0.0038, which means that the model is robust and consistent across the folds. The effectiveness of the proposed approach is additionally justified by the comparisons with state-of-the-art methods. The results show that systems based on action recognition can help clinicians monitor behavioral trends and facilitate the quick, accurate, and effective screening of ASD. 
The proposed approach works effectively in predicting behavior in real-life, uncontrolled videos and shows tremendous potential for real-world clinical implementation as a decision-support tool.</p></abstract>
<kwd-group>
<kwd>Autism Spectrum Disorder</kwd>
<kwd>Behavior analysis</kwd>
<kwd>CNN-GRU</kwd>
<kwd>deep learning</kwd>
<kwd>naturalistic video</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="11"/>
<table-count count="6"/>
<equation-count count="5"/>
<ref-count count="44"/>
<page-count count="15"/>
<word-count count="10160"/>
</counts>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Autism is a neural disease that influences the ability of a person to communicate effectively and do social interactions. It is a nervous behavior disorder, which is accompanied by repetitive actions, impairment of social interaction, communication, and language. A combination of these symptoms is known as autism (<xref ref-type="bibr" rid="B41">White et al., 2009</xref>; <xref ref-type="bibr" rid="B8">Cabanillas-Tello and Cabanillas-Carbonell, 2020</xref>; <xref ref-type="bibr" rid="B14">Kamran et al., 2022</xref>). Though the exact causes behind the rising rate of autism in children are not well understood, still, many of them still need intensive support and care throughout their life even when they are treated at early stages of the condition. However, early intervention is vital to the enhancement of long-term outcomes (<xref ref-type="bibr" rid="B7">Bilal et al., 2022</xref>). According to population-based studies, emotional and Behavioral issues in children with ASD can range from 40 to 50 percent and are clinically significant (<xref ref-type="bibr" rid="B38">Totsika et al., 2011</xref>). ASD children&#x00027;s diagnoses, severity levels, and skill assessments have largely been achieved through conventional diagnostic techniques. There are two traditional techniques, such as the rating scale and functional assessment, that are used for diagnosis and observing the Behavior of autistic patients. The rating scale method asks a series of questions and calculates the score. If the score exceeds the threshold value, the patient has autism. In-depth observations and assessments of children&#x00027;s abilities in various areas, including self-stimulatory Behavior, joint attention, independent play, social interaction, and the recognition of emotions from facial expressions, are part of the functional assessment. 
Determining how children feel is one of the challenges we have while working with them, particularly when it comes to autistic children who have a hard time adjusting to their surroundings. Using assistive technologies and figuring out how to make the most of the usage of technology and intelligent systems to help these kids is one way to solve this challenge (<xref ref-type="bibr" rid="B10">Case-Smith et al., 2015</xref>; <xref ref-type="bibr" rid="B30">Roane, 2016</xref>). Children with autism tend to be quiet, yet they can copy specific actions from cartoons and movies. As a result, they might act dangerously or unexpectedly (<xref ref-type="bibr" rid="B1">Abdel Hameed et al., 2022</xref>). However, there are several restrictions when employing traditional diagnostic and functional assessment techniques. First off, interpreting the observed Behavior of an autism patient is manual and takes time. Second, a clinician&#x00027;s observations may not always be trustworthy or valid due to variations in professional training, experience, resource availability, and cultural backgrounds. Employing AI (artificial intelligence) approaches for early identification and neurological evaluations of ASD has been shown to have substantial advantages (<xref ref-type="bibr" rid="B5">Alwakeel et al., 2015</xref>). In the past era, facial analysis and computer vision-based Behavior imaging have demonstrated promising outcomes in aiding doctors in the identification of a variety of medical problems, including ASD (<xref ref-type="bibr" rid="B17">Kohli et al., 2022</xref>). Even though computer vision has shown many promising applications, its use in evaluating Behavior, play, imitation, life skills, posture, and gait analysis to measure the joint attention of ASD children has not yet been investigated (<xref ref-type="bibr" rid="B34">Su et al., 2020</xref>). 
Computer vision-based intelligent activity monitoring and irregularity prediction in real-time can offer a trustworthy environment of assistance for those with mental or physical impairments. The ability to recognize activity has been substantially revolutionized by combining cutting-edge data processing techniques with computer vision monitoring concepts (<xref ref-type="bibr" rid="B16">Klintwall and Eikeseth, 2014</xref>; <xref ref-type="bibr" rid="B42">Ye et al., 2013</xref>). Despite the many potential solutions that computer vision has shown in different fields, in the context of autism Behavior monitoring and classification of abnormal Behavior has not yet been investigated. The purpose of this study is to create a computer vision-based monitoring framework that can classify different Behavioral actions from naturalistic videos of children with ASD, helping the clinician make an ASD diagnosis and monitor their Behavior. We do not intend to conduct clinical diagnosis of ASD in this study, but instead, we aim at examining observable behavioral cues in unconstrained video data as positive indicators, which may help in primary screening. The suggested video-based framework will be used as a supplement to the current clinical assessment protocol, which will offer an automated assistive screening tool which may then be used to bring to attention the behavioral trends of interest to be investigated further by the respective professionals. The given approach can be especially helpful in the context of large-scale screening or resource-intensive settings, where the number of trained clinicians is scarce. The ultimate diagnosis and interpretation must be left to a competent healthcare system and a suggested method is to be considered as a clinical decision support system and not an independent diagnostic system.</p>
<p>The objective of the study is as follows:</p>
<list list-type="bullet">
<list-item><p>The primary focus of this study is to propose a monitoring framework to assess their behavior using a deep learning model intelligently.</p></list-item>
<list-item><p>In this study, we proposed a CNN-GRU model to classify into four classes of autism behavior: arm flapping, spinning, head banging, and normal Behavior.</p></list-item>
<list-item><p>The effectiveness of the suggested approach has been assessed against various existing autism behavior monitoring approaches.</p></list-item>
</list>
<p>The remaining sections of the paper are arranged as follows. We will review the relevant work in Section 2. The CNN-based resources and techniques are explained in Section 3. Section 4 provides illustrations of the experimental findings, a comparative analysis, and discusses the paper&#x00027;s weaknesses and possible future study approaches. Section 5 presents the conclusion.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>This section thoroughly summarizes the available and frequently used techniques to examine child Behavior and identify autism. Researchers have suggested several approaches to analyze and monitor Behavior to find autism. Some research used observation and video analysis to study motor Behavior.</p>
<sec>
<label>2.1</label>
<title>Activity recognition based Behavior analysis</title>
<p>Activity recognition recognizes important occurrences in huge video collections. In order to identify clinically significant patterns from the photographs and videos and categorize engaging activities for ASD youngsters, machine learning (ML) and computer vision (CV) have enhanced several features of human visual perception (<xref ref-type="bibr" rid="B33">Stevens et al., 2019</xref>; <xref ref-type="bibr" rid="B44">Zhao et al., 2022</xref>; <xref ref-type="bibr" rid="B11">Gu et al., 2018</xref>). Marinoiu et al. presented one of the largest multimodal autistic interaction datasets. Additionally, they suggested a fine-tuned action and emotion classification based on data gathered from children with ASD during robot-assisted therapy sessions. Their findings demonstrated a good agreement between machine-predicted scores and expert human diagnosis (<xref ref-type="bibr" rid="B23">Marinoiu et al., 2018</xref>). Rehg et al. presented a novel action recognition dataset to analyze children&#x00027;s social and communicative Behaviors using video and audio data. Their early experimental findings showed this dataset&#x00027;s potential to facilitate multi-modal activity detection (<xref ref-type="bibr" rid="B29">Rehg et al., 2013</xref>). Washington et al. suggested a deep learning-based computer vision classifier for identifying head banging in home recording videos. They use a head banging detector to extract the target head posture from videos, and then use a CNN &#x0002B; LSTM architecture to analyze the head banging motion. The experimental results enhanced deep learning models by attaining an accuracy of 90% in correctly identifying head banging and no head banging (<xref ref-type="bibr" rid="B29">Rehg et al., 2013</xref>). They explored two methods and contrasted a bag-of-visual-words method with RNNs and CNNs. 
This demonstrates that deep learning architectures give outstanding results for detecting four activities: spinning, head banging, arm flapping, and other hand and arm movements (<xref ref-type="bibr" rid="B39">Washington et al., 2021</xref>). Ali et al. proposed a framework to recognize actions in videos of ASD children, using 3D Convolutional Neural Networks (3D-CNN), combined with target person identification and monitoring techniques. The experimental results show that deep learning models can attain an accuracy of 75% in correctly identifying autism Behavior actions in children (<xref ref-type="bibr" rid="B24">Negin et al., 2021</xref>). Rajagopalan et al. suggested the SSBD, which comprises videos of autistic kids carrying out everyday activities. They combined a histogram of optical flow alongside a histogram of dominating motions. Their proposed binary classification framework for headbanging and spinning has an accuracy of 86.6%, and for multi-classification, headbanging, arm flapping, and spinning have an accuracy of 76.3% (<xref ref-type="bibr" rid="B28">Rajagopalan and Goecke, 2014</xref>). Lakkapragada et al. proposed a deep learning-based model to classify autistic children abnormal hand movement and control hand movement in autistic children. This research aims to show that deep learning algorithms can successfully identify hand flapping in uncontrolled home videos. His work used the SSBD dataset and the deep learning model MobileNet. The experimental results show that the model has attained the highest accuracy of 84.0% (<xref ref-type="bibr" rid="B18">Lakkapragada et al., 2022</xref>). Tang et al. proposed a deep learning approach to detect emotion using smile facial expression. In this study, he presented the RCLA&#x00026;NBH_Smile dataset, a novel dataset. Thirty-four baby face expression recordings of their interactions with their mothers were captured, and more than 77,000 frames were manually labeled. 
The experimental results show that the proposed model has attained the highest accuracy of 87.16% (<xref ref-type="bibr" rid="B36">Tang et al., 2018</xref>). Manocha et al. suggested a system for monitoring autistic children&#x00027;s physical activity to identify abnormalities. His study proposes an activity prediction algorithm constructed using a deep 3D CNN and LSTM for detecting physical anomalies. The experimental results show that the suggested approach has attained an accuracy of 92.89% (<xref ref-type="bibr" rid="B22">Manocha and Singh, 2019</xref>). Ali et al. created a Behavior diagnostic paradigm for ASD. The stereotyped Behavior of the children was recorded in an uncontrolled setting during their ASD diagnosis, and they gathered and interpreted a set of these recordings. Children with ASD will be classified and their performance assessed using a multi-modality-based late fusion network. The findings showed that the proposed approach achieves better results and an accuracy of 85.6&#x02013;86.04% (<xref ref-type="bibr" rid="B3">Ali et al., 2022</xref>).</p>
</sec>
<sec>
<label>2.2</label>
<title>Facial expression-based behavioral analysis</title>
<p>People can communicate orally and nonverbally using facial expressions and eye contact. It can be distressing and lead to social anxiety for some individuals with ASD to maintain eye contact. It is difficult for kids with ASD to detect nonverbal clues, respond to them, and understand their gestures and emotions. Carpenter et al. used a trained computer vision model to extract positive, neutral, and various other facial features from a database of facial expressions. He argues that mimicking facial expressions is a crucial sign of social interaction abilities, supported by his discovery that kids with ASD exhibit more neutral facial expressions (<xref ref-type="bibr" rid="B9">Carpenter et al., 2021</xref>). He proposed a transfer learning-based approach to analyzing the facial features of autism patients. Their research showed that their method could more accurately identify the emotional expressions of children with ASD (<xref ref-type="bibr" rid="B12">Han et al., 2018</xref>). He developed an automated approach to identify emotions in youngsters engaging with robots to treat ASD. According to their research, computer vision could improve Behavior analysis when people engage with robots (<xref ref-type="bibr" rid="B20">Leo et al., 2015</xref>). Leo et al. suggested a machine learning approach for examining the facial expressions of kids with TD and ASD. The suggested approach may be effectively used to analyse the facial expressions made by kids with ASD thoroughly. Their results showed that their approach may yield an F1-score of 0.86% (<xref ref-type="bibr" rid="B19">Leo et al., 2019</xref>). Chintan et al. suggested an approach for digitally interpreting facial expressions to analyze human Behavior. He predicted the emotion across seven distinct categories using a deep learning network and the CK&#x0002B; dataset (<xref ref-type="bibr" rid="B37">Thacker and Makwana, 2019</xref>). Ahmed et al. 
developed an automated approach to identify kids&#x00027; facial expressions in videos recorded while they were using computer-assisted learning software. Their results showed how computer vision evaluation can automatically quantify Behavioral and emotional participation (<xref ref-type="bibr" rid="B2">Ahmed and Goodwin, 2017</xref>). Furthermore, a deep learning technique for facial image analysis has been used to identify cognitive developmental issues. Li et al. introduced a CNN-based approach for classifying ASD based on face features. Its results show that several face features improve autism classification (<xref ref-type="bibr" rid="B21">Li et al., 2019</xref>). Every methodology covered in the literature review section has been employed in various deep learning or machine learning models, except for a few limitations that are described in the comparison in <xref ref-type="table" rid="T1">Table 1</xref>. Most research tested their suggested model utilizing performance assessment parameters; there was room for improvement, evident in our results section.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Summary of related work.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>References</bold></th>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Accuracy</bold></th>
<th valign="top" align="left"><bold>Limitation</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B33">Stevens et al. (2019)</xref></td>
<td valign="top" align="left">AVA benchmark, JHMDB, UCF101</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">75.3%, 92.8%, 84.8%</td>
<td valign="top" align="left">He proposed a human action dataset to compare the performance with two different datasets, but did not describe the pre-processing and model development steps.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B44">Zhao et al. (2022)</xref></td>
<td valign="top" align="left">DE-ENIGMA</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">45.68%</td>
<td valign="top" align="left">Need to improve results</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B11">Gu et al. (2018)</xref></td>
<td valign="top" align="left">Multimodal dataset MMDB</td>
<td valign="top" align="left">SVM</td>
<td valign="top" align="center">Not reported</td>
<td valign="top" align="left">Need to improve methodology.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B23">Marinoiu et al. (2018)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">CNN &#x0002B; LSTM</td>
<td valign="top" align="center">90.77%</td>
<td valign="top" align="left">He implemented a basic CNN model that performs binary classification and was tested on a small dataset.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B29">Rehg et al. (2013)</xref></td>
<td valign="top" align="left">ESBD</td>
<td valign="top" align="left">SVM, MLP, LSTM, NB</td>
<td valign="top" align="center">79.28%</td>
<td valign="top" align="left">Need to improve results.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B39">Washington et al. (2021)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">3D CNN</td>
<td valign="top" align="center">75.62%</td>
<td valign="top" align="left">Need to improve results.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B9">Carpenter et al. (2021)</xref></td>
<td valign="top" align="left">Own dataset</td>
<td valign="top" align="left">Statistically analysis</td>
<td valign="top" align="center">73.0%</td>
<td valign="top" align="left">Need to improve methodology.<break/> No information on data pre-processing.<break/> Need to improve results.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B3">Ali et al. (2022)</xref></td>
<td valign="top" align="left">FER</td>
<td valign="top" align="left">Transfer learning</td>
<td valign="top" align="center">82.2%, 87.1%</td>
<td valign="top" align="left">He proposed a framework to analyze emotion-based Behavior, but did not describe the pre-processing and model development steps.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B20">Leo et al. (2015)</xref></td>
<td valign="top" align="left">FER</td>
<td valign="top" align="left">CNN &#x0002B; SVR</td>
<td valign="top" align="center">86.0%</td>
<td valign="top" align="left">Not an evaluation of real-time expression-based Behavior analysis.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B28">Rajagopalan and Goecke (2014)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">LSTM, MobileNetV2</td>
<td valign="top" align="center">85.0%</td>
<td valign="top" align="left">He implemented pretrained models that perform binary classification and tested them on a small dataset.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B18">Lakkapragada et al. (2022)</xref></td>
<td valign="top" align="left">RCLA&#x00026;NBH Smile data</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">87.16%</td>
<td valign="top" align="left">He implemented a basic CNN model that performs binary classification and was tested on a small dataset. Other autism Behavior landmarks are ignored. Need to improve methodology and do multi-classification.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B36">Tang et al. (2018)</xref></td>
<td valign="top" align="left">Own dataset</td>
<td valign="top" align="left">3D CNN-LSTM</td>
<td valign="top" align="center">92.89%</td>
<td valign="top" align="left">Autism associated Behavior landmarks and features are not used in the dataset.</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B13">Howard et al. (2017)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">VGG 19, LSTM</td>
<td valign="top" align="center">96%</td>
<td valign="top" align="left">He implemented pretrained models that perform binary classification. Need to improve methodology and do multi-classification.</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<p>In this section, we describe our proposed approach in detail. We used several deep learning techniques to monitor Behavior and classify the autism related symptoms and normal Behavior. The primary stages of our suggested approach are shown in <xref ref-type="fig" rid="F1">Figure 1</xref>; further information is given below.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Proposed framework for the detection of autism behavior.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0001.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a video-based behavior classification system with five stages: data acquisition using cameras, data preprocessing including frame extraction and resizing, feature extraction using pre-trained models, classification into arm flapping, spinning, head banging, or normal behavior, and evaluation using accuracy and loss metrics.</alt-text>
</graphic>
</fig>
<sec>
<label>3.1</label>
<title>Data acquisition</title>
<p>In data acquisition, we collected the dataset used in our experiment. There is no previous dataset available that fully meets our conditions. The SSBD dataset was used to train the deep learning models. The SSBD is a freely accessible dataset gathered from autistic children. <xref ref-type="fig" rid="F2">Figure 2</xref> presents the detailed steps for data acquisition and depicts the flow of converting videos into frames. The data consists of videos that capture autistic children performing actions like spinning, headbanging, and shaking their hands. Parents and caretakers posted their videos on websites that are accessible to the public. <xref ref-type="fig" rid="F3">Figure 3</xref> presents the details of the classes in the dataset.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Flow chart for dataset preparation process.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0002.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a data acquisition and structuring process with two main branches: downloading SSBD dataset videos with subsequent data acquisition, and unzipping files followed by splitting videos into frames for dataset structuring. Black and purple colors emphasize steps and direction.</alt-text>
</graphic>
</fig>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Class distribution of dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0003.tif">
<alt-text content-type="machine-generated">Bar chart illustrating the class distribution of the SSBD dataset, displaying four behavior classes: Arm Flapping, Head Banging, and Spinning each with 25 videos, and Normal Behaviour with 20 videos.</alt-text>
</graphic>
</fig>
<p>There are 75 videos in the preliminary dataset, which was gathered from the social media platform YouTube (<xref ref-type="bibr" rid="B27">Rajagopalan et al., 2013</xref>). Only one dataset, SSBD, is freely available; although this data is somewhat helpful, it does not fully meet our conditions. The SSBD dataset was developed by collecting naturalistic video recordings of autistic children.</p>
</sec>
<sec>
<label>3.2</label>
<title>Data pre-processing</title>
<sec>
<label>3.2.1</label>
<title>Frame extraction</title>
<p>The developers of the SSBD dataset gave the URLs of 75 YouTube videos, with annotations at the start and finish of abnormal Behavior that suggests autism. Each video has an average length of 90 s. There are multiple Behavioral movements at various times throughout the videos in this dataset. As a result, it was necessary to divide the portions according to the Behavioral movements, such as autistic Behavior, head banging, spinning, arm flapping, or other normal Behavioral movements. Every video was just a series of repeated frames meant to provide the impression of motion. The Python OpenCV package was used to extract the frames from each video and pre-process them. All videos were uniformly sampled to obtain 30 frames per video so as to maintain temporal consistency across all the samples. The frames were extracted and converted into feature vectors, and a temporal sequence was formed by placing the feature vectors in consecutive order. This sequence, with a constant input length of 30 time steps, is used as input to the GRU to capture temporal dependencies for classification. <xref ref-type="fig" rid="F4">Figure 4</xref> displays some frames of videos from the SSBD dataset.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Some sample frames of the SSBD video dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0004.tif">
<alt-text content-type="machine-generated">First panel: A young child bends forward with their head near the floor, representing head banging behavior. Second panel: A child stands and appears to be spinning in place. Third panel: A child flaps their arms repeatedly, demonstrating arm flapping. Fourth panel: A child with a black bar over their eyes drinks from a cup, representing typical behavior. Fifth panel: Another child bends forward for head banging next to toys. Sixth panel: A child spins inside a room. Seventh panel: A child flaps their arms near a table filled with items. Eighth panel: A child with a black bar over their eyes uses a laptop, displaying typical behavior.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>3.2.2</label>
<title>Image resizing and cropping</title>
<p>The images in each class have different sizes. Therefore, all photos were transformed to 224 &#x000D7; 224-pixel values to achieve uniformity and better outcomes. Another issue with the images is that several unnecessary objects surround the object of interest. This problem results in an inaccurate categorization of the actual Behavior class. So, we solve the issue by extracting the region of interest and drawing a bounding box around the desired object using the OpenCV package. We use the bounding box detection to crop the original images.</p>
</sec>
<sec>
<label>3.2.3</label>
<title>Data augmentation</title>
<p>Data augmentation was used on the data samples in order to produce more image samples. Applying this strategy has the primary benefit of increasing model prediction and accuracy. We used the image data generator function from the Python Keras library to reduce over-fitting and increase the diversity of the training set. Reducing the variance in pixel values would enhance the computer&#x00027;s performance. Pixel values can only be found in the range [0, 1] by default due to the input value 1/255. The images were rotated toward a 25-degree targeted orientation. An unrestricted right and left image rotation is possible with the width shift range transformation when the width shift value is set to 0.1. Additional data sets were shifted with a zoom range of 0.1 in vertical and horizontal directions. Every frame of the videos was augmented with rescaling, random rotation (maximum 20), width and height perturbation (maximum 20 percent), shear (maximum 20 percent), zoom (80&#x02013;120 percent), as well as random horizontal flipping and nearest-pixel filling performed after perturbation. These additions brought in further data variety and were used to enhance model generalization (<xref ref-type="bibr" rid="B13">Howard et al., 2017</xref>).</p>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Deep learning models</title>
<p>This research presents the practical use of deep learning models for video-based human Behavior recognition and classification. On the other hand, videos consist of a series of images placed in a particular order to create an activity. Various techniques, including long short-term memory networks (LSTMs), GRU, MobileNet, VGG19, and Efficient NetB4, could be applied to the classification of videos (<xref ref-type="bibr" rid="B15">Khalil et al., 2022</xref>). The primary goal was to classify autistic Behavior with satisfactory results. This is why we have experimented with several deep learning models, applying the pre-processing steps listed above on the datasets before giving input to any model.</p>
<sec>
<label>4.1</label>
<title>MobileNet</title>
<p>The MobileNet model is a lightweight model designed explicitly for embedded and mobile devices with constrained computational capabilities, designed by Andrew G in 2017. The MobileNetV2 paradigm was chosen based on several different considerations. Due to the small dataset used to train the model, over-fitting was possible. However, utilizing a smaller but more expressive system, such as MobileNetV2, greatly reduced this effect. The primary concept included employing depth-wise distinct convolutions to minimize processing demands and parameter values while achieving high precision (<xref ref-type="bibr" rid="B13">Howard et al., 2017</xref>).</p>
</sec>
<sec>
<label>4.2</label>
<title>VGG19</title>
<p>University of Oxford researchers introduced the VGG (Visual Geometry Group) network as their deep convolutional neural network model in 2014. The VGG network model features 19 distinct layers, where 16 belong to convolutional operations, followed by fully connected layers as the final three. The stride measures at one pixel, and the pad measures at one pixel with a filter size set to 3 &#x000D7; 3. A small kernel size enables parameters to cover the entire image space while reducing the number of parameters. The 2 &#x000D7; 2 max pooling function of VGG-19 executes through a stride of 2. VGG Net further supported the theory that convolutional neural networks need an extensive layer structure because they understand visual patterns through structured systems. Feature extraction takes place through 16 layers of the network, following classification functions operated by the first three convolutional layers. Each feature extraction layer group contains five sections where max pooling operates as the final step. The system requires images with 224 &#x000D7; 224 dimensions (<xref ref-type="bibr" rid="B32">Simonyan and Zisserman, 2015</xref>).</p>
</sec>
<sec>
<label>4.3</label>
<title>Efficient NetB4</title>
<p>Tan and Le proposed the EfficientNet-B4 as a convolutional neural network (CNN) model in 2019, which became part of the EfficientNet family. The image classification domain has recognized the significant accomplishments of EfficientNet-B4. The platform contains two scaling elements for reliable dimension expansion and resolution enhancement, resulting in cutting-edge performance. The neural network contains depth-wise separable inverted bottleneck convolutions named MBConv and squeeze-and-excitation SE blocks for feature recalibration improvement (<xref ref-type="bibr" rid="B35">Szegedy et al., 2015</xref>).</p>
</sec>
<sec>
<label>4.4</label>
<title>LSTM</title>
<p>An LSTM model (also known as a Long Short-Term Memory) is a type of recurrent neural network that is specifically designed to capture long-term dependencies in sequential data. It solves the vanishing gradient issue by means of memory cells and gating. LSTMs are most effective for tasks such as time-series analysis and natural language processing. The model has input, forget, and output gates that selectively remember or forget information. This is an advantage because LSTM models are highly appropriate for modeling intricate patterns in time-varying data (<xref ref-type="bibr" rid="B31">Saar-Tsechansky and Provost, 2007</xref>).</p>
</sec>
<sec>
<label>4.5</label>
<title>3D CNN</title>
<p>The 3D CNN is an extension of the traditional 2D CNN that operates on three-dimensional data. It takes into account both time and space using 3D convolutional kernels. The applications of 3D CNNs include video analysis, action recognition, and medical imaging. They model dynamic behaviors by learning motion patterns across sequential frames. This is why 3D CNNs are applicable in tasks that require spatiotemporal features (<xref ref-type="bibr" rid="B4">Ali et al., 2023</xref>).</p>
</sec>
<sec>
<label>4.6</label>
<title>Gated recurrent unit</title>
<p>The GRU model, a particular kind of recurrent neural network (RNN), is a widely used deep learning technique. GRU was developed to address the vanishing gradient issue with RNN. GRU controls the transmission status according to the state of the gates to retain information that needs to be stored for an extended period and to discard less important information. <xref ref-type="fig" rid="F5">Figure 5</xref> illustrates a GRU cell&#x00027;s basic structure, consisting of two gates: an update gate and a reset gate. Information moving through the cell is rejected or accepted by the two gates. The reset gate determines what quantity of the previous data should be forgotten based on the decision taken by the sigmoid function &#x003C3;. The output of the sigmoid is <italic>r</italic><sub><italic>t</italic></sub>. The GRU model will process the data if the sigmoid function value is 1; if it is 0, the data will not be processed. The current input <italic>x</italic><sub><italic>t</italic></sub> and the prior hidden state <italic>h</italic><sub><italic>t</italic>-1</sub> serve as the input for the reset gate. The update gate determines the information that is going to be modified to convey a future state. Thus, the update gate requires a fundamental aspect of the prior state. To modify the cell state, the update gate additionally incorporates a sigmoid activation function (<xref ref-type="bibr" rid="B43">Zhang et al., 2015</xref>; <xref ref-type="bibr" rid="B22">Manocha and Singh, 2019</xref>; <xref ref-type="bibr" rid="B6">Bansal et al., 2023</xref>).</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Structure of GRU model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0005.tif">
<alt-text content-type="machine-generated">Diagram illustrating the architecture of a Gated Recurrent Unit (GRU) cell, showing reset gate and update gate computations with operations such as multiplication, addition, sigmoid, and tanh activation functions, and the flow of input, hidden state, and output signals.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>4.7</label>
<title>Proposed CNN GRU model</title>
<p>In this investigation of autism action classification, CNN in conjunction with GRU was employed as a classifier. Compared to the LSTM architecture, the GRU architecture is simpler and requires fewer parameters to be configured. In <xref ref-type="fig" rid="F6">Figure 6</xref>, we show the proposed CNN GRU model architecture. The CNN GRU model uses a CNN convolutional layer to extract some key features from an image while preserving the image&#x00027;s original feature layout. Additionally, to avoid the issue of overfitting the model, the max pooling layer is utilized to shave the weak feature values and choose the deeper feature values from the key instances. This work also employed the rectified linear unit to trim down the eigenvalues smaller than 0 between the convolution layer and the layer that performs max pooling to speed up model training. The gated recurrent unit&#x00027;s (GRU) update and reset gates are then used to process the eigenvalues, speeding up the model&#x00027;s computation and improving its accuracy. For the convenience of the fully connected layer&#x00027;s use later on, the feature value is converted into single-dimensional data by connecting the flattened layer. Lastly, the output is generated using the Softmax activation function.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Proposed architecture of the CNN-GRU model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0006.tif">
<alt-text content-type="machine-generated">Flowchart showing a deep learning model for behavior classification, with input images labeled as behavior data, passing through convolutional layers, a GRU layer, flatten and dense layers, and outputting categories such as head banging, arm flapping, spinning, and normal behavior.</alt-text>
</graphic>
</fig>
<p>The k-fold validation results and the comparative performance analysis strongly support the selection of the CNN-GRU architecture. Although CNN layers are useful for extracting spatial features from single frames, autism-related behavior is a temporal phenomenon that needs sequence modeling across frames. GRU was chosen in place of LSTM because it uses a simpler gating operation and fewer trainable parameters, resulting in better training stability and less overfitting during k-fold training, where the same model is trained repeatedly on varying data splits. Based on the results of the performance tables, the CNN-GRU model consistently achieves higher precision, recall, and F1-score per fold than standalone CNN models, which means that it generalizes better and its performance is less variable. This consistent performance in recall and F1-score indicates that GRU captures the temporal dependencies very well, without the computational cost of LSTM. Thus, the k-fold validation findings provide empirical reasons to use CNN-GRU as an effective and strong architecture to assess autism behavior.</p>
</sec>
<sec>
<label>4.8</label>
<title>Hyperparameters optimization</title>
<p>The same hyperparameter settings were used to train all the models to facilitate a fair comparison of them: VGG16, CNN-LSTM, MobileNet, CNN-GRU, and EfficientNet-B7. The output layer activation function was Softmax, and a dropout rate of 0.4 was used to prevent overfitting. The Adam optimizer was used with a learning rate of 0.001 to train the models to minimize the categorical cross-entropy loss. The evaluation metric was accuracy, and all models were trained for 50 epochs under the same conditions.</p>
</sec>
<sec>
<label>4.9</label>
<title>Performance evaluation measure</title>
<p>Performance evaluation is necessary to find out the extent to which a classification model achieves its objectives. The target performance evaluation metrics determine how well and accurately the model works on the test data. An appropriate selection of metrics, such as Accuracy and F1 score, among others, is essential to carry out a detailed assessment (<xref ref-type="bibr" rid="B43">Zhang et al., 2015</xref>). The following are the formulae generally used to calculate the above performance measures.</p>
<p>Accuracy is calculated as the sum of the value of TP and TN, divided by the total number of TP, TN, FP, and FN values.</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>N</mml:mtext></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>N</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>N</mml:mtext></mml:mstyle></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>Precision is calculated as the percentage of correctly recognized positive instances divided by the total number of correctly and mistakenly identified positive instances, as formulated in <xref ref-type="disp-formula" rid="EQ2">Equation 2</xref>.</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>Recall is calculated as the percentage of correctly recognized positive instances divided by the total number of true positive and false negative instances, as formulated in <xref ref-type="disp-formula" rid="EQ3">Equation 3</xref>.</p>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>N</mml:mtext></mml:mstyle></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>The F1 score is calculated as the harmonic mean of Recall and Precision, i.e., twice the product of Recall and Precision divided by their sum, as expressed in <xref ref-type="disp-formula" rid="EQ4">Equation 4</xref>.</p>
<disp-formula id="EQ4"><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mstyle mathvariant="bold"><mml:mn>2</mml:mn></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>N</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mstyle mathvariant="bold"><mml:mtext>F</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mstyle mathvariant="bold"><mml:mn>2</mml:mn></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>T</mml:mtext></mml:mstyle><mml:mstyle mathvariant="bold"><mml:mtext>P</mml:mtext></mml:mstyle></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(4)</label></disp-formula>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Results and discussion</title>
<p>The experiment is conducted via Google Colaboratory, an open-source cloud-based tool provided by Google. This section presents the results of our suggested CNN-GRU model and other deep learning models on a publicly available autism behavior dataset. A variety of deep learning models were evaluated, and based on their validation accuracy, the selected model was further refined and customized. Moreover, we assess the customized suggested CNN-GRU classifier using performance metrics such as accuracy, specificity, sensitivity, precision, recall, and F1-score.</p>
<sec>
<label>5.1</label>
<title>Performance analysis of proposed model</title>
<p><xref ref-type="table" rid="T2">Table 2</xref> presents the results of different deep learning models. We implemented five deep learning models on the SSBD dataset and assessed their performance using validation accuracy and loss. Our proposed customized CNN-GRU model performs better than other models and attains the highest validation accuracy of 96% with a minimum validation loss of 0.16. Moreover, VGG16 performs the worst compared to others and achieves a minimum validation accuracy of 72.37% with a validation loss of 4.75. Our suggested approach has attained the highest accuracy score of 0.96 for arm flapping and head banging classes. In contrast, normal behavior and spinning classes have attained lower performance metric scores of 0.96 and 0.97.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Performance evaluation of different deep learning models on the SSBD dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>EM</bold></th>
<th valign="top" align="center"><bold>VGG16</bold></th>
<th valign="top" align="center"><bold>MobileNet</bold></th>
<th valign="top" align="center"><bold>Efficient Net B7</bold></th>
<th valign="top" align="center"><bold>3D-CNN &#x0002B; LSTM</bold></th>
<th valign="top" align="center"><bold>CNN-GRU</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="2">SSBD</td>
<td valign="top" align="left">Val accuracy</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.96</td>
</tr>
 <tr>
<td valign="top" align="left">Val loss</td>
<td valign="top" align="center">0.42</td>
<td valign="top" align="center">0.39</td>
<td valign="top" align="center">0.38</td>
<td valign="top" align="center">0.40</td>
<td valign="top" align="center">0.16</td>
</tr></tbody>
</table>
</table-wrap>
<p>The graphical representation of the experiment results of our proposed CNN-GRU model is shown in <xref ref-type="fig" rid="F7">Figure 7</xref>. We clearly see that the Arm flapping class has attained the highest bar because it attains the highest F1 score, 0.98, and the head banging class achieves the same result. Normal Behavior and spinning classes achieve the lowest bar because they earn low F1 scores of 0.93 and 0.94, respectively.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Class wise performance assessment of proposed CNN-GRU model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0007.tif">
<alt-text content-type="machine-generated">Bar chart compares five classification metrics&#x02014;accuracy, precision, recall, F1-score, and specificity&#x02014;across four behavioral classes: arm flapping, head banging, normal behaviour, and spinning, showing consistently high metric values near one for all categories.</alt-text>
</graphic>
</fig>
<sec>
<label>5.1.1</label>
<title>Confusion matrices analysis</title>
<p><xref ref-type="fig" rid="F8">Figure 8</xref> shows the confusion matrices of the four autism behavior classes: Arm Flapping, Head Banging, Normal Behavior, and Spinning. The k-fold validation result of CNN-GRU in <xref ref-type="fig" rid="F8">Figure 8a</xref> shows a strong and balanced performance in the classification of all four behavior classes. There is a very high level of correct prediction, with 297 out of 303 samples being correctly predicted, and a slight level of confusion with Spinning and Normal Behavior, which implies strong feature learning for Arm Flapping. Head Banging also demonstrates good performance, with 245 correct classifications and very few misclassifications, most of them as Spinning; therefore, there is a little overlap in motion patterns but good separability. Normal Behavior is very stable, with the least number of misclassifications. Spinning has the highest false negative rate (&#x0007E;14%), which means that it is the most difficult to learn. In the CNN-GRU test results of <xref ref-type="fig" rid="F8">Figure 8b</xref>, Arm Flapping (322 correct, 7 incorrect) and Head Banging (387 correct, 8 incorrect) had a high class-wise accuracy (0.98). Normal Behavior had 196 correct predictions and 24 misclassifications (0.96 accuracy). Spinning had 69 correct predictions and 16 misclassifications, suggesting high overall recognition but a relatively high level of confusion. In the normalized confusion matrix, the majority of misclassifications are between Spinning and the repetitive behaviors (Arm Flapping and Head Banging), so there is a possible overlap in the dynamics of motion. Normal Behavior is moderately confused with the repetitive classes in the independent test set. <xref ref-type="table" rid="T3">Table 3</xref> presents the performance assessment analysis of several deep learning models regarding recall, F1-score, and precision. 
Our proposed CNN-GRU model performs better than other models and attains the highest precision score of 96%, a recall of 96%, and an F1 score of 96.31%. Conversely, the VGG16 model had a minimum precision score of 77.04%, a recall of 52.74%, and an F1 score of 51.95%.</p>
<fig position="float" id="F8">
<label>Figure 8</label>
<caption><p>Confusion matrices of proposed CNN GRU model <bold>(a)</bold> k-fold cross validation results <bold>(b)</bold> over all test performance.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0008.tif">
<alt-text content-type="machine-generated">Two side-by-side confusion matrices depict classification results for four behaviors: Arm Flapping, Head Banging, Normal Behaviour, and Spinning. Both matrices use varying blue shades to indicate value magnitudes, with diagonal dominance showing correct predictions. The left matrix shows generally lower counts and more misclassifications than the right. The right matrix includes a color bar legend indicating frequencies up to three hundred fifty. Each axis is labeled with behavior categories to compare predicted versus true labels.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Performance comparison of the proposed customized CNN-GRU model with baseline classifiers.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>EM</bold></th>
<th valign="top" align="center"><bold>VGG16</bold></th>
<th valign="top" align="center"><bold>MobileNet</bold></th>
<th valign="top" align="center"><bold>Efficient Net B7</bold></th>
<th valign="top" align="center"><bold>3D-CNN &#x0002B; LSTM</bold></th>
<th valign="top" align="center"><bold>CNN-GRU</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Precision</td>
<td valign="top" align="center">0.91</td>
<td valign="top" align="center">0.93</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.96</td>
</tr>
<tr>
<td valign="top" align="left">Recall</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.95</td>
</tr>
<tr>
<td valign="top" align="left">F1-score</td>
<td valign="top" align="center">0.90</td>
<td valign="top" align="center">0.92</td>
<td valign="top" align="center">0.95</td>
<td valign="top" align="center">0.89</td>
<td valign="top" align="center">0.96</td>
</tr></tbody>
</table>
</table-wrap>
<p>In <xref ref-type="fig" rid="F9">Figure 9</xref>, we depict the model learning curve of our suggested approach. The above figure is divided into two separate images. The learning curve for the loss of training and validation is displayed on the left portion of the image. The image&#x00027;s right portion shows the training and validation accuracy learning curve. These learning curves are highly helpful since they provide insight into how a model behaves in terms of miss rate and learning ability.</p>
<fig position="float" id="F9">
<label>Figure 9</label>
<caption><p>Learning curve of proposed CNN GRU model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0009.tif">
<alt-text content-type="machine-generated">Two line charts compare model training and validation performance across epochs. The left chart shows both training and validation accuracy rising and plateauing near zero point nine five. The right chart depicts training and validation loss, both decreasing and stabilizing near zero point two, with validation loss consistently lower than training loss after the initial epochs. Both charts include labeled axes and legends for clarity.</alt-text>
</graphic>
</fig>
<p>The ROC curve of the Proposed CNN GRU Model is displayed in <xref ref-type="fig" rid="F10">Figure 10</xref>. This curve graph&#x00027;s primary benefit is its ability to visualize model performance, highlighting the trade-off between the rate of false positives and true positives. Because our suggested model was multiclassified, the ROC curve displays four distinct behavior classes.</p>
<fig position="float" id="F10">
<label>Figure 10</label>
<caption><p>ROC curve of proposed CNN GRU model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0010.tif">
<alt-text content-type="machine-generated">ROC curve for multi-class classification with four lines representing classes zero to three, showing high performance as curves are close to the top-left corner. Legend displays AUC scores near one for all classes.</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F11">Figure 11</xref> shows the relative efficiency of various classifiers, which demonstrates the usefulness of the suggested CNN-GRU model. CNN-GRU was the most accurate of all five analyzed models, with the highest value of 0.9294 and the least variance (standard error of 0.0038), which implies the best performance and the least variation. MobileNet (0.9193 &#x000B1; 0.0078) and VGG16 (0.9142 &#x000B1; 0.0131) were also found to be very strong, but slightly below CNN-GRU. Conversely, EfficientNet-B7 (0.8808 &#x000B1; 0.0084) and 3D CNN-LSTM (0.8912 &#x000B1; 0.0105) had relatively lower accuracies, implying that they were not very effective in this task. It is important to note that 3D CNN-LSTM is a video-based model, yet CNN-GRU was more accurate, showing the benefit of combining CNN feature extraction with GRU temporal modeling. On the whole, the radar chart visually highlights that the CNN-GRU model has a high level of accuracy and consistent performance; thus, it is the strongest among the analyzed architectures.</p>
<fig position="float" id="F11">
<label>Figure 11</label>
<caption><p>Accuracy comparison of the proposed CNN GRU model with baseline classifiers.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fncom-20-1626315-g0011.tif">
<alt-text content-type="machine-generated">Radar chart comparing five machine learning models: VGG16, EfficientNet-B7, MobileNet, 3D CNN-LSTM, and CNN-GRU, with performance scores ranging from 0.75 to 0.95. VGG16 outperforms others across most axes.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec>
<label>5.2</label>
<title>K fold cross validation analysis</title>
<p>All the models were tested by means of k-fold cross-validation to ensure a high-quality and objective evaluation. <xref ref-type="table" rid="T4">Table 4</xref> presents the results of different deep learning models. The findings suggest that the CNN-GRU model has the best overall performance, with an average accuracy of 0.92 and equal precision, recall, and F1-score of 0.91. MobileNet also performs highly and steadily, with 0.91 on all evaluation measures, which demonstrates its efficiency and ability to generalize under cross-validation. VGG16 likewise performs highly and steadily, achieving a score of 0.91. EfficientNet-B7 achieves a competitive performance of 0.88, and the standalone 3dCNN-LSTM model achieves 0.89. Comprehensively, the stability and generalization capabilities of the hybrid CNN-GRU model are supported by k-fold cross-validation, which determines it as the most trustworthy strategy among the considered ones.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>K fold comparison of the proposed CNN GRU model with base line models.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Classifiers</bold></th>
<th valign="top" align="center"><bold>Accuracy</bold></th>
<th valign="top" align="center"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>F1-score</bold></th>
<th valign="top" align="center"><bold>Sensitivity</bold></th>
<th valign="top" align="center"><bold>Specificity</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Vgg16</td>
<td valign="top" align="center">0.9142 &#x000B1; 0.0131</td>
<td valign="top" align="center">0.9146 &#x000B1; 0.0123</td>
<td valign="top" align="center">0.9138 &#x000B1; 0.0125</td>
<td valign="top" align="center">0.9142 &#x000B1; 0.0131</td>
<td valign="top" align="center">0.9146 &#x000B1; 0.0123</td>
</tr>
<tr>
<td valign="top" align="left">Efficient Net b7</td>
<td valign="top" align="center">0.8808 &#x000B1; 0.0084</td>
<td valign="top" align="center">0.8799 &#x000B1; 0.0092</td>
<td valign="top" align="center">0.8786 &#x000B1; 0.0093</td>
<td valign="top" align="center">0.8808 &#x000B1; 0.0084</td>
<td valign="top" align="center">0.8799 &#x000B1; 0.0092</td>
</tr>
<tr>
<td valign="top" align="left">Mobile Net</td>
<td valign="top" align="center">0.9193 &#x000B1; 0.0078</td>
<td valign="top" align="center">0.9191 &#x000B1; 0.0080</td>
<td valign="top" align="center">0.9189 &#x000B1; 0.0078</td>
<td valign="top" align="center">0.9193 &#x000B1; 0.0078</td>
<td valign="top" align="center">0.9191 &#x000B1; 0.0080</td>
</tr>
<tr>
<td valign="top" align="left">3dCNN-LSTM</td>
<td valign="top" align="center">0.8912 &#x000B1; 0.0105</td>
<td valign="top" align="center">0.8927 &#x000B1; 0.0083</td>
<td valign="top" align="center">0.8888 &#x000B1; 0.0123</td>
<td valign="top" align="center">0.8912 &#x000B1; 0.0105</td>
<td valign="top" align="center">0.8927 &#x000B1; 0.0083</td>
</tr>
<tr>
<td valign="top" align="left">CNN-GRU</td>
<td valign="top" align="center">0.9294 &#x000B1; 0.0038</td>
<td valign="top" align="center">0.9290 &#x000B1; 0.0036</td>
<td valign="top" align="center">0.9284 &#x000B1; 0.0039</td>
<td valign="top" align="center">0.9294 &#x000B1; 0.0038</td>
<td valign="top" align="center">0.9290 &#x000B1; 0.0036</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<label>5.3</label>
<title>Statistical analysis</title>
<p>The significance level (alpha) is set to 0.05. The proposed CNN-GRU model is compared to baseline models using four paired <italic>t</italic>-tests. All the comparisons had <italic>p-</italic>values that were significantly lower than the adjusted significance value of 0.0125, as indicated in <xref ref-type="table" rid="T5">Table 5</xref>, which proved to be statistically significant. Precisely, CNN-GRU had tremendous improvements compared to MobileNet (<italic>p</italic> = 0.0028) and VGG16 (<italic>p</italic> = 0.0051). Even greater significance, with lower <italic>p</italic>-values, is found compared to 3D CNN-LSTM and EfficientNet-B7. These findings suggest that the high performance of the CNN-GRU model is not due to chance alone. Bonferroni&#x00027;s correction is used to regulate the family-wise error rate. The significance value is adjusted using <xref ref-type="disp-formula" rid="EQ5">Equation 5</xref>.</p>
<disp-formula id="EQ5"><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mo>&#x003B1;</mml:mo></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>adjusted</mml:mtext></mml:mstyle><mml:mtext>&#x000A0;</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mtext>&#x000A0;</mml:mtext><mml:mfrac><mml:mrow><mml:mstyle mathvariant="bold-italic"><mml:mo>&#x003B1;</mml:mo></mml:mstyle></mml:mrow><mml:mrow><mml:mstyle mathvariant="bold"><mml:mtext>N</mml:mtext></mml:mstyle></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(5)</label></disp-formula>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Statistical analysis of the proposed CNN GRU model with baseline models.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Classifiers</bold></th>
<th valign="top" align="center"><bold><italic>T-</italic>test</bold></th>
<th valign="top" align="center"><bold><italic>P-</italic>value</bold></th>
<th valign="top" align="center"><bold>Significance &#x0003C; 0.0125</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">CNN-GRU vs. MobileNet</td>
<td valign="top" align="center">3.6811</td>
<td valign="top" align="center">0.0028</td>
<td valign="top" align="center">Yes</td>
</tr>
<tr>
<td valign="top" align="left">CNN-GRU vs. VGG16</td>
<td valign="top" align="center">3.5239</td>
<td valign="top" align="center">0.0051</td>
<td valign="top" align="center">Yes</td>
</tr>
<tr>
<td valign="top" align="left">CNN-GRU vs. 3D CNN-LSTM</td>
<td valign="top" align="center">10.8180</td>
<td valign="top" align="center">0.0000</td>
<td valign="top" align="center">Yes</td>
</tr>
<tr>
<td valign="top" align="left">CNN-GRU vs. EfficientNet-B7</td>
<td valign="top" align="center">16.6697</td>
<td valign="top" align="center">0.0000</td>
<td valign="top" align="center">Yes</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<label>5.4</label>
<title>Ablation study</title>
<p>In this ablation study, the importance of each architectural element is analyzed by comparing the proposed CNN-GRU model with a variety of baseline and hybrid networks, i.e., VGG16, EfficientNet-B7, MobileNet, and 3D CNN-LSTM. <xref ref-type="table" rid="T4">Table 4</xref> summarizes quantitative results in terms of accuracy, precision, recall, and F1-score, whilst <xref ref-type="table" rid="T5">Table 5</xref> displays the statistical significance of performance differences using paired <italic>t</italic>-tests. As <xref ref-type="table" rid="T4">Table 4</xref> indicates, the proposed CNN-GRU model is the overall winner, with an accuracy of 0.9294 &#x000B1; 0.0038 and an F1-score of 0.9284 &#x000B1; 0.0039, surpassing all other competing models. Also, the <italic>t</italic>-test results in <xref ref-type="table" rid="T5">Table 5</xref> statistically confirm the improvements, with CNN-GRU showing significant performance improvements over MobileNet (<italic>t</italic> = 3.6811, <italic>p</italic> = 0.0028) and VGG16 (<italic>t</italic> = 3.5239, <italic>p</italic> = 0.0051). Moreover, compared with spatiotemporal architectures, CNN-GRU demonstrates a substantial and statistically significant edge over the 3D CNN-LSTM (<italic>t</italic> = 10.8180, <italic>p</italic> &#x0003C; 0.001), which means that GRU represents time more efficiently and more reliably than LSTM in such a scenario. The most significant statistical support is seen against EfficientNet-B7 (<italic>t</italic> = 16.6697, <italic>p</italic> &#x0003C; 0.001), indicating that the lack of explicit temporal modeling is not offset by the increase in network depth.</p>
</sec>
<sec>
<label>5.5</label>
<title>Comparative analysis with existing methods</title>
<p>Autism diagnosis is a challenging endeavor because it relies on long-term observation of human behavior. In this study, we presented an intelligent approach to detect autism-related behaviors such as hand flapping, head banging, spinning, and normal behavior. Our proposed approach uses deep learning and computer vision techniques. A comparative analysis is necessary to demonstrate the effectiveness of the suggested model. In <xref ref-type="table" rid="T6">Table 6</xref>, we present the details of the comparative analysis of our suggested model against various current models used in recent studies. After examining and comparing the performance of our suggested model with these current models, we found that the proposed model achieves a 92.94% accuracy score, which is far better than the other approaches used in recent studies.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Comparison of the proposed model with relevant studies.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>References</bold></th>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Accuracy</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B39">Washington et al. (2021)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">3D CNN</td>
<td valign="top" align="center">75.62%</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B23">Marinoiu et al. (2018)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">CNN &#x0002B; LSTM</td>
<td valign="top" align="center">90.77%</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B33">Stevens et al. (2019)</xref></td>
<td valign="top" align="left">SSBD UCF101</td>
<td valign="top" align="left">KNN</td>
<td valign="top" align="center">86.6% 76.3%</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B28">Rajagopalan and Goecke (2014)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">LSTM, MobileNetV2</td>
<td valign="top" align="center">85.0%</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B18">Lakkapragada et al. (2022)</xref></td>
<td valign="top" align="left">RCLA&#x00026;NBH</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">87.16%</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B40">Wei et al. (2023)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">83%</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B25">Park et al. (2024)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">LRCN</td>
<td valign="top" align="center">79.61%</td>
</tr>
<tr>
<td valign="top" align="left"><xref ref-type="bibr" rid="B26">Prakash et al. (2025)</xref></td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">DCNN</td>
<td valign="top" align="center">78.57%</td>
</tr>
<tr>
<td valign="top" align="left">Proposed</td>
<td valign="top" align="left">SSBD</td>
<td valign="top" align="left">CNN-GRU</td>
<td valign="top" align="center">92.94%</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec sec-type="discussion" id="s6">
<label>6</label>
<title>Discussion</title>
<p>The comparative analysis reveals that the suggested CNN-GRU model is more successful than the baseline architectures in recognition accuracy and resilience. Its high accuracy, recall, and F1-score suggest a balanced performance in correctly identifying behavioral classes with the fewest erroneous predictions. The traditional CNN models (e.g., VGG-based models) demonstrate relatively lower recall and F1-scores, implying that they are unable to capture complex behavioral patterns. Though lightweight deep learning models like MobileNet and EfficientNet are also competitive, they still do not outperform the proposed model. The consistency of the CNN-GRU architecture in terms of its accuracy and F1-scores across folds is also supported by the K-fold cross-validation results. Conversely, the standalone 3D CNN-LSTM model shows relatively low performance, indicating that temporal modeling is not sufficient without strong spatial feature extraction. Moreover, the lower variance between folds shows that the model is less sensitive to changes in data distortions, which is especially critical in real-world situations of studying behaviors.</p>
<sec>
<label>6.1</label>
<title>Real-world clinical setting</title>
<p>Instead of being a stand-alone diagnostic system, the suggested system shows tremendous potential for real-world clinical implementation as a decision-support tool. In practice, the proposed CNN-GRU-based system is intended to be used as a decision-support system for clinicians, integrated with the existing screening workflow. In the regular behavioral screening of patients, video clips of the children are recorded with cameras in the clinical setting and interpreted by the system to provide risk scores and behaviors associated with autism. These risk scores and behaviors can easily be integrated into existing electronic health record (EHR) software systems/clinical dashboards for the clinician to interpret the predictions of the system alongside existing screening tools. In terms of system implementation, the system would not be hardware-dependent; it would be able to run on regular clinical computer workstations or be delivered through a cloud-based system for easy accessibility. The system would also be capable of near-real-time predictions due to the computational efficiency of the CNN-GRU architecture, making it ideal for use in early screening settings. The system would be intended to improve the screening process, help with regular screening, and facilitate the clinician in the diagnosis of ASD in the early stages while maintaining control of the diagnosis with the clinician.</p>
</sec>
<sec>
<label>6.2</label>
<title>Ethical implications and safeguard</title>
<p>Although the suggested framework does not involve any direct human experimentation and uses an entirely anonymized, publicly available dataset, the full scope of the ethical implications related to the use of automated diagnostic support systems in clinical settings is taken into consideration. Specifically, there are risks that these systems will be abused or overused as the sole diagnostic systems. To address this issue, the suggested CNN-GRU model will be used as a purely clinical decision-support tool, one that will help a professional to identify possible patterns of behavior related to ASD, but not to substitute their judgment. The protective measures must involve upholding human-in-the-loop decision making, where ultimate diagnoses must be controlled by qualified health care practitioners. Safeguards also require regular performance audits, bias monitoring, and retraining using more diverse datasets to minimize the risk of systematic errors and enhance generalization. Lastly, there is a need to establish explicit clinical usage rules and ethical controls to help avoid misuse and ensure that automated predictions are used with care, especially in delicate pediatric diagnostic cases.</p>
</sec>
<sec>
<label>6.3</label>
<title>Limitations</title>
<p>Although the SSBD dataset is publicly available and systematically annotated, this study uses a single dataset consisting of video recordings of children acting in a predetermined behavior sequence. Although this allows the ability to control the experimentation, it might not represent the full range of diversity and complexity of real-world manifestations of autism spectrum disorder. The behavioral patterns of ASD may be diverse among different people, at different developmental stages, in different environmental situations, and across cultures. While we use stratified k-fold cross-validation to minimize sampling bias and to ensure that the proposed CNN-GRU model is tested on similar subsets of the data, we acknowledge that k-fold validation will not be able to offset the limited behavioral and contextual diversity within one dataset. Such a restriction may affect the model performance when used in unconstrained real-life situations or even in the clinical environment. The same hyperparameters were used to train all architectures to ensure consistency and fairness of experiments. Nonetheless, various models such as EfficientNet and 3D CNN-LSTM are normally tuned optimally to their specific architecture. This single arrangement may not wholly utilize the strengths of some of them and may rather work against them. Future work will conduct systematic hyperparameter optimization specific to each model to obtain a more balanced and fully optimized comparison. In addition, the offered methodology will be validated on various datasets gathered across a variety of sources, such as clinical and home-based settings, to enhance the rigor and external validity.</p>
</sec>
</sec>
<sec sec-type="conclusions" id="s7">
<label>7</label>
<title>Conclusion</title>
<p>Autism Spectrum Disorder (ASD) is a neurodevelopmental disorder that interferes with the cognitive functioning, communication, and behavioral patterns of people of various age groups. Early detection of ASD plays a strong role in developing a stronger learning ability and in the timely management of actions. This study proposes a deep learning-based framework to analyze behaviors associated with autism and to distinguish between autistic and typical behaviors. The publicly available multiclass Self-Stimulatory Behavior Dataset (SSBD) of real-life video recordings of children with different behavioral patterns was used as experimental data. We implemented five deep learning models, CNN-GRU, MobileNet, VGG16, EfficientNet-B7, and 3D CNN-LSTM, on the SSBD dataset to assess their performance. The results effectively prove the usefulness of the proposed CNN-GRU model as compared to the traditional deep learning classifiers. The findings reveal that the suggested tailored CNN-GRU model is more efficient than the other models, with an accuracy of 92.94%. Such high performance demonstrates the capability of the model in capturing both spatial and temporal patterns of behavior in unconstrained and real-life video environments. The suggested method can help experts and non-experts to recognize important behaviors and facilitate automated behavior monitoring applications in assessing autism. Future work will be directed at the validation of the offered methodology on various datasets gathered across a variety of sources, such as naturalistic and clinical settings, to enhance the rigor and external validity.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="ethics-statement" id="s9">
<title>Ethics statement</title>
<p>This study utilized a publicly available dataset from GitHub (<ext-link ext-link-type="uri" xlink:href="https://github.com/antran89/clipping_ssbd_videos">https://github.com/antran89/clipping_ssbd_videos</ext-link>). As the data were previously collected and anonymized by the original authors, no additional ethical approval or informed consent was required for this research.</p>
</sec>
<sec sec-type="author-contributions" id="s10">
<title>Author contributions</title>
<p>UJ: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. MW: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. AN: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. MA: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. MK: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. VA: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. CF: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. 
CS: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s12">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s13">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abdel Hameed</surname> <given-names>M.</given-names></name> <name><surname>Hassaballah</surname> <given-names>M.</given-names></name> <name><surname>Hosney</surname> <given-names>M. E.</given-names></name> <name><surname>Alqahtani</surname> <given-names>A.</given-names></name></person-group> (<year>2022</year>). <article-title>An AI-enabled Internet of Things based autism care system for improving cognitive ability of children with autism spectrum disorders</article-title>. <source>Comput. Intell. Neurosci.</source> <volume>2022</volume>, <fpage>1</fpage>&#x02013;<lpage>12</lpage>. doi: <pub-id pub-id-type="doi">10.1155/2022/2247675</pub-id><pub-id pub-id-type="pmid">35655510</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ahmed</surname> <given-names>A. A.</given-names></name> <name><surname>Goodwin</surname> <given-names>M. S.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Automated detection of facial expressions during computer-assisted instruction in individuals on the autism spectrum,&#x0201D;</article-title> in <source>Proceedings of the CHI Conference on Human Factors in Computing Systems</source> (<publisher-loc>Denver, CO</publisher-loc>: <publisher-name>Association for Computing Machinery</publisher-name>), <fpage>6050</fpage>&#x02013;<lpage>6055</lpage>.</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Ali</surname> <given-names>A.</given-names></name> <name><surname>Negin</surname> <given-names>F.</given-names></name> <name><surname>Th&#x000FC;mmler</surname> <given-names>S.</given-names></name> <name><surname>Bremond</surname> <given-names>F.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Video-based behavior understanding of children for objective diagnosis of autism,&#x0201D;</article-title> in <source>Proceedings of the 17th International Joint Conference on Computer Vision, Imaging and Computer Graphics Theory and Applications</source> (<publisher-loc>SCITEPRESS</publisher-loc>), <fpage>475</fpage>&#x02013;<lpage>484</lpage>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ali</surname> <given-names>G.</given-names></name> <name><surname>Dastgir</surname> <given-names>A.</given-names></name> <name><surname>Iqbal</surname> <given-names>M. W.</given-names></name> <name><surname>Anwar</surname> <given-names>M.</given-names></name> <name><surname>Faheem</surname> <given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>A hybrid convolutional neural network model for automatic diabetic retinopathy classification from fundus images</article-title>. <source>IEEE J. Transl. Eng. Health Med.</source> <volume>11</volume>, <fpage>341</fpage>&#x02013;<lpage>350</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JTEHM.2023.3282104</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Alwakeel</surname> <given-names>S. S.</given-names></name> <name><surname>Alhalabi</surname> <given-names>B.</given-names></name> <name><surname>Aggoune</surname> <given-names>H.</given-names></name> <name><surname>Alwakeel</surname> <given-names>M.</given-names></name></person-group> (<year>2015</year>). <article-title>A machine learning based WSN system for autism activity recognition</article-title>. in <source>2015 IEEE 14th International Conference on Machine Learning and Applications (ICMLA)</source> (<publisher-loc>Miami, FL</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>771</fpage>&#x02013;<lpage>776</lpage>.</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bansal</surname> <given-names>M.</given-names></name> <name><surname>Kumar</surname> <given-names>M.</given-names></name> <name><surname>Sachdeva</surname> <given-names>M.</given-names></name> <name><surname>Mittal</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Transfer learning for image classification using VGG19: Caltech-101 image data set</article-title>. <source>J. Ambient Intell. Human. Comput.</source> <volume>14</volume>, <fpage>3609</fpage>&#x02013;<lpage>3620</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s12652-021-03488-z</pub-id><pub-id pub-id-type="pmid">34548886</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bilal</surname> <given-names>M.</given-names></name> <name><surname>Ali</surname> <given-names>G.</given-names></name> <name><surname>Iqbal</surname> <given-names>M. W.</given-names></name> <name><surname>Anwar</surname> <given-names>M.</given-names></name></person-group> (<year>2022</year>). <article-title>Auto-Prep: efficient and robust automated data preprocessing pipeline</article-title>. <source>IEEE Access</source> <volume>10</volume>, <fpage>107764</fpage>&#x02013;<lpage>107784</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3198662</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Cabanillas-Tello</surname> <given-names>A.</given-names></name> <name><surname>Cabanillas-Carbonell</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Application software analysis for children with autism spectrum disorder: a review of the scientific literature from 2005 &#x02013; 2020,&#x0201D;</article-title> in <source>2020 International Conference on e-Health and Bioengineering (EHB)</source> (<publisher-loc>Iasi</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x02013;<lpage>4</lpage>.</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Carpenter</surname> <given-names>K. L. H.</given-names></name> <name><surname>Hahemi</surname> <given-names>J.</given-names></name> <name><surname>Campbell</surname> <given-names>K.</given-names></name> <name><surname>Lippmann</surname> <given-names>S. J.</given-names></name> <name><surname>Baker</surname> <given-names>J. P.</given-names></name> <name><surname>Egger</surname> <given-names>H. L.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Digital behavioral phenotyping detects atypical pattern of facial expression in toddlers with autism</article-title>. <source>Autism Res.</source> <volume>14</volume>, <fpage>488</fpage>&#x02013;<lpage>499</lpage>. doi: <pub-id pub-id-type="doi">10.1002/aur.2391</pub-id><pub-id pub-id-type="pmid">32924332</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Case-Smith</surname> <given-names>J.</given-names></name> <name><surname>Weaver</surname> <given-names>L. L.</given-names></name> <name><surname>Fristad</surname> <given-names>M. A.</given-names></name></person-group> (<year>2015</year>). <article-title>A systematic review of sensory processing interventions for children with autism spectrum disorders</article-title>. <source>Autism</source> <volume>19</volume>, <fpage>133</fpage>&#x02013;<lpage>148</lpage>. doi: <pub-id pub-id-type="doi">10.1177/1362361313517762</pub-id><pub-id pub-id-type="pmid">24477447</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gu</surname> <given-names>C.</given-names></name> <name><surname>Sun</surname> <given-names>C.</given-names></name> <name><surname>Ross</surname> <given-names>D. A.</given-names></name> <name><surname>Vondrick</surname> <given-names>C.</given-names></name> <name><surname>Pantofaru</surname> <given-names>C.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>&#x0201C;Ava: a video dataset of spatio-temporally localized atomic visual actions,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Salt Lake City, UT</publisher-loc>), <fpage>6047</fpage>&#x02013;<lpage>6056</lpage>.</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Xie</surname> <given-names>L.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>F.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Affective computing of childern with authism based on feature transfer,&#x0201D;</article-title> in <source>2018 5th IEEE International Conference on Cloud Computing and Intelligence Systems (CCIS)</source> (<publisher-loc>Nanjing</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>845</fpage>&#x02013;<lpage>849</lpage>.</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Howard</surname> <given-names>A. G.</given-names></name> <name><surname>Zhu</surname> <given-names>M.</given-names></name> <name><surname>Chen</surname> <given-names>B.</given-names></name> <name><surname>Kalenichenko</surname> <given-names>D.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Weyand</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>MobileNets: efficient convolutional neural networks for mobile vision applications</article-title>. <source>arXiv [Preprint].</source> Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1704.04861">http://arxiv.org/abs/1704.04861</ext-link> (accessed December 7, 2023).</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kamran</surname> <given-names>M.</given-names></name> <name><surname>Malik</surname> <given-names>M.</given-names></name> <name><surname>Iqbal</surname> <given-names>M. W.</given-names></name> <name><surname>Anwar</surname> <given-names>M.</given-names></name> <name><surname>Aqeel</surname> <given-names>M.</given-names></name> <name><surname>Ahmad</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>Web simplification prototype for cognitive disabled users</article-title>. <source>Hum. Behav. Emerg. Technol.</source> <volume>2022</volume>:<fpage>1</fpage>&#x02013;<lpage>14</lpage>. doi: <pub-id pub-id-type="doi">10.1155/2022/5817410</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khalil</surname> <given-names>M. I.</given-names></name> <name><surname>Tehsin</surname> <given-names>S.</given-names></name> <name><surname>Humayun</surname> <given-names>M.</given-names></name> <name><surname>Jhanjhi</surname> <given-names>N. Z.</given-names></name> <name><surname>AlZain</surname> <given-names>M. A.</given-names></name></person-group> (<year>2022</year>). <article-title>Multi-scale network for thoracic organs segmentation</article-title>. <source>Comput. Mater. Contin.</source> <volume>70</volume>, <fpage>3251</fpage>&#x02013;<lpage>3265</lpage>. doi: <pub-id pub-id-type="doi">10.32604/cmc.2022.020561</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Klintwall</surname> <given-names>L.</given-names></name> <name><surname>Eikeseth</surname> <given-names>S.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Early and intensive behavioral intervention (EIBI) in autism,&#x0201D;</article-title> in <source>Comprehensive Guide to Autism</source>, Eds. V. B. Patel, V. R. Preedy, and C. R. Martin (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>Springer New York</publisher-name>), <fpage>117</fpage>&#x02013;<lpage>137</lpage>.</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kohli</surname> <given-names>M.</given-names></name> <name><surname>Kar</surname> <given-names>A. K.</given-names></name> <name><surname>Sinha</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>The role of intelligent technologies in early detection of autism spectrum disorder (ASD): a scoping review</article-title>. <source>IEEE Access</source> <volume>10</volume>, <fpage>104887</fpage>&#x02013;<lpage>104913</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3208587</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lakkapragada</surname> <given-names>A.</given-names></name> <name><surname>Kline</surname> <given-names>A.</given-names></name> <name><surname>Mutlu</surname> <given-names>O. C.</given-names></name> <name><surname>Paskov</surname> <given-names>K.</given-names></name> <name><surname>Chrisman</surname> <given-names>B.</given-names></name> <name><surname>Stockham</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>The classification of abnormal hand movement to aid in autism detection: machine learning study</article-title>. <source>JMIR Biomed. Eng.</source> <volume>7</volume>:<fpage>e33771</fpage>. doi: <pub-id pub-id-type="doi">10.2196/33771</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Leo</surname> <given-names>M.</given-names></name> <name><surname>Carcagn&#x000EC;</surname> <given-names>P.</given-names></name> <name><surname>Distante</surname> <given-names>C.</given-names></name> <name><surname>Mazzeo</surname> <given-names>P. L.</given-names></name> <name><surname>Spagnolo</surname> <given-names>P.</given-names></name> <name><surname>Levante</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Computational analysis of deep visual data for quantifying facial expression production</article-title>. <source>Appl. Sci.</source> <volume>9</volume>:<fpage>4542</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app9214542</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Leo</surname> <given-names>M.</given-names></name> <name><surname>Del Coco</surname> <given-names>M.</given-names></name> <name><surname>Carcagni</surname> <given-names>P.</given-names></name> <name><surname>Distante</surname> <given-names>C.</given-names></name> <name><surname>Bernava</surname> <given-names>M.</given-names></name> <name><surname>Pioggia</surname> <given-names>G.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>&#x0201C;Automatic emotion recognition in robot-children interaction for ASD treatment,&#x0201D;</article-title> in <source>Proceedings of the IEEE International Conference on Computer Vision Workshops (Santiago)</source> <fpage>145</fpage>&#x02013;<lpage>153</lpage>.</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>B.</given-names></name> <name><surname>Mehta</surname> <given-names>S.</given-names></name> <name><surname>Aneja</surname> <given-names>D.</given-names></name> <name><surname>Foster</surname> <given-names>C.</given-names></name> <name><surname>Ventola</surname> <given-names>P.</given-names></name> <name><surname>Shic</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>&#x0201C;A facial affect analysis system for autism spectrum disorder,&#x0201D;</article-title> in <source>2019 IEEE International Conference on Image Processing (ICIP)</source> (<publisher-loc>Taipei</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4549</fpage>&#x02013;<lpage>4553</lpage>.</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Manocha</surname> <given-names>A.</given-names></name> <name><surname>Singh</surname> <given-names>R.</given-names></name></person-group> (<year>2019</year>). <article-title>An intelligent monitoring system for indoor safety of individuals suffering from autism spectrum disorder (ASD)</article-title>. <source>J. Ambient Intell. Human. Comput.</source> <volume>14</volume>, <fpage>15793</fpage>&#x02013;<lpage>15808</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s12652-019-01277-3</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Marinoiu</surname> <given-names>E.</given-names></name> <name><surname>Zanfir</surname> <given-names>M.</given-names></name> <name><surname>Olaru</surname> <given-names>V.</given-names></name> <name><surname>Sminchisescu</surname> <given-names>C.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;3D human sensing, action and emotion recognition in robot assisted therapy of children with autism,&#x0201D;</article-title> in <source>2018 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Salt Lake City, UT</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2158</fpage>&#x02013;<lpage>2167</lpage>.</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Negin</surname> <given-names>F.</given-names></name> <name><surname>Ozyer</surname> <given-names>B.</given-names></name> <name><surname>Agahian</surname> <given-names>S.</given-names></name> <name><surname>Kacdioglu</surname> <given-names>S.</given-names></name> <name><surname>Ozyer</surname> <given-names>G. T.</given-names></name></person-group> (<year>2021</year>). <article-title>Vision-assisted recognition of stereotype behaviors for early diagnosis of autism spectrum disorders</article-title>. <source>Neurocomputing</source> <volume>446</volume>, <fpage>145</fpage>&#x02013;<lpage>155</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.neucom.2021.03.004</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>S.</given-names></name> <name><surname>Chang</surname> <given-names>S.</given-names></name> <name><surname>Oh</surname> <given-names>J.</given-names></name></person-group> (<year>2024</year>). <article-title>Utilizing deep learning for early diagnosis of autism: detecting self-stimulatory behavior</article-title>. <source>Int. J. Adv. Cult. Technol.</source> <volume>12</volume>, <fpage>148</fpage>&#x02013;<lpage>158</lpage>. doi: <pub-id pub-id-type="doi">10.17703/IJACT.2024.12.3.148</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Prakash</surname> <given-names>V. G.</given-names></name> <name><surname>Kohli</surname> <given-names>M.</given-names></name> <name><surname>Prathosh</surname> <given-names>A. P.</given-names></name> <name><surname>Juneja</surname> <given-names>M.</given-names></name> <name><surname>Gupta</surname> <given-names>M.</given-names></name> <name><surname>Sairam</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Video-based real-time assessment and diagnosis of autism spectrum disorder using deep neural networks</article-title>. <source>Expert Syst.</source> <volume>42</volume>:<fpage>e13253</fpage>. doi: <pub-id pub-id-type="doi">10.1111/exsy.13253</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rajagopalan</surname> <given-names>S. S.</given-names></name> <name><surname>Dhall</surname> <given-names>A.</given-names></name> <name><surname>Goecke</surname> <given-names>R.</given-names></name></person-group> (<year>2013</year>). <article-title>&#x0201C;Self-stimulatory behaviors in the wild for autism diagnosis,&#x0201D;</article-title> in <source>2013 IEEE International Conference on Computer Vision Workshops</source> (<publisher-loc>Sydney</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>755</fpage>&#x02013;<lpage>761</lpage>.</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rajagopalan</surname> <given-names>S. S.</given-names></name> <name><surname>Goecke</surname> <given-names>R.</given-names></name></person-group> (<year>2014</year>). <article-title>&#x0201C;Detecting self-stimulatory behaviors for autism diagnosis,&#x0201D;</article-title> in <source>2014 IEEE International Conference on Image Processing (ICIP)</source> (<publisher-loc>Paris</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>1470</fpage>&#x02013;<lpage>1474</lpage>.</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rehg</surname> <given-names>J.</given-names></name> <name><surname>Abowd</surname> <given-names>G.</given-names></name> <name><surname>Rozga</surname> <given-names>A.</given-names></name> <name><surname>Romero</surname> <given-names>M.</given-names></name> <name><surname>Clements</surname> <given-names>M.</given-names></name> <name><surname>Sclaroff</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2013</year>). <article-title>&#x0201C;Decoding children&#x00027;s social behavior,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Portland, OR</publisher-loc>), <fpage>3414</fpage>&#x02013;<lpage>3421</lpage>.</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Roane</surname> <given-names>H. S.</given-names></name></person-group> (<year>2016</year>). <article-title>Applied behavior analysis as treatment for autism spectrum disorder</article-title>. <source>J. Pediatr.</source> <volume>175</volume>, <fpage>27</fpage>&#x02013;<lpage>32</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jpeds.2016.04.023</pub-id><pub-id pub-id-type="pmid">27179552</pub-id></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Saar-Tsechansky</surname> <given-names>M.</given-names></name> <name><surname>Provost</surname> <given-names>F.</given-names></name></person-group> (<year>2007</year>). <article-title>Handling missing values when applying classification models</article-title>. <source>J. Mach. Learn. Res.</source> <volume>8</volume>, <fpage>1625</fpage>&#x02013;<lpage>1657</lpage>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Simonyan</surname> <given-names>K.</given-names></name> <name><surname>Zisserman</surname> <given-names>A.</given-names></name></person-group> (<year>2015</year>). <article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <source>arXiv [Preprint]</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1409.1556">http://arxiv.org/abs/1409.1556</ext-link> (accessed December 7, 2023).</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Stevens</surname> <given-names>E.</given-names></name> <name><surname>Dixon</surname> <given-names>D. R.</given-names></name> <name><surname>Novack</surname> <given-names>M. N.</given-names></name> <name><surname>Granpeesheh</surname> <given-names>D.</given-names></name> <name><surname>Smith</surname> <given-names>T.</given-names></name> <name><surname>Linstead</surname> <given-names>E.</given-names></name></person-group> (<year>2019</year>). <article-title>Identification and analysis of behavioral phenotypes in autism spectrum disorder via unsupervised machine learning</article-title>. <source>Int. J. Med. Inform.</source> <volume>129</volume>, <fpage>29</fpage>&#x02013;<lpage>36</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2019.05.006</pub-id><pub-id pub-id-type="pmid">31445269</pub-id></mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Su</surname> <given-names>C.</given-names></name> <name><surname>Xu</surname> <given-names>Z.</given-names></name> <name><surname>Pathak</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>F.</given-names></name></person-group> (<year>2020</year>). <article-title>Deep learning in mental health outcome research: a scoping review</article-title>. <source>Transl. Psychiatry</source> <volume>10</volume>:<fpage>116</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41398-020-0780-3</pub-id><pub-id pub-id-type="pmid">32532967</pub-id></mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Szegedy</surname> <given-names>C.</given-names></name> <name><surname>Vanhoucke</surname> <given-names>V.</given-names></name> <name><surname>Ioffe</surname> <given-names>S.</given-names></name> <name><surname>Shlens</surname> <given-names>J.</given-names></name> <name><surname>Wojna</surname> <given-names>Z.</given-names></name></person-group> (<year>2015</year>). <article-title>Rethinking the Inception architecture for computer vision</article-title>. <source>arXiv [Preprint]</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1512.00567">http://arxiv.org/abs/1512.00567</ext-link> (accessed December 7, 2023).</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Tang</surname> <given-names>C.</given-names></name> <name><surname>Zheng</surname> <given-names>W.</given-names></name> <name><surname>Zong</surname> <given-names>Y.</given-names></name> <name><surname>Cui</surname> <given-names>Z.</given-names></name> <name><surname>Qiu</surname> <given-names>N.</given-names></name> <name><surname>Yan</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>&#x0201C;Automatic smile detection of infants in mother-infant interaction via CNN-based feature learning,&#x0201D;</article-title> in <source>Proceedings of the Joint Workshop of the 4th Workshop on Affective Social Multimedia Computing and first Multi-Modal Affective Computing of Large-Scale Multimedia Data</source> (<publisher-loc>Seoul</publisher-loc>), <fpage>35</fpage>&#x02013;<lpage>40</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3267935.3267951</pub-id></mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Thacker</surname> <given-names>C. B.</given-names></name> <name><surname>Makwana</surname> <given-names>R. M.</given-names></name></person-group> (<year>2019</year>). <article-title>Human behavior analysis through facial expression recognition in images using deep learning</article-title>. <source>Int. J. Innov. Technol. Explor. Eng.</source> <volume>9</volume>, <fpage>391</fpage>&#x02013;<lpage>397</lpage>. doi: <pub-id pub-id-type="doi">10.35940/ijitee.B6379.129219</pub-id></mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Totsika</surname> <given-names>V.</given-names></name> <name><surname>Hastings</surname> <given-names>R. P.</given-names></name> <name><surname>Emerson</surname> <given-names>E.</given-names></name> <name><surname>Lancaster</surname> <given-names>G. A.</given-names></name> <name><surname>Berridge</surname> <given-names>D. M.</given-names></name></person-group> (<year>2011</year>). <article-title>A population-based investigation of behavioral and emotional problems and maternal mental health: associations with autism spectrum disorder and intellectual disability: ASD and intellectual disability</article-title>. <source>J. Child Psychol. Psychiatry</source> <volume>52</volume>, <fpage>91</fpage>&#x02013;<lpage>99</lpage>. doi: <pub-id pub-id-type="doi">10.1111/j.1469-7610.2010.02295.x</pub-id></mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Washington</surname> <given-names>P.</given-names></name> <name><surname>Kline</surname> <given-names>A.</given-names></name> <name><surname>Mutlu</surname> <given-names>O. C.</given-names></name> <name><surname>Leblanc</surname> <given-names>E.</given-names></name> <name><surname>Hou</surname> <given-names>C.</given-names></name> <name><surname>Stockham</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Activity recognition with moving cameras and few training examples: applications for detection of autism-related headbanging,&#x0201D;</article-title> in <source>Extended Abstracts of the 2021 CHI Conference on Human Factors in Computing Systems</source> (<publisher-loc>Yokohama</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>7</lpage>.</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>P.</given-names></name> <name><surname>Ahmedt-Aristizabal</surname> <given-names>D.</given-names></name> <name><surname>Gammulle</surname> <given-names>H.</given-names></name> <name><surname>Denman</surname> <given-names>S.</given-names></name> <name><surname>Armin</surname> <given-names>M. A.</given-names></name></person-group> (<year>2023</year>). <article-title>Vision-based activity recognition in children with autism-related behaviors</article-title>. <source>Heliyon</source> <volume>9</volume>:<fpage>e16763</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.heliyon.2023.e16763</pub-id><pub-id pub-id-type="pmid">37303525</pub-id></mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>White</surname> <given-names>S. W.</given-names></name> <name><surname>Oswald</surname> <given-names>D.</given-names></name> <name><surname>Ollendick</surname> <given-names>T.</given-names></name> <name><surname>Scahill</surname> <given-names>L.</given-names></name></person-group> (<year>2009</year>). <article-title>Anxiety in children and adolescents with autism spectrum disorders</article-title>. <source>Clin. Psychol. Rev.</source> <volume>29</volume>, <fpage>216</fpage>&#x02013;<lpage>229</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cpr.2009.01.003</pub-id><pub-id pub-id-type="pmid">19223098</pub-id></mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ye</surname> <given-names>Y.</given-names></name> <name><surname>Ci</surname> <given-names>S.</given-names></name> <name><surname>Katsaggelos</surname> <given-names>A. K.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Qian</surname> <given-names>Y.</given-names></name></person-group> (<year>2013</year>). <article-title>Wireless video surveillance: a survey</article-title>. <source>IEEE Access</source> <volume>1</volume>, <fpage>646</fpage>&#x02013;<lpage>660</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2013.2282613</pub-id></mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>D.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>X.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;Estimating the uncertainty of average F1 scores,&#x0201D;</article-title> in <source>Proceedings of the 2015 ACM SIGIR International Conference on Theory of Information Retrieval</source> (<publisher-loc>Northampton, MA</publisher-loc>), <fpage>317</fpage>&#x02013;<lpage>320</lpage>.</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Z.</given-names></name> <name><surname>Zhu</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Xing</surname> <given-names>J.</given-names></name> <name><surname>Hu</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Identifying autism with head movement features by implementing machine learning algorithms</article-title>. <source>J. Autism Dev. Disord.</source> <volume>52</volume>, <fpage>3038</fpage>&#x02013;<lpage>3049</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s10803-021-05179-2</pub-id><pub-id pub-id-type="pmid">34250557</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/868601/overview">Saad Arif</ext-link>, King Faisal University, Saudi Arabia</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/777282/overview">Changiz Eslahchi</ext-link>, Shahid Beheshti University, Iran</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/855416/overview">Harish Katti</ext-link>, National Institutes of Health, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2906492/overview">Muhammad A. Khan</ext-link>, Prince Mohammad bin Fahd University, Saudi Arabia</p>
</fn>
</fn-group>
</back>
</article>