<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Toxicol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Toxicology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Toxicol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2673-3080</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1758963</article-id>
<article-id pub-id-type="doi">10.3389/ftox.2026.1758963</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Appearance-based computer vision pipeline for multi-animal monitoring of canine activity, behavior and clinical observations</article-title>
<alt-title alt-title-type="left-running-head">Eberhardt et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/ftox.2026.1758963">10.3389/ftox.2026.1758963</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Eberhardt</surname>
<given-names>Eline</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3348132"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Plochaet</surname>
<given-names>Jef</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3348543"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ophoff</surname>
<given-names>Tanguy</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>De Feyter</surname>
<given-names>Floris</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>De Landtsheer</surname>
<given-names>Sarah</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Teuns</surname>
<given-names>Greet</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/263025"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Vergauwen</surname>
<given-names>Maarten</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Feyen</surname>
<given-names>Bianca</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Goedem&#xe9;</surname>
<given-names>Toon</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Kopljar</surname>
<given-names>Ivan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/756625"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Non-Clinical Safety and Submissions, Preclinical Sciences and Translational Safety, J&#x26;J Innovative Medicine, Janssen Research &#x26; Development</institution>, <city>Beerse</city>, <country country="BE">Belgium</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>EAVISE-PSI, Department of Electrical Engineering ESAT, KU Leuven</institution>, <city>Sint-Katelijne-Waver</city>, <country country="BE">Belgium</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>Scientific and In vivo Strategies, Preclinical Sciences and Translational Safety, J&#x26;J Innovative Medicine, Janssen Research &#x26; Development</institution>, <city>Beerse</city>, <country country="BE">Belgium</country>
</aff>
<aff id="aff4">
<label>4</label>
<institution>Global Safety Pharmacology, Preclinical Sciences and Translational Safety, J&#x26;J Innovative Medicine, Janssen Research &#x26; Development</institution>, <city>Beerse</city>, <country country="BE">Belgium</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Ivan Kopljar, <email xlink:href="mailto:ikopljar@its.jnj.com">ikopljar@its.jnj.com</email>
</corresp>
<fn fn-type="equal" id="fn001">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work and share first authorship</p>
</fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-18">
<day>18</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>8</volume>
<elocation-id>1758963</elocation-id>
<history>
<date date-type="received">
<day>02</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>21</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>27</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Eberhardt, Plochaet, Ophoff, De Feyter, De Landtsheer, Teuns, Vergauwen, Feyen, Goedem&#xe9; and Kopljar.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Eberhardt, Plochaet, Ophoff, De Feyter, De Landtsheer, Teuns, Vergauwen, Feyen, Goedem&#xe9; and Kopljar</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-18">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Behavioral monitoring of laboratory animals is essential for evaluating drug safety, yet existing assessments are typically limited to in-room observations by technicians. Here, we introduce our versatile AI model pipeline, composed of interconnected artificial neural networks that leverage end-to-end learning based solely on video-derived appearance features of canines. This non-invasive approach enables detailed mapping of activity, behavior and clinical signs at individual animal level under diverse conditions. To validate its real-world application, we conducted extensive field testing on hours of footage. Trained on a large, annotated dataset, our model can accurately multi-track up to three group-housed canines using color-coded reflective harnesses, achieving high re-identification accuracies (&#x2265;92.5%) and IDF1 scores up to 99.9%. AI-derived locomotor activity showed a strong correlation with accelerometer-based measurements (r &#x3d; 0.965). Our AI model detects 11 behavior and clinical observation classes, with a mean class accuracy of 48% and individual accuracies up to 93%. As such, a detailed time-specific quantitative output is available for activity, mobility, pose, eating, drinking and specific clinical signs (ataxia, anxiety, circling, convulsions, head shaking, involuntary muscle movements, limping, limb stiff, vomiting). Our innovative approach brings holistic behavioral and health monitoring in canines closer to routine practice and contributes towards the 3Rs principles.</p>
</abstract>
<kwd-group>
<kwd>animal behavior</kwd>
<kwd>CNS effects</kwd>
<kwd>computer vision</kwd>
<kwd>longitudinal behavioral assessment</kwd>
<kwd>preclinical animal models</kwd>
<kwd>safety pharmacology</kwd>
<kwd>toxicology</kwd>
<kwd>videomonitoring</kwd>
</kwd-group>
<funding-group>
<award-group id="gs1">
<funding-source id="sp1">
<institution-wrap>
<institution>Agentschap Innoveren en Ondernemen</institution>
<institution-id institution-id-type="doi" vocab="open-funder-registry" vocab-identifier="10.13039/open_funder_registry">10.13039/100012331</institution-id>
</institution-wrap>
</funding-source>
</award-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. The work presented in this paper was carried out in the framework of project grant HBC.2021.1126 to IK, funded by the government Flanders Innovation and Entrepreneurship (VLAIO) agency; <ext-link ext-link-type="uri" xlink:href="http://www.vlaio.be/">http://www.vlaio.be</ext-link>. The funders had no role in study design, data collection and analysis, decision to publish, or preparation of the manuscript.</funding-statement>
</funding-group>
<counts>
<fig-count count="7"/>
<table-count count="2"/>
<equation-count count="3"/>
<ref-count count="42"/>
<page-count count="21"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational Toxicology and Informatics</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Before new medicines reach clinical trials in humans, a thorough non-clinical safety evaluation is made to assess potential safety risks and to guide safe clinical dosing strategies. Non-clinical safety assessments involve <italic>in vivo</italic> studies typically conducted in rodent and/or non-rodent species, providing complementary insights into potential effects of drug candidates on various physiological systems. These <italic>in vivo</italic> studies are guided by the 3Rs principle (reduce, refine, replace) to minimize the use and discomfort of animals. Among non-rodent species, Beagle dogs are frequently used because of their well-characterized biology and suitability for long-term observations.</p>
<p>In these non-clinical <italic>in vivo</italic> studies, the evaluation of behavioral changes and/or abnormalities (Clinical Observations or ClinObs) is crucial for understanding potential safety concerns in humans. Common clinical conditions can include altered activity level or behavior (e.g., increased anxiety or aggression), gastrointestinal signs (e.g., diarrhea or vomiting), motor deficits (e.g., limping) and neurological events (e.g., convulsion). The nature and/or severity of the observed abnormalities, with neurological signs as an important driver, heavily influences decisions to advance or halt a drug candidate. To enable correct decision making, it is critical that significant events are detected and correctly interpreted, distinguishing true drug-related effects from spontaneous occurrences. This is essential for ensuring that safety concerns are neither overlooked nor misjudged. Since neurological clinical signs have a low spontaneous incidence in many laboratory animal species, including canines (<xref ref-type="bibr" rid="B27">Orciani et al., 2024</xref>), quantifying the physiological baseline is extremely challenging. To increase the likelihood of event detection, both during baseline and study phases, it would be beneficial to detect premonitory signs (warning signals) which are certain behaviors that foreshadow the actual event (<xref ref-type="bibr" rid="B3">Authier et al., 2019</xref>). In canines, for example, convulsions are most frequently preceded by tremors and ataxia (<xref ref-type="bibr" rid="B3">Authier et al., 2019</xref>).</p>
<p>Traditionally, the presence of behavioral changes and/or ClinObs are assessed in-person by trained researchers which presents several limitations, including time constraints, subjectivity, and the effect of human interference on animal behavior. Video surveillance can eliminate these gaps through continuous monitoring of animal behavior. This can be combined with activity tracking methods (such as Actiwatch (<xref ref-type="bibr" rid="B11">Cools et al., 2020</xref>)) that provide a detailed overview of the animal&#x2019;s activity level over time, allowing for the identification of changes that may warrant further investigation of the corresponding video footage. These manual video analysis approaches still remain time- and labor-intensive. Additionally, not every behavioral change is associated with a variation in activity level, meaning that significant events could still be overlooked.</p>
<p>To achieve truly continuous monitoring of animal activity and abnormal behaviors, innovative approaches are required that go beyond traditional in-room observations and wearable devices (see also (<xref ref-type="bibr" rid="B6">Berridge et al., 2024</xref>)). Artificial intelligence (AI)-driven video analysis, powered by computer vision, offers a promising solution by delivering a detailed time-specific and quantitative output on individual animal level. This enables the investigation of behavioral patterns and premonitory signs, and an objective quantification of spontaneous occurrences compared to on-study events. While AI-based monitoring technologies for rodents are widely explored and commercially available for several years (<xref ref-type="bibr" rid="B21">Isik and Unal, 2023</xref>; <xref ref-type="bibr" rid="B23">Kahnau et al., 2023</xref>; <xref ref-type="bibr" rid="B33">Sillito et al., 2025</xref>), similar solutions for non-rodent species remain undeveloped.</p>
<p>Overcoming the limitations of early simple animal tracking methods that were standard in behavioral pharmacology, the emergence of deep learning brought supervised methods that improved resolution and flexibility. Markerless pose estimation frameworks such as DeepLabCut (<xref ref-type="bibr" rid="B25">Lauer et al., 2022</xref>) and SLEAP (<xref ref-type="bibr" rid="B28">Pereira et al., 2022</xref>) enabled user-defined labeling of rodent poses with high precision. Most of these approaches are keypoint-based, reducing animals to skeletal landmarks (e.g., limb joints, nose, or tail base) that feed into behavior classifiers. Commercial tools like EthoVision XT (Noldus) extend this paradigm, offering interpretable pipelines for rodent ethology. However, while efficient for gross motor actions, keypoint-only representations often miss clinically important but subtle events&#x2014;such as tremors, micro-movements, or fine distinctions like sniffing versus drinking&#x2014;since appearance cues are discarded. Applications of keypoint tracking in non-rodent species are emerging: pose-based recognition of activity states in rhesus macaques using a 26-camera setup (<xref ref-type="bibr" rid="B4">Bala et al., 2020</xref>), vision-based cattle tracking for welfare monitoring (<xref ref-type="bibr" rid="B38">Wu et al., 2021</xref>), and camera-based detection of sow induced piglet-crushing (<xref ref-type="bibr" rid="B18">Gan et al., 2025</xref>). These studies illustrate feasibility across species but remain task-specific and limited in scope compared to rodent-focused systems.</p>
<p>To address the limitations of sparse keypoint data, recent work has shifted toward appearance-based deep learning. By learning directly from pixels, these models can capture subtle postural and textural cues that keypoints overlook. DeepEthogram (<xref ref-type="bibr" rid="B8">Bohnslav et al., 2021</xref>) uses a two-stream Convolutional Neural Network (CNN) for this, albeit constrained to a classic top-down camera viewpoint. In the computer vision literature, viewpoint-free methods were presented, including AnimalMotionCLIP (<xref ref-type="bibr" rid="B41">Zhong et al., 2025</xref>), which links visual motion with semantic descriptors, and MammalNet (<xref ref-type="bibr" rid="B10">Chen et al., 2023</xref>), a large-scale wildlife recognition framework. In canines, end-to-end models could even predict emotional states from still images, underscoring the potential of appearance-based AI for clinically relevant behavioral monitoring (<xref ref-type="bibr" rid="B17">Franzoni et al., 2024</xref>). The key advantage here is sensitivity to subtlety; the main challenge remains the need for large, diverse datasets to train these models.</p>
<p>In parallel, unsupervised methods such as motion-mapping (<xref ref-type="bibr" rid="B5">Berman et al., 2014</xref>) and hidden Markov models (<xref ref-type="bibr" rid="B37">Wiltschko et al., 2015</xref>) have been used to discover behavioral &#x201c;syllables&#x201d; without predefined labels. While powerful for exploratory ethology, these approaches are less compatible with pharmaceutical safety studies, where predefined and interpretable ClinObs categories are required for regulatory acceptance.</p>
<p>Together, these developments illustrate the trade-off between interpretability, annotation burden, and sensitivity. Our present work builds on this landscape by advancing appearance-based end-to-end learning tailored to non-clinical safety studies in canines, combining clinical interpretability with the capacity to detect subtle behavioral and clinical signs. We created an integrated AI model pipeline composed of different Artificial Neural Networks (ANNs) capable of monitoring individual canine activity, behavior and ClinObs, across single and group-housed conditions.</p>
<p>In this work, we present our unique, integrated AI model pipeline which we extensively validated in real-world scenarios involving hours of footage (&#x3e;18 million frames). Thanks to our efficient annotation strategy using a single dot to mark the animal, we were able to reach an astonishing &#x3e;1.8 million annotated frames. This large dataset enabled us to develop ANNs that can recognize a variety of behaviors (eating and drinking) and ClinObs (ataxia, anxiety, circling, convulsions, head shaking, involuntary muscle movements, limping, limb stiff, vomiting), purely on video data without the requirement for additional sensors or intermediate keypoint representations. Additionally, our use of non-obtrusive visual identifiers resulted in robust individual animal tracking which is crucial to generate read-outs on individual animal level. This strategy worked even in challenging group-housed settings and complex front-view multi-camera set-ups where occluders such as enrichment objects, sleeping spots and groupmates can obstruct the animals, their movement and interactions. Finally, we designed our ANNs to be generic, which allows for their application for diverse observations even across species, provided that sufficient retraining is performed.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<label>2</label>
<title>Materials and methods</title>
<sec id="s2-1">
<label>2.1</label>
<title>Animals and housing</title>
<p>Video surveillance cameras (AXIS P3235-LVE or AXIS P3245-LVE) were positioned in front view, outside of the animal bins at &#xb1;1.2&#xa0;m height (<xref ref-type="fig" rid="F1">Figure 1c</xref>). Data was captured at 25 or 30 frames per second (fps). All the data used to train our models was solely sourced from historical video footage of previously performed in-house studies and colony animals and originated from five experimental rooms with the same housing and camera setup. All procedures were approved by the ethics committee on Animal Experiments of the research center of Johnson and Johnson, located in Beerse, Belgium. Animals were housed in accordance with the <italic>Guide for the Care and Use of Animals,</italic> European Directive of 2010 (2010/63/EU) on the protection of animals used for scientific purposes, the Belgian and Flemish Region implementing legislation, and in an AAALAC-accredited facility under controlled temperature and humidity and maintained on a 12&#xa0;h light/dark cycle. The studies reported here were compliant with the ARRIVE Guidelines for reporting animal research. Animals were under the care of trained biotechnicians with veterinary oversight and received appropriate veterinary care if needed. Enrichment was always provided to colony and study animals (toys, blankets, etc.) as well as sleeping space (baskets or beds).</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Concept for AI-based continuous monitoring of canines. <bold>(a)</bold> Illustration of our pipeline with interconnected ANN modules that are combined into an integrated AI model for activity, behavior and ClinObs tracking. Per ANN module, the required inputs and the generated outputs are shown. First, a detector ANN detects all canines in a video stream and subsequently generates a table with rectangular coordinates of each detection in each frame and its corresponding confidence. These are used to extract cut-outs of the individual animals in the video stream, which are then fed into a harness classifier, returning the specific harness the animal is wearing. The tracker module subsequently links detections from the same animal of different frames together in a single track. These tracks are then used to group the cut-outs separately per animal, which serve as input for the last two ANNs (pose classifier and behavior and ClinObs classifier). <bold>(b)</bold> Colored harnesses with reflective patterns used for animal identification and tracking: Red-None (R&#x2013;N), Yellow-Reflective stripes (Y-St) and Black-Reflective squares (B-Sq). <bold>(c)</bold> Our two different setups (bar and glass fronts) with their respective day and night vision. To allow correct recognition of behaviors and ClinObs, front view cameras are applied which are installed at &#xb1; 1.2&#xa0;m height.</p>
</caption>
<graphic xlink:href="ftox-08-1758963-g001.tif">
<alt-text content-type="machine-generated">Multi-panel scientific figure describing an AI-based video analysis pipeline for animal behavior tracking. Panel a shows a flowchart from video input through artificial neural network modules&#x2014;detector, harness classifier, tracker, pose classifier, and behavior classifier&#x2014;outputting metrics like frame coordinates, harness IDs, pose, and behavior, used for activity and clinical observation tracking. Panel b displays cropped images of three types of dog harnesses for identification: red non-reflective, yellow with reflective stripes, and black with reflective squares, each shown in visible and infrared views. Panel c presents four camera setups of a kennel: day and night views with bars and glass, illustrating lighting conditions for monitoring.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Annotations</title>
<p>
<xref ref-type="table" rid="T1">Table 1</xref> provides an overview of the total amount of annotated data. Only for the ClinObs, the majority of the data originated from the setup with bars, as the glass fronts were only recently implemented. All data was labeled by a pool of nine people with experience (5&#x2013;20&#xa0;years) in canine behavior, and an additional quality control (QC) check was performed by a single expert. All ClinObs videos were approved as being representative prior to labeling by three experts as previously described (<xref ref-type="bibr" rid="B15">Eberhardt et al., 2025</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Overview of the manually annotated video data for each ANN. Per ANN and per relevant subsection (different setup, dataset, or class): number of annotated video frames and annotation type (bounding box or dot).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">ANN</th>
<th rowspan="2" align="center">ANN subsection</th>
<th colspan="2" align="center">N&#xb0; frames</th>
<th rowspan="2" align="center">Annotation type</th>
</tr>
<tr>
<th align="center">Day</th>
<th align="center">Night</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="2" align="right">Detector (balanced bars/Glass)</td>
<td align="right">Bars</td>
<td align="center">196002</td>
<td align="center">103300</td>
<td rowspan="2" align="center">Bounding box</td>
</tr>
<tr>
<td align="right">Glass</td>
<td align="center">65176</td>
<td align="center">8937</td>
</tr>
<tr>
<td rowspan="2" align="right">Harness classifier (balanced bars/Glass)</td>
<td align="right">
<italic>Initial dataset</italic>
<break/>Red-none<break/>Yellow-stripes<break/>Black-squares<break/>Black-dots<break/>Grey-fluo<break/>No harness</td>
<td align="center">
<italic>381</italic>
<break/>70<break/>62<break/>64<break/>60<break/>61<break/>64</td>
<td align="center">
<italic>385</italic>
<break/>65<break/>66<break/>62<break/>65<break/>60<break/>67</td>
<td rowspan="2" align="center">Dot</td>
</tr>
<tr>
<td align="right">
<italic>Final dataset</italic>
<break/>Red-none<break/>Yellow-stripes<break/>Black-squares</td>
<td align="center">
<italic>18004</italic>
<break/>6878<break/>6042<break/>5084</td>
<td align="center">
<italic>17252</italic>
<break/>5268<break/>6260<break/>5724</td>
</tr>
<tr>
<td rowspan="4" align="right">Tracker (bars)</td>
<td align="right">Day-S</td>
<td align="center">2878</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
<td rowspan="4" align="center">Dot</td>
</tr>
<tr>
<td align="right">Day-C</td>
<td align="center">1746</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
</tr>
<tr>
<td align="right">Night-S</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
<td align="center">2301</td>
</tr>
<tr>
<td align="right">Night-C</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
<td align="center">1713</td>
</tr>
<tr>
<td rowspan="5" align="right">Pose classifier (bars)</td>
<td align="right">Lying</td>
<td align="center">10632</td>
<td align="center">2270</td>
<td rowspan="5" align="center">Bounding box</td>
</tr>
<tr>
<td align="right">Sitting</td>
<td align="center">4663</td>
<td align="center">642</td>
</tr>
<tr>
<td align="right">Standing</td>
<td align="center">23625</td>
<td align="center">5093</td>
</tr>
<tr>
<td align="right">Standing up</td>
<td align="center">4892</td>
<td align="center">345</td>
</tr>
<tr>
<td align="right">Standing down</td>
<td align="center">293</td>
<td align="center">40</td>
</tr>
<tr>
<td rowspan="4" align="right">Behavior classifier (balanced bars/Glass)</td>
<td align="right">Eating-hopper</td>
<td align="center">45250</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
<td rowspan="4" align="center">Bounding box/Dot</td>
</tr>
<tr>
<td align="right">Eating-bowl</td>
<td align="center">35867</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
</tr>
<tr>
<td align="right">Drinking</td>
<td align="center">60804</td>
<td align="center">16942</td>
</tr>
<tr>
<td align="right">None</td>
<td align="center">258020</td>
<td align="center">54014</td>
</tr>
<tr>
<td rowspan="13" align="right">Behavior and ClinObs classifier (majority only bars)</td>
<td align="right">Eating-hopper</td>
<td align="center">51148</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
<td rowspan="13" align="center">Bounding box/Dot</td>
</tr>
<tr>
<td align="right">Eating-bowl</td>
<td align="center">35867</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
</tr>
<tr>
<td align="right">Drinking</td>
<td align="center">57224</td>
<td align="center">16942</td>
</tr>
<tr>
<td align="right">Anxiety</td>
<td align="center">8762</td>
<td align="center" style="background-color:#AEAAAA">&#x200b;</td>
</tr>
<tr>
<td align="right">Ataxia</td>
<td align="center">39878</td>
<td align="center">3116</td>
</tr>
<tr>
<td align="right">Circling</td>
<td align="center">13021</td>
<td align="center">3059</td>
</tr>
<tr>
<td align="right">Convulsion</td>
<td align="center">24098</td>
<td align="center">3059</td>
</tr>
<tr>
<td align="right">Head shaking</td>
<td align="center">3816</td>
<td align="center">2016</td>
</tr>
<tr>
<td align="right">Inv. Muscle movements</td>
<td align="center">72526</td>
<td align="center">16162</td>
</tr>
<tr>
<td align="right">Limb stiff</td>
<td align="center">3817</td>
<td align="center">86</td>
</tr>
<tr>
<td align="right">Limping</td>
<td align="center">47334</td>
<td align="center">11131</td>
</tr>
<tr>
<td align="right">Vomiting/Retching</td>
<td align="center">7608</td>
<td align="center">3427</td>
</tr>
<tr>
<td align="right">None</td>
<td align="center">696844</td>
<td align="center">190825</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We deliberately included challenging data in our datasets to better reflect real-world conditions. These difficult cases comprised examples where distinguishing between similar behaviors or clinical observations proved particularly challenging&#x2013;for example, sniffing the drinking nipple (<italic>not drinking</italic>) versus actual drinking. Other challenging examples included occluded or overlapping animals, and observations that were very similar, such as rolling versus convulsion. Adding this complexity during training enhanced model robustness and, during evaluation, allowed for more rigorous assessment.</p>
<p>All annotations were performed using an in-house customized version of the open-source Computer Vision Annotation Tool (CVAT) (<ext-link ext-link-type="uri" xlink:href="https://github.com/cvat-ai/cvat">https://github.com/cvat-ai/cvat</ext-link>). We started with bounding box annotations which we replaced by a dot-2-box strategy to decrease labeling time once our detector ANN was finalized. In the latter, a single center-point (dot) was placed on each animal which was matched with the center point of a predicted bounding box by our trained detector. The latter was subsequently used to generate cut-outs for ANN training, validation and testing.</p>
</sec>
<sec id="s2-3">
<label>2.3</label>
<title>Performance metrics</title>
<p>Model performance was assessed using accuracy (top-1), precision, recall, and the derived F1 score, all obtained from confusion matrices. Accuracy refers to the proportion of correctly classified frames. However, this measure can be inflated by the presence of class imbalance, as canines spend much of their time in the dominant background class <italic>none</italic> and mostly lie or stand. To address this, we report both top-1 accuracy (micro accuracy across all frames) and class accuracy (the mean of individual class accuracies). Precision quantifies the proportion of predicted positive frames that are correct, while recall measures the proportion of true positive frames identified. Their harmonic mean, the F1 score, provides a balanced measure of both. Accuracy and F1 were calculated at the threshold maximizing the F1 score on the precision&#x2013;recall curve.</p>
<p>Tracking performance was measured using standard metrics (<xref ref-type="bibr" rid="B31">Ristani et al., 2016</xref>). Multi Object Tracking Accuracy (MOTA) captures overall error by combining missed detections, false positives, and identity switches. To evaluate tracking consistency, we used IDR, IDP and IDF1: identity recall (IDR) quantifies the fraction of the ground-truth tracks that are correctly recovered, identity precision (IDP) measures the proportion of predicted tracks that match ground-truth tracks, and IDF1 is their harmonic mean.</p>
</sec>
<sec id="s2-4">
<label>2.4</label>
<title>ANN modules composing our AI model pipeline</title>
<p>All ANN training, validation and testing were performed on the Domino platform (<ext-link ext-link-type="uri" xlink:href="https://domino.ai/platform">https://domino.ai/platform</ext-link>) using a single Nvidia L4 GPU (24&#xa0;GB of memory).</p>
<sec id="s2-4-1">
<label>2.4.1</label>
<title>Detector</title>
<p>The data (<xref ref-type="table" rid="T1">Table 1</xref>) was split into train-validation-test sets according to a 70%&#x2013;10%&#x2013;20% ratio, and per video rather than on a frame-level. Since frames from the same video likely resemble each other, they should not be distributed across different subsets, as this could hinder the ability to determine whether the detector is overfitting. Additionally, we balanced day/night, bars/glass and single/multiple canines across the different subsets.</p>
<p>We compared a variety of object detectors for which we selected the best architecture and training parameters on the training and validation datasets: YOLOv2 (<xref ref-type="bibr" rid="B29">Redmon and Farhadi, 2017</xref>), YOLOv3 (<xref ref-type="bibr" rid="B30">Redmon and Farhadi, 2018</xref>) YOLOv4 (<xref ref-type="bibr" rid="B7">Bochkovskiy et al., 2020</xref>), YOLT (<xref ref-type="bibr" rid="B35">Van Etten, 2018</xref>), D-YOLO (<xref ref-type="bibr" rid="B1">Acatay et al., 2018</xref>) and ResNet-YOLO (<xref ref-type="bibr" rid="B26">Ophoff et al., 2022</xref>).</p>
<p>Our most performant detector, YOLOv2 trained with the Complete Intersection-over-Union (CIoU) loss introduced in (<xref ref-type="bibr" rid="B7">Bochkovskiy et al., 2020</xref>) (Results 3.3.1), was fine-tuned from ImageNet pretraining and optimized with stochastic gradient descent. A cyclic learning rate was used ranging from 2 &#xd7; 10<sup>&#x2212;8</sup> to 10<sup>&#x2013;3</sup>. The training was conducted with an effective batch size of 32 over a maximum of 30 epochs. We selected the best model on the validation set after training. The input images were resized to 640 &#xd7; 384 pixels. The input data was normalized using ImageNet statistics and augmentation included random horizontal flipping, color jitter and geometric jitter.</p>
</sec>
<sec id="s2-4-2">
<label>2.4.2</label>
<title>Animal identification</title>
<sec id="s2-4-2-1">
<label>2.4.2.1</label>
<title>Fur approach</title>
<p>We initially investigated fur patterns for animal identification, training a ResNet-18 model (<xref ref-type="bibr" rid="B19">He et al., 2016</xref>) on a dataset of 40 canines (50 images per canine) with 5-fold cross-validation and data augmentation. This approach was validated using a nearest-neighbor classifier and three-crop evaluation, which resulted in a 63.7% top-1 accuracy that proved insufficient for reliable tracking (data not shown). The suboptimal performance was likely due to the smaller dataset combined with the variability in fur patterns depending on which part of the animal was visible. As accurate tracking of individual animals is essential, we opted to use the harness approach for identification.</p>
</sec>
<sec id="s2-4-2-2">
<label>2.4.2.2</label>
<title>Harness approach</title>
<p>To overcome the limitations of fur-based identification, we shifted to using harnesses with distinct visual features. Each harness had a unique daytime color and a reflective pattern visible at night. As the maximum number of individuals in a group is typically three, we only needed three distinct IDs. For larger group sizes, animals can be housed in sub-groups of &#x2264;3 animals each. To identify the most distinguishable combinations, we started with six harness types: <italic>Black-Dots, Black-Reflective squares, Grey-Fluo, Yellow-Reflective stripes</italic>, <italic>Red-No reflection</italic> and <italic>No harness.</italic>
</p>
<p>All harness classifier models consist of a ResNet-18 (<xref ref-type="bibr" rid="B19">He et al., 2016</xref>) backbone trained with a batch size of 100 and cross-entropy loss. The 6-ID and 4-ID models were finetuned from ImageNet1k weights using SGD with a learning rate of 0.01 for 50 epochs. The 3-ID model was finetuned from the 4-ID model using a slightly lower learning rate of 0.008 and trained for 20 epochs. Input images were resized, normalized to ImageNet statistics, and augmented with random horizontal and vertical flipping. Each model used 10-fold cross-validation to select optimal hyperparameters.<list list-type="bullet">
<list-item>
<p>6-ID Model: Trained on the initial dataset (<xref ref-type="table" rid="T1">Table 1</xref>) with all six harness types. After hyperparameter tuning, the model was trained on the full training set and evaluated on the test set, achieving a top-1 accuracy of 94.2%. Based on these results, we dropped the underperforming harnesses <italic>Black-Dots</italic> and <italic>Grey-Fluo</italic> (see <xref ref-type="sec" rid="s12">Supplementary Figure S1</xref>).</p>
</list-item>
<list-item>
<p>4-ID Model: Retrained on the reduced set of four harnesses: <italic>Black-Reflective squares</italic>, <italic>Red-No reflection</italic>, <italic>Yellow-Reflective stripes</italic>, and <italic>No harness.</italic> This model achieved a top-1 accuracy of 99.0%. As performance was consistent throughout all classes, we chose to drop the <italic>No harness</italic> class to ensure all canines were treated equally during a study.</p>
</list-item>
<list-item>
<p>3-ID Model: The final model was trained on three harnesses: <italic>Black-Reflective squares</italic> (B-Sq), <italic>Red-No Reflection</italic> (R-N), and <italic>Yellow-Reflective stripes</italic> (Y-St). To improve robustness, we expanded the dataset to include more challenging scenarios, such as occlusions and cut-outs containing multiple harnesses (<xref ref-type="table" rid="T1">Table 1</xref>). This model was evaluated on a separate set of videos, which also served as the evaluation set for the tracker (Results 3.3.3).</p>
</list-item>
</list>
</p>
</sec>
</sec>
<sec id="s2-4-3">
<label>2.4.3</label>
<title>Animal tracking and reID</title>
<p>To optimally select the correct detections and assign the correct animal ID, we developed the following strategy. The detector renders several potential detections within a frame (or across multiple camera views) which are all processed by the harness classifier model. To select the optimal combination, we utilized the confidence scores from both the detector and the harness classifier. If a canine with the same ID was detected in the previous frame, we included the IoU score between the bounding box in the previous and current frame to further refine the selection of bounding boxes (<xref ref-type="fig" rid="F2">Figure 2</xref>). Using this information, we calculated a score for each potential detection in the frame using the following formula:<disp-formula id="equ1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the confidence scores of the detector and Harness classifier model, respectively, and range from 0 to 1. <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the score between the current bounding box and the bounding box in the previous frame. When there is no previous detection, this score is set to 0. <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>d</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>r</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are weighing factors for the corresponding scores. We achieved the best results with respective values of 1.0, 0.75, and 1.0.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Multi-camera processing for accurate tracking and reID. Illustration of our tracking and reID strategy in group-housed animals across multiple camera views. All potential detections within a frame (across multiple camera views) are processed by the reID model (red shaded box) that utilizes the weighted confidence scores from both the detector and the harness classifier model to select the optimal combinations. Furthermore, if a canine with the same ID was detected in the previous frame, we employ the Intersection over Union (IoU) metric to further refine the bounding box selection. Using these three parameters, a score is calculated for each potential detection in the frame, with a higher score indicating greater certainty in our models&#x27; predictions. Finally, we apply a modified Jonker-Volgenant algorithm to find the combination that yields the highest overall score. In a last step, we use a Kalman filter for tracking the bounding boxes over time, smoothing out abrupt changes and filling in missed detections.</p>
</caption>
<graphic xlink:href="ftox-08-1758963-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a multi-step computer vision workflow for animal detection and identification in a cage from two camera views. Initial frames show bounding boxes for potential animal detections. Middle section visualizes a scoring process that combines detection confidence, harness classification, and weighted matching, followed by algorithmic identification assignment. Final output displays determined identities in colored boxes in the respective camera frames.</alt-text>
</graphic>
</fig>
<p>In this way, each detection is given a score between 0 and 1 for every ID, with a higher score indicating greater certainty in our models&#x27; predictions. To filter out less certain detections and reduce the likelihood of false positives, only detection/ID combinations with a score &#x3e;0.5 were retained. We subsequently used a modified Jonker-Volgenant algorithm (<xref ref-type="bibr" rid="B22">Jonker and Volgenant, 1987</xref>; <xref ref-type="bibr" rid="B32">SciPy, 2008</xref>; <xref ref-type="bibr" rid="B12">Crouse, 2016</xref>) to find the combination that yields the highest overall score. A significant advantage of this algorithm is that it prevents the same ID from being detected twice. In addition, because the algorithm runs on every frame, a misclassification on a certain frame does not influence the misclassification probability in subsequent frames.</p>
<p>As camera views of adjacent kennels are processed together, the same approach was applied in group-housed conditions when animals were able to move across different camera views (<xref ref-type="fig" rid="F2">Figure 2</xref>). The Harness classifier model processes every detection across all different camera views, and the modified Jonker-Volgenant algorithm continues to select the best combination, thus preventing duplicate IDs across the various camera views. Another advantage of this simultaneous processing is that the algorithm can compare all detections and assign IDs based on its certainty. For example, if only two of three harnesses are clearly visible, the algorithm can correctly assign those two IDs and infer the ID of the third animal, even though the harness itself is not visible.</p>
<p>Finally, we used a Kalman filter (<xref ref-type="bibr" rid="B24">Kalman, 1960</xref>) to track the bounding boxes over time, smooth out abrupt changes and fill in missed detections.</p>
<p>We evaluated the performance of the tracking module on a collection of video snippets covering day and night with animals either mostly separate (Day/Night-S) or clustered (closely together with overlapping bounding boxes, Day/Night-C) (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
</sec>
<sec id="s2-4-4">
<label>2.4.4</label>
<title>AI activity tracking and mobile/immobile</title>
<p>For the AI activity tracking, in postprocessing, the Euclidean distance (in pixels) was determined between the center of the current bounding box and the center of the bounding box on the previous frame/of the previous detection (<xref ref-type="fig" rid="F4">Figure 4a</xref>). In this way, a measure of movement was obtained for every animal on every frame. These movements were then binned per time frame (generally per minute) to obtain data for visualization in &#x201c;actigrams&#x201d; that provide a detailed plot of activity levels over time per animal.</p>
<p>To get a more general overview of the animals&#x2019; activity, the movements described above were classified on a frame-level as &#x2018;mobile&#x2019; or &#x2018;immobile&#x2019; based on a cutoff (<xref ref-type="fig" rid="F4">Figure 4a</xref>, and Methods 2.5.2). Mobile refers to animals moving from one point in space to another, while immobile refers to animals remaining in the same location.</p>
</sec>
<sec id="s2-4-5">
<label>2.4.5</label>
<title>Pose classifier</title>
<p>We considered five poses: <italic>Lying</italic>, <italic>Sitting</italic>, <italic>Standing</italic>, <italic>Standing Up</italic> (on hind legs) and <italic>Standing Down</italic> (on front legs, e.g., when jumping off the bench). The initial dataset was split with similar ratios of the different poses in the subsets.</p>
<p>ResNet-18 (<xref ref-type="bibr" rid="B19">He et al., 2016</xref>) served as a backbone for the pose classifier, trained using the cross-entropy loss for up to 50 epochs using a batch size of 64. Again, the best model was selected based on its performance on the validation set. SGD and a cyclic learning rate (varying between 1 &#xd7; 10<sup>&#x2212;3</sup> and 2 &#xd7; 10<sup>&#x2212;2</sup>) were used for optimization. The inputs were letterboxed (using gray to pad the borders) and resized to 256 &#xd7; 256 pixels, normalized, augmented with random horizontal flipping, color jitter. We used class balancing during training, as well as detector-generated bounding boxes to make the model robust to imperfect bounding boxes.</p>
</sec>
<sec id="s2-4-6">
<label>2.4.6</label>
<title>Behavior classifier</title>
<p>The behavior classifier receives sequential cut-outs of a single canine over time from the tracker module. Unlike previous models, which primarily process spatial information, this model also incorporates temporal information by analyzing sequential frames. This additional temporal context is crucial, as behaviors are often not apparent from a single frame alone.</p>
<p>We considered three behaviors: eating, drinking and <italic>none</italic> (no behavior of interest) (<xref ref-type="table" rid="T1">Table 1</xref>). As animals can be provided with food from either a hopper or a bowl, video material from both conditions was subdivided into respective subclasses. This subdivision into more homogeneous classes increased accuracy. During postprocessing, both subclasses were recombined into a single <italic>eating</italic> category. Additionally, we included several challenging <italic>not drinking</italic> and <italic>not eating</italic> fragments in which animals were standing near or sniffing the drinking nipple or food hopper/bowl without actually drinking or eating. This was done to help the model learn to distinguish between similar appearances more accurately.</p>
<p>The data was split into &#x201c;training-validation-test&#x201d; according to a 60%&#x2013;15%&#x2013;25% ratio with balanced day/night and bars/glass. Since the behavior model operates on entire videos and individual videos cannot be divided across different data sets, these ratios were approximate at the frame level. To address the class imbalance in the dataset during training, we employed random weighted sampling.</p>
<p>We fine-tuned a small Vision Transformer (ViT-S/16) with joint space-time attention (<xref ref-type="bibr" rid="B14">Dosovitskiy et al., 2020</xref>; <xref ref-type="bibr" rid="B2">Arnab et al., 2021</xref>; <xref ref-type="bibr" rid="B34">Tong et al., 2022</xref>), initialized from a VideoMAE checkpoint, pretrained on Kinetics-400 (<xref ref-type="bibr" rid="B34">Tong et al., 2022</xref>), as our behavior classifier. Inputs were 16-frame clips at 224 &#xd7; 224, sampled with a temporal stride of 4, yielding a 64-frame window (&#x223c;2.5&#xa0;s at 25 fps). Training used an AdamW optimizer with a cosine decay learning rate schedule (base 1 &#xd7; 10<sup>&#x2212;3</sup>, minimum 1 &#xd7; 10<sup>&#x2212;6</sup>), weight decay of 0.05 and 3 warm-up epochs, for a total of 15 epochs. Similarly, the best model on the validation set was selected. Gradient accumulation yielded an effective batch size of 150. Clips were letterbox-padded and normalized; augmentations included random horizontal flipping and temporal jitter of the bounding boxes to simulate detector noise. To further mitigate imperfect bounding boxes, we added 50 pixels of padding to each side of the bounding box to make sure the entire animal was visible.</p>
</sec>
<sec id="s2-4-7">
<label>2.4.7</label>
<title>ClinObs classifier</title>
<p>The ClinObs classifier is an extended version of the behavior classifier. Consistent with the approach used for the behavior model, this expanded dataset also included examples designed to represent nuanced clinical observations and the challenges in distinguishing them from similar behaviors and background activity (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
<p>Building upon the behavior classifier, we trained the ClinObs model using the same ViT-S/16 architecture with joint space-time attention (<xref ref-type="bibr" rid="B14">Dosovitskiy et al., 2020</xref>; <xref ref-type="bibr" rid="B2">Arnab et al., 2021</xref>; <xref ref-type="bibr" rid="B34">Tong et al., 2022</xref>), initialized from a VideoMAE checkpoint pretrained on Kinetics-400 (<xref ref-type="bibr" rid="B34">Tong et al., 2022</xref>). We split the data into training, validation, and test sets using the same 60%-15%&#x2013;25% ratio, still maintaining balanced representation of day/night conditions and bars/glass environments. We used a batch size of 14 with gradient accumulation over 10 batches, yielding an effective batch size of 140. The model&#x2019;s input was the same as the behavior model: 224 &#xd7; 224 images, with each input clip consisting of 16 frames sampled with a stride of 4. We still address the class imbalance with random weighted sampling. We trained the model for 75 epochs, including a 15-epoch warm-up period, utilizing an AdamW optimizer with a cosine decay learning rate schedule (base 1 &#xd7; 10<sup>&#x2212;3</sup>, minimum 1 &#xd7; 10<sup>&#x2212;6</sup> and weight decay of 0.05). Once more, the best model on the validation set was selected. Consistent with the behavior model, we applied letterbox padding and normalization of the clips, implemented temporal jitter augmentation, and added 50 pixels of padding to each side of the bounding box. To prevent overfitting, we implemented mixup (<xref ref-type="bibr" rid="B40">Zhang et al., 2017</xref>) and cutmix (<xref ref-type="bibr" rid="B39">Yun et al., 2019</xref>) augmentation techniques, alongside RandAug (<xref ref-type="bibr" rid="B13">Cubuk et al., 2020</xref>) with 9 augmentations and a magnitude of 15. 
As before, we split eating into the eating-hopper and eating-bowl subclasses during training; additionally, we split ataxia and involuntary muscle movements (IVM) into subcategories representing varying degrees of severity (slight-moderate-severe) during training, with these subcategories merged for testing.</p>
</sec>
</sec>
<sec id="s2-5">
<label>2.5</label>
<title>Field validation of our AI model pipeline</title>
<p>For all field validations explained below, cameras recorded at 25 or 30&#xa0;fps and a subsampling of 4 was applied in our AI pipeline, meaning that only every fourth frame was analyzed. We applied this subsampling to maintain an acceptable, close-to-real-time processing speed of our pipeline, which is important for future implementation. The performance of our pipeline was assessed by repeatedly processing a 1-h single-camera video and a 1-h double-camera video ten times each. The average processing time for the single-camera feed (30&#xa0;fps) was 54&#xa0;min and 0&#xa0;s (SD 32&#xa0;s), demonstrating real-time performance. In contrast, processing the double-camera feed (30&#xa0;fps) required an average of 1&#xa0;h 48&#xa0;min and 58&#xa0;s (SD 102&#xa0;s). All tests were conducted on the same single Nvidia L4 GPU.</p>
<p>
<xref ref-type="table" rid="T2">Table 2</xref> provides an overview of the different video fragments that were analyzed for each field validation.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Overview of the video fragments analyzed for each field validation. Number of selected video fragments per day/night, including snippet duration, total video time and number of frames that were analyzed. It is specified whether the fragments contain single- or group-housed animals, the type of enclosure (bars or glass fronts), how the ground truth was established and whether the animals were wearing harnesses.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Field validation</th>
<th colspan="2" align="center">N&#xb0; video fragments</th>
<th rowspan="2" align="center">Length video fragment</th>
<th rowspan="2" align="center">Total video time</th>
<th rowspan="2" align="center">Total N&#xb0; frames</th>
<th rowspan="2" align="center">Single/group housed</th>
<th rowspan="2" align="center">Glass/Bars</th>
<th rowspan="2" align="center">Ground truth</th>
<th rowspan="2" align="center">Animals wearing harnesses?</th>
</tr>
<tr>
<th align="center">Day</th>
<th align="center">Night</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td colspan="10" align="left">AI activity tracking</td>
</tr>
<tr>
<td align="right">Activity tracking</td>
<td align="center">6</td>
<td align="center">6</td>
<td align="center">2&#xa0;h</td>
<td align="center">24&#xa0;h</td>
<td align="center">N.A.</td>
<td align="center">Group (N &#x3d; 3)</td>
<td align="center">Bars</td>
<td align="center">Actiwatch-nano&#xae;</td>
<td align="center">Yes</td>
</tr>
<tr>
<td rowspan="3" align="right">reID and tracking</td>
<td align="center">11</td>
<td align="center">10</td>
<td align="center">5&#xa0;min</td>
<td align="center">101.5&#xa0;min</td>
<td align="center">39,748</td>
<td align="center">Group (N &#x3d; 3)</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td align="center">7</td>
<td align="center">7</td>
<td align="center">5&#xa0;min</td>
<td align="center">70&#xa0;min</td>
<td align="center">26,225</td>
<td align="center">Group (N &#x3d; 3)</td>
<td align="center">Glass</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td align="center">9 (3/group)</td>
<td align="center">9 (3/group)</td>
<td align="center">5&#xa0;min</td>
<td align="center">90&#xa0;min (30&#xa0;min/group)</td>
<td align="center">33,939 (&#x223c;11,300/group)</td>
<td align="center">Group (N &#x3d; 2)</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td rowspan="3" align="right">Mobile/immobile</td>
<td align="center">14</td>
<td align="center">0</td>
<td align="center">10&#xa0;min</td>
<td align="center">140&#xa0;min</td>
<td align="center">50,713</td>
<td align="center">Single</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">No</td>
</tr>
<tr>
<td align="center">3 &#x2a;</td>
<td align="center">3 &#x2a;</td>
<td align="center">5&#xa0;min</td>
<td align="center">90&#xa0;min</td>
<td align="center">28,771</td>
<td align="center">Group (N &#x3d; 3)</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td colspan="9" align="left">
<italic>&#x2a; Each video fragment contained 3 different animals for which the mobile/immobile status was assessed individually</italic>
</td>
</tr>
<tr>
<td align="right">Pose</td>
<td align="center">14</td>
<td align="center">0</td>
<td align="center">10&#xa0;min</td>
<td align="center">140&#xa0;min</td>
<td align="center">50,713</td>
<td align="center">Single</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">No</td>
</tr>
<tr>
<td colspan="10" align="left">Behaviors</td>
</tr>
<tr>
<td rowspan="2" align="right">Eating-hopper (incl. &#x201c;not eating&#x201d;)</td>
<td align="center">4</td>
<td align="center">0</td>
<td align="center">15&#xa0;min</td>
<td align="center">1&#xa0;h</td>
<td align="center">22,511</td>
<td align="center">Single</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">0</td>
<td align="center">15&#xa0;min</td>
<td align="center">1&#xa0;h</td>
<td align="center">22,518</td>
<td align="center">Single</td>
<td align="center">Glass</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td rowspan="2" align="right">Eating-bowl (incl. &#x201c;not eating&#x201d;)</td>
<td align="center">4</td>
<td align="center">0</td>
<td align="center">15&#xa0;min</td>
<td align="center">1&#xa0;h</td>
<td align="center">22,201</td>
<td align="center">Single</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">0</td>
<td align="center">15&#xa0;min</td>
<td align="center">1&#xa0;h</td>
<td align="center">22,170</td>
<td align="center">Single</td>
<td align="center">Glass</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td rowspan="2" align="right">Drinking</td>
<td align="center">13</td>
<td align="center">2</td>
<td align="center">Variable</td>
<td align="center">7.9&#xa0;min</td>
<td align="center">2,969</td>
<td align="center">Mixed</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td align="center">12</td>
<td align="center">2</td>
<td align="center">Variable</td>
<td align="center">9.5&#xa0;min</td>
<td align="center">3,579</td>
<td align="center">Mixed</td>
<td align="center">Glass</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td rowspan="2" align="right">&#x201c;Not drinking&#x201d;</td>
<td align="center">7</td>
<td align="center">2</td>
<td align="center">Variable</td>
<td align="center">6.2&#xa0;min</td>
<td align="center">2,339</td>
<td align="center">Mixed</td>
<td align="center">Bars</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td align="center">12</td>
<td align="center">2</td>
<td align="center">Variable</td>
<td align="center">7.2&#xa0;min</td>
<td align="center">2,702</td>
<td align="center">Mixed</td>
<td align="center">Glass</td>
<td align="center">Manual</td>
<td align="center">Yes</td>
</tr>
<tr>
<td align="right">Real-life validation</td>
<td align="center">10</td>
<td align="center">0</td>
<td align="center">4&#xa0;h</td>
<td align="center">40&#xa0;h</td>
<td align="center">&#x223c;1,000,000</td>
<td align="center">Single</td>
<td align="center">Bars</td>
<td align="center">Manual (event-level)</td>
<td align="center">Yes</td>
</tr>
<tr>
<td colspan="10" align="left">ClinObs</td>
</tr>
<tr>
<td align="right">Ataxia, IVM<break/>Head shaking</td>
<td colspan="2" align="center">27 (9 animals, 3&#xa0;days)</td>
<td align="center">24&#xa0;h</td>
<td align="center">648&#xa0;h</td>
<td align="center">&#x223c;17,500,000</td>
<td align="center">Mixed</td>
<td align="center">Bars</td>
<td align="center">
<italic>In person</italic> and<break/>Manual event-level</td>
<td align="center">Yes</td>
</tr>
</tbody>
</table>
</table-wrap>
<sec id="s2-5-1">
<label>2.5.1</label>
<title>Field validation of activity tracking, reID and behavioral classification</title>
<p>We selected six female Beagle dogs from our colony to wear the three different harness-types: Y-St (two animals), B-Sq (two animals) and R-N (two animals). Animals were group-housed with three groups of N &#x3d; 2 in a room with bars to include all three possible harness combinations. Subsequently, groups were reorganized into two groups of N &#x3d; 3 in a room with glass fronts, followed by bars. After a habituation period in a room with bars, all animals (two groups of N &#x3d; 3) were equipped with an Actiwatch-Nano&#xae; accelerometer attached to their harnesses to record their activity.</p>
<p>For the activity tracking validation, 2-h videos were selected with varying activity levels in all animals, both during day and night (<xref ref-type="table" rid="T2">Table 2</xref>). The Actiwatch-Nano&#xae; provided one readout (activity count) per minute. These were compared to the AI activity levels which were calculated using the Euclidean distance (in pixel, as explained above in &#x201c;AI activity tracking and mobile/immobile&#x201d; and <xref ref-type="fig" rid="F4">Figure 4A</xref>). Accelerometer and AI data were compared in two ways: i) by visual inspection of individual actigrams; and ii) by assessing their correlation. For the latter, we calculated the non-parametric Spearman correlation coefficient using GraphPad Prism 10.1.2 for activity read-outs per minute, and binned per 15&#xa0;min and per 2-h period.</p>
<p>To validate our tracking and reID model, snippets were specifically selected to include a variety of interactions in group-housed animals to challenge our model: walking or running around, animals crossing each other, playing, resting separately and together, etc. (<xref ref-type="table" rid="T2">Table 2</xref>). The ground truth ID was manually established on every frame and tracking metrics were calculated using the motmetrics python library (<xref ref-type="bibr" rid="B20">Heindl, 2019</xref>).</p>
<p>For the validation of the behavioral classifier, we evaluated the performance of <italic>drinking</italic>, <italic>eating</italic> from a food hopper, <italic>eating</italic> from a bowl, and <italic>none</italic> (<xref ref-type="table" rid="T2">Table 2</xref>). The <italic>none</italic> category was evaluated on the <italic>eating</italic> and <italic>drinking</italic> fragments as they contained typical <italic>none</italic> behavior like animals walking around, jumping on and off bench, animal-animal interactions, etc. To challenge our model, we also included several <italic>not eating</italic> and <italic>not drinking</italic> fragments. The overall confusion matrix was generated by summing frames across all videos for both the manually established ground truth and AI model.</p>
<p>In a final real-life validation, 4-h footage spanning the entire food access period, was analyzed for 10 animals (<xref ref-type="table" rid="T2">Table 2</xref>). Our model predictions were manually checked on video and the entire footage was also visually checked for missed detections by the model. Accuracy was calculated on event-level:<disp-formula id="equ2">
<mml:math id="m8">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mo>&#x23;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>e</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>y</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>l</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#x23;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>e</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>m</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>e</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mn>100</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
</sec>
<sec id="s2-5-2">
<label>2.5.2</label>
<title>Field validation of mobile/immobile</title>
<p>The validation of mobile/immobile was done in two parts (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
<p>For the first part, 10&#xa0;min fragments were selected of single-housed animals from prior studies in which either increased or decreased activity was registered. The ground truth whether the animal was mobile or immobile was manually established on every frame. To determine the most optimal cutoff for mobile/immobile classification, an ROC (Receiver Operating Characteristic) curve was calculated that plots the TPR (True Positive Rate) in function of the FPR (False Positive Rate) for every cutoff. We evaluated values between 0 and 20 with 0.01 increments and calculated the Youden&#x2019;s Index for every point to find the best trade-off between TPR and FPR:<disp-formula id="equ3">
<mml:math id="m9">
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:msup>
<mml:mi>n</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mi>s</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>I</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:math>
</disp-formula>
</p>
<p>The highest Youden&#x2019;s Index of 0.91 corresponded to a cutoff of 10.0. This cutoff was subsequently validated in group-housed animals.</p>
</sec>
<sec id="s2-5-3">
<label>2.5.3</label>
<title>Field validation of pose classification</title>
<p>The same 10&#xa0;min video fragments with increased/decreased activity were analyzed to have a more balanced distribution of the more &#x201c;excited&#x201d; poses (standing, standing up and standing down) vs. the more &#x201c;quiet&#x201d; poses (sitting and lying) (<xref ref-type="table" rid="T2">Table 2</xref>).</p>
</sec>
<sec id="s2-5-4">
<label>2.5.4</label>
<title>Field validation of ClinObs classification</title>
<p>We selected nine female Beagle dogs and randomly divided them into three groups of three animals wearing the different harness types. All animals were observed for two baseline days prior to dosing. On the third day, each group received a different treatment by oral gavage: 1) Acepromazine 2.5&#xa0;mg eq/kg (Sigma-Aldrich&#xae;, Burlington, MA, US); 2) Transcutol&#xae; HP 1500&#xa0;mg/kg (Gattefoss&#xe9;, Saint-Priest, France); and 3) Water as vehicle control (<xref ref-type="fig" rid="F7">Figure 7a</xref>). As per standard protocol, we observed the animals <italic>in person</italic> for any change in behavior and/or ClinObs during specific time periods post-dosing: 5&#xa0;min, 30&#xa0;min, 1&#xa0;h, 2&#xa0;h, 4&#xa0;h, 7&#xa0;h, and 10&#xa0;h. To assess whether our AI model was able to detect the same ClinObs and behavioral changes as noted during the <italic>in person</italic> observations, we analyzed the 24-h footage of both baseline days and the dosing day using our AI pipeline, followed by human QC of significant/aberrant AI signals (as illustrated in <xref ref-type="fig" rid="F7">Figures 7c,d</xref>). For visualization purposes, framewise AI predictions were summarized per minute by binning the number of predictions per class within that minute.</p>
<p>On top of the framewise predictions, we implemented an objective, adaptive baseline thresholding strategy to highlight significant differences between the baseline and treatment days. For this, a moving average with a 5-min time window was applied on the framewise predictions to account for the temporal aspect. Individual thresholds were then calculated for each animal and each ClinObs class by determining the <italic>minimum</italic> number of predictions per minute required to suppress all predictions for that class during the baseline period. This threshold was subsequently applied to the dosing day data.</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<sec id="s3-1">
<label>3.1</label>
<title>AI model approach</title>
<p>Our goal was to develop an integrated AI model for continuous monitoring of canine activity, behavior and clinical observations (ClinObs) tailored for experimental purposes in nonclinical research and compliant with animal welfare standards. To accomplish this, we developed a pipeline consisting of multiple ANNs (<xref ref-type="fig" rid="F1">Figure 1a</xref>): a detector, an identification and tracking module, followed by pose, behavior and ClinObs classifiers. As our AI model would be implemented during both day and night, animals are wearing colored harnesses with distinct reflective patterns (<xref ref-type="fig" rid="F1">Figure 1b</xref>) to facilitate visual identification. In addition, two distinctive animal room setups were used, with either bar or glass fronts (<xref ref-type="fig" rid="F1">Figure 1c</xref>). Unlike typical rodent setups that utilize top-down camera views, we apply front view cameras for accurate recognition of behaviors and ClinObs.</p>
<p>In our pipeline, the interconnected ANNs rely on the output of the previous models, making it crucial to develop robust ANNs and add redundancy to our pipeline. On top of a performant detector ANN, the most crucial part in our integrated AI model is accurate animal identification and tracking as the ANNs in subsequent stages of the pipeline require consistent cut-outs of the same animal through time for correct predictions. In practical situations, animals are typically group-housed and have access to multiple kennels, resulting in movement across different camera views. To achieve the most performant identification and prevent duplicate IDs, camera streams of adjacent kennels are processed jointly (<xref ref-type="fig" rid="F2">Figure 2</xref> and Methods 2.4.3).</p>
<p>We first validated the performances of individual ANNs on their respective test sets (<xref ref-type="fig" rid="F3">Figure 3</xref>). These datasets, however, do not account for error propagation across the integrated pipeline, where mistakes from one ANN may cascade to subsequent stages. To address this, we additionally performed a large-scale field validation on extended video recordings, evaluating the end-to-end performance of activity tracking (including reID and pose classification), behavior, and ClinObs classification.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>ANN Performance on test datasets. <bold>(a)</bold> The precision-recall curve of our highest performing detector ANN at different confidence thresholds for all (green), day-only (orange), night-only (blue) detections. The selected threshold (marked as X) of 45.6% corresponds to the optimal F1 point of the precision and recall. The TIDE analysis shows the types of mistakes made by the detector. <bold>(b)</bold> Performance of our harness classifier ANN. <bold>(c&#x2013;f)</bold> Tracking performance based on the tracking and reID strategy from <xref ref-type="fig" rid="F2">Figure 2</xref>. Tracking metrics are evaluated on a total of 8,638 frames giving 23,283 targets divided over several day and night videos with animals mostly separate (Day/Night-S) or clustered close together (Day/Night-C). All metrics are represented as mean &#xb1; SD with each dot representing an individual video. IDs MT/PT/ML: number of IDs that are Mostly Tracked (&#x3e;80%, green)/Partially Tracked (20%&#x2013;80%, orange)/Mostly Lost (&#x3c;20%, red). <bold>(g)</bold> Performance of our pose classifier ANN using detector-generated cut-outs. False positive and false negative detections are excluded from the confusion matrix, as animals were not wearing harnesses for correct identification and tracking. <bold>(h)</bold> Frame-level performance of our behavior classifier ANN on manually annotated videos using dot-2-box. <bold>(i)</bold> Frame-level performance of our behavior and ClinObs classifier ANN on manually annotated videos using dot-2-box: recall is depicted in the confusion matrix and precision in the bar chart. IVM: involuntary muscle movement (tremors and/or twitches).</p>
</caption>
<graphic xlink:href="ftox-08-1758963-g003.tif">
<alt-text content-type="machine-generated">Infographic presenting a multi-stage animal detection and behavior classification pipeline using deep learning models, including YOLOv2 for detection, ResNet-18 for harness and pose classification, and ViT Small for behavior classification. Multiple panels display precision-recall curves, confusion matrices, bar graphs for performance metrics across day and night, and error analysis. Small icons of animals and colored bars distinguish different models and outcomes.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Datasets</title>
<p>Our dataset consists of &#x3e;1.8 million manually annotated frames across 1,125 video snippets that span a large variation of single- and group-housed canines with and without visual identifiers, different poses and especially behaviors and ClinObs seen from our fixed video surveillance cameras (<xref ref-type="table" rid="T1">Table 1</xref>). The dataset includes footage from five different animal rooms with an identical camera view and blue background, but with variables such as enrichment (toys, blankets), bedding and different front door views.</p>
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>ANN performance on their respective test sets</title>
<sec id="s3-3-1">
<label>3.3.1</label>
<title>Detector</title>
<p>Out of the several detector ANNs evaluated (<xref ref-type="sec" rid="s12">Supplementary Figure S1A</xref>), the best performing model was YOLOv2 (<xref ref-type="bibr" rid="B29">Redmon and Farhadi, 2017</xref>). Our detector ANN reached both high precision and recall, reflected by an F1 score of 94.8% (<xref ref-type="fig" rid="F3">Figure 3a</xref>). Performance was slightly better on daytime footage (F1 of 96.1%) than during night (91.1%). This was expected given that approximately 65% of our data contained day footage, while nocturnal conditions pose greater challenges. To verify the accuracy of the generated bounding boxes that downstream ANNs rely on, we calculated the average Intersection over Union (IoU) between the detector output and the annotated bounding boxes. Our detector ANN reached an overall average IoU of 80.5%, with a lower IoU at night (73.9%) compared to day (82.7%) due to reduced visibility in greyscale imaging and the natural tendency of canines to cluster together while sleeping. These values are considered very high since an IoU &#x2265;50% is typically regarded as a true positive match in computer vision literature (<xref ref-type="bibr" rid="B16">Everingham et al., 2010</xref>; <xref ref-type="bibr" rid="B29">Redmon and Farhadi, 2017</xref>).</p>
<p>A TIDE (Toolkit for Identifying Detection and segmentation Errors) analysis (<xref ref-type="bibr" rid="B9">Bolya et al., 2020</xref>) offered insight in the sources of errors our detector model made. It showed that all types of mistakes made by our detector ANN occurred more during the night (&#x2264;3%) compared to the day (&#x2264;1%), correlating to the ANN&#x2019;s decreased performance (<xref ref-type="fig" rid="F3">Figure 3a</xref>). Most errors at night were missed detections followed by duplicate detections (canines were generally not wearing harnesses in this footage) and localization errors (low IoU overlap). The detector ANN showed very low background detections (erroneous identification of non-existing canines) across both conditions, indicating a low propensity for false positives.</p>
<p>Note that YOLOv2 was relevant at the time of our research. Although novel architectures have become available in the meantime, we opted not to test them, as our detector model and the resulting animal tracking (<xref ref-type="sec" rid="s3-3-3">Section 3.3.3</xref>, <xref ref-type="sec" rid="s3-4-2">3.4.2</xref>.) are very performant and the potential gain would be minimal.</p>
</sec>
<sec id="s3-3-2">
<label>3.3.2</label>
<title>Animal identification</title>
<p>Before collecting a large dataset to train a robust animal identification model, we initially gathered smaller datasets to determine the most effective approach (see Methods 2.4.2, <xref ref-type="sec" rid="s12">Supplementary Figure S1B</xref>). This preliminary step allowed us to evaluate different strategies and select the best-performing methods for our final model.</p>
<p>Our final approach utilizes a ResNet-18-based ANN that can classify 3 IDs based on color during the day and the reflective pattern during the night: <italic>Black-Reflective squares</italic> (B-Sq), <italic>Red-No Reflection</italic> (R-N), and <italic>Yellow-Reflective stripes</italic> (Y-St) (<xref ref-type="fig" rid="F1">Figure 1b</xref>). This ANN was trained on a larger dataset that included more challenging data, such as occluded animals and image cut-outs that contained multiple harnesses (<xref ref-type="table" rid="T1">Table 1</xref>). Our final harness classifier ANN achieved an overall top-1 accuracy of 80.3% with recognition of the different harness types varying from 71.9% (B-Sq) to 85.0% (R-N) (<xref ref-type="fig" rid="F3">Figure 3b</xref>).</p>
</sec>
<sec id="s3-3-3">
<label>3.3.3</label>
<title>Animal tracking and reID</title>
<p>Our strategy of combining a modified Jonker Volgenant algorithm (<xref ref-type="bibr" rid="B22">Jonker and Volgenant, 1987</xref>; <xref ref-type="bibr" rid="B32">SciPy, 2008</xref>; <xref ref-type="bibr" rid="B12">Crouse, 2016</xref>) with a Kalman filter across multiple camera views (<xref ref-type="fig" rid="F2">Figure 2</xref>) resulted in excellent tracking and reID performance (<xref ref-type="fig" rid="F3">Figures 3c&#x2013;f</xref>), where reID (re-identification) is defined as assigning the correct harness color (ID) to every animal on every frame. In general, there were no notable differences in tracking performance between separate and clustered animals, apart from a tendency of more fragmentations in the clustered videos. This was also the case for day and night, where day scored only marginally higher for IDF1, IDP and IDR. Across all conditions, we reached average scores of 85.1% MOTA, 93.0% IDF1, 98.4% IDP and 88.4% IDR; and 11.0% missed detections (<xref ref-type="fig" rid="F3">Figures 3c&#x2013;f</xref>).</p>
<p>IDF1 scores for both separate and clustered animals were outstanding for day (&#x223c;94%) and night (&#x223c;91&#x2013;92%) conditions, implicating that the correct ID was assigned to nearly each detected animal (<xref ref-type="fig" rid="F3">Figure 3c</xref>). This was also reflected in the close to perfect IDP score (<xref ref-type="fig" rid="F3">Figure 3f</xref>). Of the 71 ground truth identities, 57 IDs were mostly tracked (tracked for more than 80% of their lifespan), 11 were partially tracked (between 20%&#x2013;80%), and only 3 were mostly lost (less than 20%) (<xref ref-type="fig" rid="F3">Figure 3e</xref>). The MOTA score accounts for missed detections, false positives, and ID switches, which typically leads to lower values than IDF1 that focuses more on identity preservation. Nevertheless, our MOTA scores of 77.0%&#x2013;89.5% still indicated strong tracking performance (<xref ref-type="fig" rid="F3">Figure 3d</xref>). Indeed, apart from a few outliers, the number of missed detections was low across all video snippets, which is also mirrored in high IDR scores (&#x223c;86&#x2013;90%) (<xref ref-type="fig" rid="F3">Figure 3f</xref>). Considering the huge number of 23,283 targets (i.e., individual animal cut-outs), we observed a negligible number of switches, fragmentations and false positives (<xref ref-type="fig" rid="F3">Figure 3f</xref>).</p>
<p>The fact that our IDR score was higher than the performance of our harness classifier model alone, implied that the Jonker-Volgenant algorithm had a positive effect on ID consistency.</p>
</sec>
<sec id="s3-3-4">
<label>3.3.4</label>
<title>Pose classifier</title>
<p>The pose classifier receives cut-outs from a single canine through time and predicts the pose on a frame level (<xref ref-type="fig" rid="F1">Figure 1a</xref>). Our ResNet-18 model reached a top-1 accuracy of 95.2% when using detector-generated cut-outs instead of manually annotated ones (<xref ref-type="fig" rid="F3">Figure 3g</xref>). Lying and Standing classes were the most performant (accuracies of &#x223c;97%), followed by Standing Up (88.7%) and Sitting (82.9%). Standing Down was the least performant class with 55.0% accuracy (<xref ref-type="fig" rid="F3">Figure 3g</xref>), also related to the very low number of occurrences in our dataset compared to the other poses (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
</sec>
<sec id="s3-3-5">
<label>3.3.5</label>
<title>Behavior classifier</title>
<p>Our initial ViT Small model for behavior was trained for drinking and eating (<xref ref-type="fig" rid="F3">Figure 3h</xref>). The model achieved a top-1 accuracy of 94.0% and a class accuracy of 92.7% calculated on a frame-level. It performed well on both behavior classes, with accuracies of 93.1% for drinking and 89.6% for eating, while rarely confusing these (<xref ref-type="fig" rid="F3">Figure 3h</xref>). When errors occurred, they were typically misclassifications between a behavior and the background class (<italic>none</italic>), or <italic>vice versa</italic>, suggesting that the model has learned to distinguish these behavioral patterns. Furthermore, high accuracy on the background class <italic>none</italic> (95.4%) indicates a low number of false positives (<xref ref-type="fig" rid="F3">Figure 3h</xref>).</p>
</sec>
<sec id="s3-3-6">
<label>3.3.6</label>
<title>ClinObs classifier</title>
<p>Building onwards from the ViT Small Behavior model, our ClinObs model demonstrated promising results in detecting 11 distinct behaviors and clinical observations on a frame-level, achieving a top-1 accuracy of 79.0% and a class accuracy of 47.9%. Our model maintained strong performance on behavioral classes such as drinking and eating, highlighting its ability to reliably detect common observations. As expected, classes with limited representation in the training data such as anxiety, limb stiffness, and vomiting presented greater challenges for the model. Because of this extreme&#x2014;but unavoidable&#x2014;class imbalance in the training data, the performance ranged substantially between classes: drinking (93.2%), none (90.8%), eating (87.3%), circling (86.6%), convulsion (43.2%), ataxia (40.4%), head shaking (40.1%), limping (35.2%), involuntary muscle movements (IVM, 30.1%), anxiety (20.0%), vomiting (6.1%) and limb stiff (2.1%) (<xref ref-type="fig" rid="F3">Figure 3i</xref>). Notably, our model occasionally confused IVM with convulsion (4.9%), convulsion with ataxia (16.6%), and limb stiff with limping (30.4%), which is understandable due to the visual similarity of these ClinObs (<xref ref-type="fig" rid="F3">Figure 3i</xref>). Because of the inherent class imbalance, the model showed a greater inclination toward predicting the background class None. Across nearly all classes, the precision was higher than the recall, indicating that the model leans more towards correctly predicting instances, at the expense of missing some true positive events.</p>
</sec>
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Field validation of our integrated AI pipeline</title>
<sec id="s3-4-1">
<label>3.4.1</label>
<title>Field validation of activity tracking</title>
<p>Accurate AI Activity tracking relies on correct outputs from the detector, harness classifier and tracker (<xref ref-type="fig" rid="F1">Figure 1a</xref>). To validate our AI Activity tracking, we compared it to Actiwatch-Nano&#xae;, an accelerometer-based device routinely used during nonclinical studies performed in this facility. We calculated the animals&#x2019; activity based on the movement of their bounding boxes (Methods 2.4.4, <xref ref-type="fig" rid="F4">Figure 4a</xref>).</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Field validation of Activity Tracking and Pose Classification. <bold>(a)</bold> For the AI activity tracking, in postprocessing, the Euclidean distance (in pixel) is calculated between the center of the bounding box on the current and the previous frame/detection for the same animal. The pixel difference represents a measure of movement per frame. Based on the pixel difference, a cutoff of 10 was established to classify bounding box/animal movements on a frame-level as &#x2018;mobile&#x2019; or &#x2018;immobile&#x2019; which are compared to manual assessment. Mobile refers to animals moving from one point in space to another, while immobile refers to animals remaining in the same location. <bold>(b)</bold> Comparison of AI activity tracking to Actiwatch-Nano&#xae; on individual actigram level and by correlation of activity read-outs binned per minute, 15&#xa0;min and per 2&#xa0;h. (<bold>(c)</bold>, left) Confusion matrices for mobile/immobile validation on single- (day) and group-housed (day and night) animals. (<bold>(c)</bold>, right) Confusion matrix for the validation of our pose classifier compared to manual ground truth on single-housed animals with increased and decreased activity.</p>
</caption>
<graphic xlink:href="ftox-08-1758963-g004.tif">
<alt-text content-type="machine-generated">Figure containing three panels summarizing AI-based animal activity tracking and validation: (a) schematic showing pixel-based movement detection compared to accelerometer counts; (b) line graphs comparing AI and Nano activity data across light and dark cycles, and scatter plots displaying strong correlations; (c) confusion matrices for mobile/immobile and pose validation, highlighting high agreement between ground truth and model predictions for single and group housed animals.</alt-text>
</graphic>
</fig>
<p>After normalization of both metrics, 2-h actigrams for both activity measurements were visually very similar for all animals, across a range of activity levels (<xref ref-type="fig" rid="F4">Figures 4b</xref>, <xref ref-type="fig" rid="F5">5a</xref>). During daytime, normalized accelerometer values were sometimes &#x201c;larger&#x201d; compared to AI. These generally corresponded to periods of excitement in which animals were jumping or rubbing on the floor. As the accelerometer incorporates movements over x, y and z-axes, these short, intense movements resulted in very high counts while the actual &#x201c;distance moved&#x201d; was much smaller. As our AI activity tracking uses a uniform pixels-based measurement irrespective of the intensity, it resulted in a more objective representation of the actual &#x201c;distance moved&#x201d;. For linear movements (animals walking or running), there was a perfect overlay of activity read-outs of both techniques upon normalization. During the night, AI activity patterns were still highly similar to the accelerometer but showed more differences due to imperfect bounding boxes (night is a more challenging scenario, see also <xref ref-type="fig" rid="F3">Figure 3a</xref>) and occasional ID switches. Similar to daytime, AI activity tracking more accurately represented the &#x2018;distance moved&#x2019;.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Field validation of reID and tracking. <bold>(a)</bold> To validate our tracking and reID models, several 5-min snippets (blue shading) were selected with variating activity levels and animal interactions across diverse conditions (different room setups, group sizes and day/night conditions). As illustrated here for the bars setup, snippets (blue bars) are selected from the 2-h actigram videos during day and night from all six animals, which were divided into two groups (G1 and G2). <bold>(b)</bold> Tracking metrics were calculated for each individual snippet and pooled per group and per condition (day/night) as mean &#xb1; SD with each symbol representing an individual video snippet. Tracking metrics for the bars setup (N &#x3d; 3) were evaluated on a total of 39,748 frames giving 119,324 targets. IDs MT/PT: number of IDs that are Mostly Tracked (&#x3e;80%, green)/Partially Tracked (20%&#x2013;80%, orange). <bold>(c)</bold> Detailed tracking metrics per group and condition are merged to obtain overall tracking performance for our two room setups (bars and glass) and smaller group sizes of 2 animals. Overall tracking metrics are represented in mean &#xb1; SD with each symbol representing an individual video snippet. For glass (N &#x3d; 3), 26,225 frames were analyzed giving 78,679 targets. For bars (N &#x3d; 2), &#x223c;11,300 frames were analyzed per harness combination giving a total of 67,878 targets. Horizontal rows in the confusion matrices may sum to over 100% due to the occasional assignment of double IDs (one correct and an additional wrong ID).</p>
</caption>
<graphic xlink:href="ftox-08-1758963-g005.tif">
<alt-text content-type="machine-generated">Scientific figure consisting of three panels: a) line graphs displaying activity data for two groups with AI and Nano labels, annotated with animal icons and time markers; b) bar charts and count plots presenting tracking evaluation metrics including MOTA, IDF1, IDs, Missed, IDP, and IDR, separated by color; c) confusion matrices and bar charts summarizing model ID classification and tracking performance across different setups and lighting conditions with symbols indicating group types.</alt-text>
</graphic>
</fig>
<p>The similarity in the two methods was emphasized by a strong correlation between AI and accelerometer activity outcomes across all time spans: r &#x3d; 0.8825 for 1-min intervals which improved further to 0.9257 and 0.9650 for 15&#xa0;min and 2&#xa0;h respectively (<xref ref-type="fig" rid="F4">Figure 4b</xref>).</p>
<p>Along with detailed actigrams, it is useful to have a more general overview of the animals&#x2019; activity by classifying the time spent as either mobile (moving from one point in space to another) or immobile (remaining in the same location). In single-housed animals, our AI model achieved an excellent classification of both mobile (96.8%) and immobile (94.0%) occurrences (<xref ref-type="fig" rid="F4">Figure 4c</xref>, left panel). Most errors occurred in immobile classifications, due to imperfect bounding boxes or bounding boxes changing shape without animals actually moving, e.g., when tail wagging or heads moving while remaining in the same location. Equally, in group-housed animals (N &#x3d; 3), we reached a very good classification accuracy despite this challenging setting: 90.4% for immobile and 81.0% for mobile (<xref ref-type="fig" rid="F4">Figure 4c</xref>, left panel). The drop in performance was primarily due to imperfect bounding boxes when animals cross each other and/or at night.</p>
</sec>
<sec id="s3-4-2">
<label>3.4.2</label>
<title>Field validation of reID and tracking</title>
<p>While the impressive results described above already imply strong reID and tracking, we manually assessed their performance in detail on several 5-min snippets as described in the Methods section and illustrated for bars setup in <xref ref-type="fig" rid="F5">Figure 5a</xref>. For the latter during daytime, all tracking metrics were close to perfect, reflected in 32 of 33 total IDs being mostly tracked (&#x3e;80% of their lifespan). At night, there were more missed detections due to the challenges of recognizing animals in greyscale colors and their tendency to cluster together while sleeping. This is reflected in lower MOTA and IDR scores, and 16 out of 30 IDs being partially tracked. However, IDP scores remained high, indicating that when an animal was detected, the correct ID was assigned most of the time (<xref ref-type="fig" rid="F5">Figure 5b</xref>).</p>
<p>The small differences in reID performance between the two evaluated groups (N &#x3d; 3 each) are attributed to chance and individual behavior differences, e.g., whether they like being close to each other and their preference where to stand, sit or rest. Merging results for both groups in the bars setup across day and night yielded an excellent overall reID accuracy of 95.3%&#x2013;96.2% and average scores of 85.1% MOTA, 91.4% IDF1, 100% IDP, 85.3% IDR and 14.9% missed detections (<xref ref-type="fig" rid="F5">Figure 5c</xref>).</p>
<p>For the setup with glass fronts and group sizes of N &#x3d; 3, we achieved a similar outstanding reID accuracy ranging from 92.5% to 100% across day and night footage. Also tracking metrics were similar, with high IDF1 (87.0%) and IDP (100%) scores; and occasional missed detections (21.0%) which are reflected in the MOTA (79.0%) and IDR (79.9%) scores (<xref ref-type="fig" rid="F5">Figure 5c</xref>).</p>
<p>In the assessment of reID and tracking performance for three harness combinations with group sizes of N &#x3d; 2, all combinations achieved at least 94.8% reID accuracy across day and night. The R-N/Y-St combination proved to be superior with the fewest missed detections (5.1%) and highest scores across all metrics (94.9% MOTA, 97.2% IDF1, 100% IDP and 94.9% IDR) (<xref ref-type="fig" rid="F5">Figure 5c</xref>).</p>
</sec>
<sec id="s3-4-3">
<label>3.4.3</label>
<title>Field validation of pose classification</title>
<p>Our pose classifier achieved an excellent recognition of Standing (97.4%) and Standing Up (94.6%) when implemented in the pipeline, followed by Standing Down (86.1%). Lying and Sitting classes were the least performant with accuracies of 70.4% and 66.5% respectively. Lying was quite often confused with Standing, and Sitting was mixed up with Lying and Standing (<xref ref-type="fig" rid="F4">Figure 4c</xref>, right panel).</p>
</sec>
<sec id="s3-4-4">
<label>3.4.4</label>
<title>Field validation of behavioral classification</title>
<p>Our behavioral classifier demonstrated excellent frame-level performance when implemented in the pipeline (<xref ref-type="fig" rid="F6">Figure 6b</xref>), similar to the test set (<xref ref-type="fig" rid="F3">Figure 3e</xref>). For <italic>eating</italic>, we assessed four different conditions (<xref ref-type="fig" rid="F6">Figure 6d</xref>) where <italic>Eating hopper-glass</italic> consistently excelled across all fragments, followed by <italic>Eating Bowl-Bars</italic> (<xref ref-type="fig" rid="F6">Figure 6a</xref>). The <italic>Hopper-Bars</italic> and <italic>Bowl-Glass</italic> conditions showed more variation between different fragments. However, the lowest-performing video still achieved 73% frame-level accuracy in both conditions. Overall, <italic>eating</italic> reached an outstanding frame-level accuracy of 93% (<xref ref-type="fig" rid="F6">Figures 6a,b</xref>).</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Field validation of Behavioral Classifier. <bold>(a)</bold> Overview of the frame-level accuracies for eating and drinking classes. The orange and blue shaded areas correspond to the overall mean accuracies which were calculated by summation of the different eating and drinking conditions listed in the y-axis. <bold>(b)</bold> Confusion matrix with frame-level accuracies (in %) of our behavioral classifier when implemented in the pipeline. <bold>(c)</bold> Frame-by-frame overlay of model predictions (AI) and ground truth (GT) annotations for five representative fragments. Red shaded areas highlight mismatches. <bold>(d)</bold> Representative screenshots of the different eating and drinking conditions, taken from analyzed video fragments <bold>(e)</bold> Real-life implementation of our AI model pipeline on 4&#xa0;h food-access footage for two representative animals, with ground truth assessment in the top bar. To enable visualization, framewise AI predictions were binned per minute for each behavioral class. Class accuracies were calculated on event-level over ten animals.</p>
</caption>
<graphic xlink:href="ftox-08-1758963-g006.tif">
<alt-text content-type="machine-generated">Composite figure displaying evaluation of an AI system for recognizing eating and drinking behaviors in dogs. Panels include a scatter plot of accuracy by condition (a), confusion matrix (b), behavior prediction timelines (c), example images with bounding boxes identifying eating and drinking (d), and a timeline chart comparing behavior duration and ground truth annotations (e).</alt-text>
</graphic>
</fig>
<p>
<italic>Drinking</italic> was in general excellently detected for both glass and bars during day and night, with at least 84% accuracy and often reaching 100% (frame-level). Only two snippets showed low accuracy of 47% and 3% (<xref ref-type="fig" rid="F6">Figure 6a</xref>) due to the animal pausing frequently when drinking and an inadequately predicted bounding box that did not include the animal&#x2019;s head, respectively. Across all snippets, <italic>drinking</italic> showed an excellent frame-level accuracy of 90% (<xref ref-type="fig" rid="F6">Figures 6a&#x2013;d</xref>).</p>
<p>Finally, the background class <italic>none</italic> achieved a frame-level accuracy of 99% (<xref ref-type="fig" rid="F6">Figure 6b</xref>), with incorporation of the challenging snippets comparable to the two behaviors (<italic>not eating</italic> and <italic>not drinking</italic>), again indicating that our model generates a low number of false positives. Similar to our test set, the model never mixed-up <italic>eating</italic> and <italic>drinking</italic>; misclassifications were limited to confusion between the behavior and the background class (<italic>none</italic>). <xref ref-type="fig" rid="F6">Figure 6c</xref> also illustrates the near-perfect overlay of our AI predictions on the manual ground truth at frame level.</p>
<p>In real-life situations, accurate frame-by-frame predictions are less critical as long as overall events are detected. Analysis of continuous 4&#xa0;h food-access footage from ten animals with different eating patterns (2 representative examples shown in <xref ref-type="fig" rid="F6">Figure 6e</xref>) revealed high event-level accuracy of 96% for <italic>eating</italic> and 91% for <italic>drinking</italic>, with minimal missed or incorrect detections.</p>
</sec>
<sec id="s3-4-5">
<label>3.4.5</label>
<title>Field validation of ClinObs classification</title>
<p>We performed a validation study with two reference compounds (Acepromazine 2.5&#xa0;mg eq./kg P.O. and Transcutol&#xae; HP 1500&#xa0;mg/kg P.O.) and one control group (water) to validate our AI pipeline&#x2019;s ability to detect clinical signs in a realistic experimental setting (<xref ref-type="fig" rid="F7">Figure 7a</xref>). For this, we investigated whether our model could identify the same behavioral changes and ClinObs as noted during the standard <italic>in person</italic> observations.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Field validation of ClinObs Classifier. <bold>(a)</bold> Study Design: 3 groups consisting of 3 animals/group, dosed with either Vehicle P.O. (H<sub>2</sub>O, V), Acepromazine 2.5&#xa0;mg. eq/kg P.O. <bold>(a)</bold> or Transcutol&#xae; HP 1500&#xa0;mg/kg P.O. (T). <bold>(b)</bold> Evolution of 24&#xa0;h activity (mean &#xb1; SD) across the baseline and dosing days for the different treatment groups. Activity is represented as percentage normalized to the average baseline activity of the corresponding animal. Statistical analysis included an ANOVA on the percent difference relative to the baseline mean, followed by t-tests comparing each treatment with the vehicle, using a Dunnett correction for multiple comparisons. <bold>(c)</bold> Schematic representation of the <italic>in person</italic> observations and AI model detections across the different time periods post-dosing for the Transcutol&#xae; HP group. A representative, framewise AI model output is shown for ataxia in one animal on a baseline day and the dosing day. Significant and/or aberrant AI signals that were manually checked are marked with a green (true positive, TP) or red (false positive, FP) asterisk. <bold>(d)</bold> Schematic representation of the <italic>in person</italic> observations and AI model detections across the different time periods post-dosing for the Acepromazine group. A representative, framewise AI model output is shown for IVM (involuntary muscle movements) in one animal on a baseline day and the dosing day. Significant and/or aberrant AI signals that were manually checked are marked with a green (true positive, TP) or red (false positive, FP) asterisk. <bold>(e)</bold> Remaining ataxia signals (per minute) on the dosing day after applying an adaptive baseline thresholding strategy to eliminate the baseline signals. <bold>(f)</bold> Remaining IVM signals (per minute) on the dosing day after applying an adaptive baseline thresholding strategy to eliminate the baseline signals. 
<bold>(g)</bold> Manually assessed event-level precision of significant/aberrant AI detections.</p>
</caption>
<graphic xlink:href="ftox-08-1758963-g007.tif">
<alt-text content-type="machine-generated">Multicomponent scientific figure summarizing an AI-based study for automated behavioral tracking in dogs. Panel a outlines the study design and model outputs. Panel b presents a line graph comparing activity levels across treatment groups and baseline days, highlighting significant differences. Panels c and d display timelines and event tracking data for Transcutol HP and Acepromazine treatments, showing ataxia, limping, and involuntary movements, with color-coded bars, vertical dosing time markers, and line plots distinguishing true-positive and false-positive signals. Panels e and f present binned activity signal graphs. Panel g bar chart evaluates manual validation precision for model-predicted behaviors.</alt-text>
</graphic>
</fig>
<p>Transcutol&#xae; HP is a commercially available vehicle that induces ataxia and IVM (in canines), when used in its pure form (internal data). Our model accurately detected ataxia in all animals within the same period as the <italic>in person</italic> observations. In one animal, limping was also occasionally assigned together with ataxia (<xref ref-type="fig" rid="F7">Figure 7c</xref>). When implementing our adaptive baseline thresholding strategy to filter out baseline noise and highlight significant changes on the dosing day, clear <italic>Ataxia</italic> signals remain within the expected time window (<xref ref-type="fig" rid="F7">Figure 7e</xref>). In addition, our model detected <italic>IVM</italic> in the same two animals as the <italic>in person</italic> observations with a slight difference in IVM duration (<xref ref-type="fig" rid="F7">Figure 7c</xref>).</p>
<p>Acepromazine is a known sedative that induces unstable gait (ataxia-like) and IVM (in canines)(<xref ref-type="bibr" rid="B36">VCA-Animal-Hospitals, 2023</xref>). Our AI activity tracking successfully detected the decreased activity observed during <italic>in person</italic> assessments in these animals compared to baseline (<xref ref-type="fig" rid="F7">Figure 7b</xref>). Despite the sedative effect with only short periods of movement, our model successfully identified the ataxia-like gait as either <italic>Ataxia</italic> or <italic>Limping</italic> in all animals, during the majority of the expected time frame (<xref ref-type="fig" rid="F7">Figure 7d</xref>). As they are both phenotypes of abnormal gait, AI outputs for <italic>Ataxia</italic> and <italic>Limping</italic> signals were merged together for the manual analysis to boost the model&#x2019;s performance and enable better differentiation compared to the baseline noise. When plotted together with the animals&#x2019; activity, true positive signals (high abnormal gait-to-low activity) could be clearly distinguished from the baseline noise (low abnormal gait-to-high activity) (<xref ref-type="sec" rid="s12">Supplementary Figure S3</xref>). Furthermore, our model successfully detected the IVM in all animals within the expected time window with the exception of one animal where the AI detections were shorter in duration compared to <italic>in person</italic> observations (<xref ref-type="fig" rid="F7">Figure 7d</xref>). Upon implementing our adaptive baseline thresholding strategy, clear IVM signals remain, although fewer in number as (also correct) baseline events are filtered out (<xref ref-type="fig" rid="F7">Figure 7f</xref>).</p>
<p>A control group was included to investigate whether this mock-treatment would result in observable differences in AI signals after dosing compared to baseline. While our model correctly detected IVM and ataxia-like signals in some animals on the treatment day, the AI signal patterns were highly similar compared to baseline (<xref ref-type="sec" rid="s12">Supplementary Figure S2A,B</xref>). Similarly, applying our thresholding strategy resulted in almost no notable signals (<xref ref-type="sec" rid="s12">Supplementary Figure S2C,D</xref>), again showing that our model can identify actual treatment-related observations.</p>
<p>As the final part of the field validation, we performed a manual video analysis for a large number of substantial AI signals and/or patterns to evaluate the precision of our model detections on event-level (<xref ref-type="fig" rid="F7">Figure 7g</xref>), as opposed to the framewise precision on the test set (<xref ref-type="fig" rid="F3">Figure 3f</xref>). As mentioned above for the Acepromazine-dosed group, we pooled ataxia and limping detections for this analysis as an <italic>abnormal gait</italic> phenotype. For almost all model detections in the study, we showed strong to excellent precision: circling (70%), IVM (70%), ataxia-limping (72%) and head shaking (95%). While convulsion detections showed 0% precision, it is important to note that in many detections, the animal was rubbing (35%) or showing ataxia (6%), which can be highly comparable to an actual convulsion.</p>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>To enable continuous monitoring of activity, behavior and abnormalities (ClinObs) during non-clinical safety studies in canines, we developed and implemented a novel video-based AI model pipeline composed of multiple interconnected ANNs. This pipeline integrates a detector, an identification module and a tracker module linking detections from the same animal across different frames together in a single track that is fed into dedicated pose, behavior and ClinObs classifiers (<xref ref-type="fig" rid="F1">Figure 1</xref>). The identification module plays a critical role in enabling single- and group-housed conditions, according to animal welfare standards during continuous monitoring.</p>
<p>With our unique approach, models receive only image/video data as input, no other sensors are required, as they are completely pixel appearance-based, in contrast to the commonly used skeleton/keypoint based models. This approach enables recognition of various behaviors and visually subtle ClinObs that would not be discernible when relying on keypoint data alone. Furthermore, the approach offers high flexibility, supporting training for diverse observations and could accommodate a variety of camera configurations, including non-top-down viewpoints involving occlusions.</p>
<p>Given that appearance-based models require extensive data to achieve robustness&#x2013;particularly for detecting subtle behavioral features&#x2013;we optimized our labeling process by employing a single-dot annotation strategy that allowed efficient labeling of over 1.8 million frames.</p>
<p>A key aspect in applying computer vision techniques in practice is translating performance metrics, obtained from initial test sets using short minute-length video snippets, into real-world scenarios involving hours or even days of footage. The initial test set often represents a limited part of the data set (in our case &#x223c;20&#x2013;25%) that can never fully represent the variation present in real-life situations. Therefore, our work included an extensive field validation of our ANNs on hours of footage, corroborating the results observed on our test sets.</p>
<p>Our first key achievement is the successful implementation of multi-animal identification and tracking over multiple often-occluded camera feeds, made robust by utilizing colored harnesses with distinct reflective patterns. This approach enabled long-term individual animal tracking within groups, enabling downstream model outputs (poses, activity, behaviors and ClinObs) to be attributed to specific animals without compromising welfare-related social interactions or spacing requirements. We have implemented the harnesses in several studies up to 4 weeks without any irritation or impact on the skin or fur of the animals (internal data). To achieve performant tracking, we implemented the Jonker-Volgenant algorithm on weighted scores of the detector, ID classifier and IoU. When comparing the same group size (N &#x3d; 3) and setup (bars), tracking metrics were nearly identical between test set and field validation (<xref ref-type="fig" rid="F3">Figures 3c</xref>, <xref ref-type="fig" rid="F5">5</xref>). This outstanding performance was reflected in the excellent correlation between our AI model and the accelerometer (<xref ref-type="fig" rid="F4">Figure 4</xref>) - even in the most challenging situations of group-housed animals and at night. Moreover, our AI activity tracking resulted in a more objective representation of the actual &#x2018;distance moved&#x2019;. As missed detections for longer periods of time (exceeding some frames) only occurred when animals were immobile and at night (accelerometer values close to or equal to zero, typically when sleeping), they did not interfere with accurate activity tracking. Their downstream impact is also less of a concern since most behaviors and ClinObs occur in mobile animals or include at least some movement. However, clinical signs that occur in truly sedentary animals (e.g., IVM) are at risk of being missed when they occur at night.</p>
<p>Our pose classifier performed similarly for Standing and Standing Up across the test and field validation set. For Lying and Sitting the performance dropped during the field validation while for Standing Down the performance increased (<xref ref-type="fig" rid="F3">Figures 3d</xref>, <xref ref-type="fig" rid="F4">4c</xref>). These differences in accuracy are likely due to insufficiently varied data in the test set&#x2013;again highlighting the value of field validation. Like the tracking module, the behavioral classifier performed equally well on the test set vs. field validation: drinking 93.1% vs. 90.0%, eating 89.6% vs. 93.9% and none 95.4% vs. 99%, respectively. Event-level accuracies remained similar to those on frame-level, due to a small number of misclassified behaviors.</p>
<p>The disparity between top-1 (79.0%) and class (47.9%) accuracy of our ClinObs model underscores the class imbalance inherent in the dataset, reflecting that performance varied considerably across different observation types; linked to their representation in the training data. Observations like IVM also proved inherently difficult to classify as they are characterized by subtle visual cues. Furthermore, the prevalence of the background class <italic>none</italic> resulted in a higher precision relative to recall. While model predictions tend to be correct, the model exhibits caution in recognizing all ClinObs due to its preference for predicting <italic>none</italic>, which consequently results in a higher risk for false negatives. The occasional confusion between similar ClinObs classes is a potential source of error that, while not ideal, can be mitigated by the presence of a human reviewer. Crucially, the model&#x2019;s robust detection of key signs such as circling (86.6%) underscores its potential to assist in non-clinical assessments.</p>
<p>As a final part, we validated our AI pipeline for the detection of clinical signs by using reference compounds for ataxia and IVM. While on the test set, the performance for these classes was suboptimal ranging from 30.1% (IVM) to 35.2% (limping) and 40.4% (ataxia); our AI model succeeded in detecting the <italic>in person</italic> observed ataxia and IVM in all animals within the expected time frame. In contrast to the clear Transcutol&#xae; HP-induced ataxia, Acepromazine-treated animals showed more of an ataxia-like, unstable gait phenotype. This explained why our model more often confused ataxia with limping in these animals, while this rarely happened in the Transcutol&#xae; HP group. For the analysis, <italic>Ataxia</italic> and <italic>Limping</italic> signals were therefore merged. While our model&#x2019;s inability to distinguish between these two classes in certain phenotypes is a definite limitation, there will always be a need for a human reviewer (at least for the foreseeable future) who can mitigate and assign the correct classification.</p>
<p>Overall, we demonstrated a consistent, high performance of our AI pipeline in real-world scenarios, both on our fully annotated test sets and in extensive field studies. Our activity and behavioral tracking modules are highly reliable, and even for subtle ClinObs, we showed very promising results and were able to detect clinically relevant observations amongst the model-generated background.</p>
<p>This can significantly improve not only drug safety evaluation, but also the refinement of these studies by continuous animal welfare and health monitoring. It aligns perfectly with the FDA Modernization Act 2.0 that focuses on the use of new approach methodologies (NAMs) and implementation of AI (<xref ref-type="bibr" rid="B42">Zushin et al., 2023</xref>) where the aimed reduction of animal numbers emphasizes the importance to extract the most possible information out of every performed <italic>in vivo</italic> study. We believe that AI-video monitoring should become a standard practice in the coming years to map out detailed <italic>in vitro-in vivo</italic> correlations. To our knowledge, we are the first to develop such a robust and integrated AI pipeline for canines aimed at preclinical research purposes. As our proposed AI pipeline is designed to be generic, it should allow for application for diverse observations across camera setups and species after sufficient retraining. In addition, our identification strategy using a clear visual identifier can be easily transferred to a number of non-rodent species, such as minipigs or non-human primates.</p>
<p>In our future research, we aim to enhance pose classification and validate it in group-housed conditions; and further develop our behavior and ClinObs classifier model. This includes augmenting underperforming classes with more data and introducing new classes. Since this process will take time, we plan to boost the performance of our model in the meantime by visually combining outputs of several modules. For example, a <italic>Standing</italic> pose that persists for an extended period, combined with low activity and high immobile time, could indicate neurological symptoms such as abnormal posture or catalepsy. Similarly, AI model output can be combined across similar classes, as illustrated for <italic>Ataxia</italic> and <italic>Limping</italic> in Acepromazine-treated animals (<xref ref-type="sec" rid="s12">Supplementary Figure S3</xref>). When adding additional information on the animals&#x2019; activity, signals can be identified that warrant further manual investigation, such as high <italic>Abnormal gait</italic> detections compared to low animal movement. Additionally, we plan to explore anomaly detection techniques to improve detection of rare and novel, unseen ClinObs, as well as few-shot techniques to cover the entire long-tailed training data distribution range.</p>
<p>In conclusion, our proposed pixel appearance-based computer vision AI approach holds tremendous potential to significantly impact the field of preclinical and behavioral research. It can provide a complete and detailed mapping of individual animal behavior which enables the investigation of behavioral patterns and premonitory signs, and a true quantification of spontaneous occurrences compared to on-study events. This significant improvement in observational accuracy will enhance our understanding of drug safety and as a NAM, contribute to the refinement of preclinical <italic>in vivo</italic> studies through the application of the 3Rs principle (reducing, refining, and replacing the use of animals). Moreover, it underscores a commitment to animal welfare and health monitoring, ensuring a positive contribution to ethical research practices.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The datasets presented in this article are not readily available because they are proprietary to Johnson and Johnson. These can be obtained from the corresponding author on reasonable request and upon permission of Johnson and Johnson. Requests to access the datasets should be directed to ikopljar@its.jnj.com.</p>
</sec>
<sec sec-type="ethics-statement" id="s6">
<title>Ethics statement</title>
<p>The animal studies were approved by Ethics committee on Animal Experiments of the research center of Johnson and Johnson (Beerse, Belgium). The studies were conducted in accordance with the local legislation and institutional requirements.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>EE: Investigation, Writing &#x2013; review and editing, Writing &#x2013; original draft, Formal Analysis, Data curation, Validation, Methodology, Visualization. JP: Validation, Visualization, Formal Analysis, Investigation, Writing &#x2013; review and editing, Methodology, Software, Writing &#x2013; original draft. TO: Methodology, Writing &#x2013; review and editing, Investigation, Formal Analysis, Software, Writing &#x2013; original draft, Validation, Visualization. FD: Writing &#x2013; original draft, Formal Analysis, Software, Methodology, Investigation, Writing &#x2013; review and editing, Validation. SD: Resources, Formal Analysis, Data curation, Conceptualization, Methodology, Writing &#x2013; review and editing. GT: Writing &#x2013; review and editing, Methodology, Conceptualization. MV: Writing &#x2013; review and editing, Methodology, Conceptualization. BF: Methodology, Conceptualization, Writing &#x2013; review and editing. TG: Conceptualization, Supervision, Writing &#x2013; review and editing, Investigation, Methodology, Resources. IK: Funding acquisition, Writing &#x2013; review and editing, Resources, Project administration, Supervision, Methodology, Conceptualization.</p>
</sec>
<ack>
<title>Acknowledgements</title>
<p>The authors wish to thank all the researchers for the annotation of the data sets.</p>
</ack>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>Authors EE, SD, GT, MV, BF and IK are employed by J&#x26;J Innovative Medicine, Janssen Research &#x26; Development.</p>
<p>The remaining author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s12">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/ftox.2026.1758963/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/ftox.2026.1758963/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S1</label>
<caption>
<p>Comparative analysis supporting Detector ANN and harness selection. <bold>(A)</bold> Test set performance (F1) of all object detectors we compared and their IoU with the manually annotated bounding boxes. YOLOv2 was selected to maximize F1 and maintain satisfactory IoU. <bold>(B)</bold> Confusion matrix of our 6-ID model, based on which Black-Dots and Grey-Fluo were discarded.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S2</label>
<caption>
<p>AI Model output for vehicle animals in Field validation study. <bold>(A)</bold> A representative, framewise AI model output for ataxia in one vehicle-treated animal on a baseline day and the dosing day. Significant and/or aberrant AI signals that were manually checked are marked with a green (true positive, TP) or red (false positive, FP) asterisk. <bold>(B)</bold> A representative, framewise AI model output for IVM (involuntary muscle movement) in the same vehicle animal on the same days. Significant and/or aberrant AI signals that were manually checked are marked with a green (true positive, TP) or red (false positive, FP) asterisk. <bold>(C)</bold> Remaining ataxia signals (per minute) on the dosing day after applying an adaptive baseline thresholding strategy to eliminate the baseline signals. <bold>(D)</bold> Remaining IVM signals (per minute) on the dosing day after applying an adaptive baseline thresholding strategy to eliminate the baseline signals.</p>
</caption>
</supplementary-material>
<supplementary-material>
<label>SUPPLEMENTARY FIGURE S3</label>
<caption>
<p>Merging ClinObs classes to improve model performance. <bold>(A)</bold> A representative, framewise AI model output for ataxia in one Acepromazine-treated animal on a baseline day and the dosing day. <bold>(B)</bold> A representative, framewise AI model output for limping in the same Acepromazine-treated animal on the same days. <bold>(C)</bold> Framewise AI model output when merging ataxia and limping detections. Significant and/or aberrant AI signals that were manually checked are marked with a green (true positive, TP) or red (false positive, FP) asterisk.</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Image3.tif" id="SM1" mimetype="image/tiff" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image2.tif" id="SM2" mimetype="image/tiff" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image1.tif" id="SM3" mimetype="image/tiff" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Acatay</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Sommer</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schumann</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Beyerer</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Comprehensive evaluation of deep learning based detection methods for vehicle detection in aerial imagery</article-title>,&#x201d; in <conf-name>2018 15th IEEE International Conference on Advanced Video and Signal Based Surveillance (AVSS): IEEE</conf-name>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Arnab</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Dehghani</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Heigold</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Lu&#x10d;i&#x107;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Schmid</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Vivit: a video vision transformer</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, <fpage>6836</fpage>&#x2013;<lpage>6846</lpage>.</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Authier</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Arezzo</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Pouliot</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Accardi</surname>
<given-names>M. V.</given-names>
</name>
<name>
<surname>Boulay</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Troncy</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>EEG: characteristics of drug-induced seizures in rats, dogs and non-human primates</article-title>. <source>J. Pharmacol. Toxicol. Methods</source> <volume>97</volume>, <fpage>52</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1016/j.vascn.2019.03.004</pub-id>
<pub-id pub-id-type="pmid">30922951</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bala</surname>
<given-names>P. C.</given-names>
</name>
<name>
<surname>Eisenreich</surname>
<given-names>B. R.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>S. B. M.</given-names>
</name>
<name>
<surname>Hayden</surname>
<given-names>B. Y.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>H. S.</given-names>
</name>
<name>
<surname>Zimmermann</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Automated markerless pose estimation in freely moving macaques with OpenMonkeyStudio</article-title>. <source>Nat. Commun.</source> <volume>11</volume> (<issue>1</issue>), <fpage>4560</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-020-18441-5</pub-id>
<pub-id pub-id-type="pmid">32917899</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Berman</surname>
<given-names>G. J.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>D. M.</given-names>
</name>
<name>
<surname>Bialek</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Shaevitz</surname>
<given-names>J. W.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Mapping the stereotyped behaviour of freely moving fruit flies</article-title>. <source>J. R. Soc. Interface</source> <volume>11</volume> (<issue>99</issue>). <pub-id pub-id-type="doi">10.1098/rsif.2014.0672</pub-id>
<pub-id pub-id-type="pmid">25142523</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Berridge</surname>
<given-names>B. R.</given-names>
</name>
<name>
<surname>Baran</surname>
<given-names>S. W.</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Bratcher-Petersen</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Ellis</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>C.-N.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Digitalization of toxicology: improving preclinical to clinical translation</article-title>. <source>Front. Toxicol.</source> <volume>6</volume>, <fpage>6</fpage>&#x2013;<lpage>2024</lpage>. <pub-id pub-id-type="doi">10.3389/ftox.2024.1377542</pub-id>
<pub-id pub-id-type="pmid">38605940</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bochkovskiy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>H.-Y. M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Yolov4: optimal speed and accuracy of object detection</article-title>. <comment>arXiv preprint arXiv:2004.10934</comment>.</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bohnslav</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Wimalasena</surname>
<given-names>N. K.</given-names>
</name>
<name>
<surname>Clausing</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>Y. Y.</given-names>
</name>
<name>
<surname>Yarmolinsky</surname>
<given-names>D. A.</given-names>
</name>
<name>
<surname>Cruz</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>DeepEthogram, a machine learning pipeline for supervised behavior classification from raw pixels</article-title>. <source>Elife</source> <volume>10</volume>, <fpage>e63377</fpage>. <pub-id pub-id-type="doi">10.7554/eLife.63377</pub-id>
<pub-id pub-id-type="pmid">34473051</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bolya</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Foley</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hays</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hoffman</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Tide: a general toolbox for identifying object detection errors</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision</conf-name>, <fpage>558</fpage>&#x2013;<lpage>573</lpage>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Coker</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Berumen</surname>
<given-names>M. L.</given-names>
</name>
<name>
<surname>Costelloe</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Beery</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). &#x201c;<article-title>Mammalnet: a large-scale video benchmark for mammal recognition and behavior understanding</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, <fpage>13052</fpage>&#x2013;<lpage>13061</lpage>.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cools</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Kopljar</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Fetene</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Borghys</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Use of the actiwatch-mini&#xae; in dog safety studies as an early indicator for drug-induced behavioural changes</article-title>. <source>J. Pharmacol. Toxicol. Methods</source> <volume>104</volume>, <fpage>106896</fpage>. <pub-id pub-id-type="doi">10.1016/j.vascn.2020.106896</pub-id>
<pub-id pub-id-type="pmid">32622755</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Crouse</surname>
<given-names>D. F.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>On implementing 2D rectangular assignment algorithms</article-title>. <source>IEEE Trans. Aerosp. Electron. Syst.</source> <volume>52</volume> (<issue>4</issue>), <fpage>1679</fpage>&#x2013;<lpage>1696</lpage>. <pub-id pub-id-type="doi">10.1109/TAES.2016.140952</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Cubuk</surname>
<given-names>E. D.</given-names>
</name>
<name>
<surname>Zoph</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Shlens</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q. V.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Randaugment: practical automated data augmentation with a reduced search space</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops</conf-name>, <fpage>702</fpage>&#x2013;<lpage>703</lpage>.</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dosovitskiy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Beyer</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Kolesnikov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Weissenborn</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Zhai</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Unterthiner</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>An image is worth 16x16 words: transformers for image recognition at scale</article-title>. <comment>arXiv preprint arXiv:2010.11929</comment>.</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Eberhardt</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Tekle</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Teuns</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Witters</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Feyen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>De Landtsheer</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Application of video surveillance in preclinical safety studies in canines: understanding the interobserver reliability and validity to recognize clinical behavior</article-title>. <source>PLOS ONE</source> <volume>20</volume> (<issue>6</issue>), <fpage>e0326916</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0326916</pub-id>
<pub-id pub-id-type="pmid">40577280</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Everingham</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Van Gool</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>C. K. I.</given-names>
</name>
<name>
<surname>Winn</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zisserman</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>The pascal visual object classes (VOC) challenge</article-title>. <source>Int. J. Comput. Vis.</source> <volume>88</volume> (<issue>2</issue>), <fpage>303</fpage>&#x2013;<lpage>338</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-009-0275-4</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Franzoni</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Biondi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Milani</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Advanced techniques for automated emotion recognition in dogs from video data through deep learning</article-title>. <source>Neural Comput. Appl.</source> <volume>36</volume>, <fpage>17669</fpage>&#x2013;<lpage>17688</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-024-10042-3</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Menegon</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Scollo</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Norton</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Occlusion-robust detection of sow-induced piglet crushing incidents using spatial and motion reasoning</article-title>. <source>Comput. Electron. Agric.</source> <volume>231</volume>, <fpage>109961</fpage>. <pub-id pub-id-type="doi">10.1016/j.compag.2025.109961</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). &#x201c;<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <fpage>770</fpage>&#x2013;<lpage>778</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.90</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Heindl</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Motmetrics: a python library for multi-object tracking metrics</article-title>. <source>GitHub</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://github.com/cheind/py-motmetrics">https://github.com/cheind/py-motmetrics</ext-link> (Accessed December 1, 2025).</comment>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Isik</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Unal</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Open-source software for automated rodent behavioral analysis</article-title>. <source>Front. Neurosci.</source> <volume>17</volume>, <fpage>17</fpage>&#x2013;<lpage>2023</lpage>. <pub-id pub-id-type="doi">10.3389/fnins.2023.1149027</pub-id>
<pub-id pub-id-type="pmid">37139530</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jonker</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Volgenant</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>1987</year>). <article-title>A shortest augmenting path algorithm for dense and sparse linear assignment problems</article-title>. <source>Computing</source> <volume>38</volume> (<issue>4</issue>), <fpage>325</fpage>&#x2013;<lpage>340</lpage>. <pub-id pub-id-type="doi">10.1007/bf02278710</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kahnau</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Mieske</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wilzopolski</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kalliokoski</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Mandillo</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>H&#xf6;lter</surname>
<given-names>S. M.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>A systematic review of the development and application of home cage monitoring in laboratory mice and rats</article-title>. <source>BMC Biol.</source> <volume>21</volume> (<issue>1</issue>), <fpage>256</fpage>. <pub-id pub-id-type="doi">10.1186/s12915-023-01751-7</pub-id>
<pub-id pub-id-type="pmid">37953247</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kalman</surname>
<given-names>R. E.</given-names>
</name>
</person-group> (<year>1960</year>). <article-title>A new approach to linear filtering and prediction problems</article-title>. <source>Trans. ASME- J. Basic Eng.</source> <volume>82</volume> (<issue>Series D</issue>), <fpage>35</fpage>&#x2013;<lpage>45</lpage>. <pub-id pub-id-type="doi">10.1115/1.3662552</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lauer</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Menegas</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Nath</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Multi-animal pose estimation, identification and tracking with DeepLabCut</article-title>. <source>Nat. Methods</source> <volume>19</volume> (<issue>4</issue>), <fpage>496</fpage>&#x2013;<lpage>504</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-022-01443-0</pub-id>
<pub-id pub-id-type="pmid">35414125</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ophoff</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Van Beeck</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Goedem&#xe9;</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Improving object detection in VHR aerial orthomosaics</article-title>,&#x201d; in <conf-name>European Conference on Computer Vision</conf-name> (<publisher-name>Springer</publisher-name>), <fpage>268</fpage>&#x2013;<lpage>282</lpage>.</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Orciani</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ballesteros</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Troncy</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Berthome</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bujold</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Bennamoune</surname>
<given-names>N.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>The spontaneous incidence of neurological clinical signs in preclinical species using cage-side observations or high-definition video monitoring: a retrospective analysis</article-title>. <source>Int. J. Toxicol.</source> <volume>43</volume> (<issue>2</issue>), <fpage>123</fpage>&#x2013;<lpage>133</lpage>. <pub-id pub-id-type="doi">10.1177/10915818231218984</pub-id>
<pub-id pub-id-type="pmid">38063479</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pereira</surname>
<given-names>T. D.</given-names>
</name>
<name>
<surname>Tabris</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Matsliah</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Turner</surname>
<given-names>D. M.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ravindranath</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>SLEAP: a deep learning system for multi-animal pose tracking</article-title>. <source>Nat. Methods</source> <volume>19</volume> (<issue>4</issue>), <fpage>486</fpage>&#x2013;<lpage>495</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-022-01426-1</pub-id>
<pub-id pub-id-type="pmid">35379947</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>YOLO9000: better, faster, stronger</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>. <pub-id pub-id-type="doi">10.48550/arXiv.1612.08242</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Redmon</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Farhadi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Yolov3: an incremental improvement</article-title>. <comment>arXiv preprint arXiv:1804.02767</comment>.</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ristani</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Solera</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Cucchiara</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Tomasi</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Performance measures and a data set for multi-target, multi-camera tracking</article-title>. <source>Comput. Vis. &#x2013; ECCV 2016 Work.</source>, <fpage>17</fpage>&#x2013;<lpage>35</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1609.01775</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="web">
<collab>SciPy</collab> (<year>2008</year>). <article-title>Scipy.optimize: linear_sum_assignment</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html">https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.linear_sum_assignment.html</ext-link> (Accessed December 1, 2025).</comment>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sillito</surname>
<given-names>R. R.</given-names>
</name>
<name>
<surname>Sutherland</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Milne</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Giuliano</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Sigfridsson</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Rolf</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2025</year>). <article-title>Rodent home cage monitoring for preclinical safety pharmacology assessment: results of a multi-company validation evaluating nonclinical and clinical data from three compounds</article-title>. <source>Front. Toxicol.</source> <volume>7</volume>, <elocation-id>1655330</elocation-id>. <pub-id pub-id-type="doi">10.3389/ftox.2025.1655330</pub-id>
<pub-id pub-id-type="pmid">41267921</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tong</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Videomae: masked autoencoders are data-efficient learners for self-supervised video pre-training</article-title>. <source>Adv. Neural Information Processing Systems</source> <volume>35</volume>, <fpage>10078</fpage>&#x2013;<lpage>10093</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2203.12602</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Van Etten</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>You only look twice: rapid multi-scale object detection in satellite imagery</article-title>. <source>arXiv Preprint arXiv:1805.09512</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1805.09512</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="web">
<collab>VCA Animal Hospitals</collab> (<year>2023</year>). <article-title>Acepromazine</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://vcahospitals.com/know-your-pet/acepromazine">https://vcahospitals.com/know-your-pet/acepromazine</ext-link> (Accessed December 1, 2025).</comment>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wiltschko</surname>
<given-names>A. B.</given-names>
</name>
<name>
<surname>Johnson</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Iurilli</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Peterson</surname>
<given-names>R. E.</given-names>
</name>
<name>
<surname>Katon</surname>
<given-names>J. M.</given-names>
</name>
<name>
<surname>Pashkovski</surname>
<given-names>S. L.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>Mapping sub-second structure in mouse behavior</article-title>. <source>Neuron</source> <volume>88</volume> (<issue>6</issue>), <fpage>1121</fpage>&#x2013;<lpage>1135</lpage>. <pub-id pub-id-type="doi">10.1016/j.neuron.2015.11.031</pub-id>
<pub-id pub-id-type="pmid">26687221</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Using a CNN-LSTM for basic behaviors detection of a single dairy cow in a complex environment</article-title>. <source>Comput. Electron. Agric.</source> <volume>182</volume>, <fpage>106016</fpage>. <pub-id pub-id-type="doi">10.1016/j.compag.2021.106016</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yun</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Oh</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Chun</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Choe</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Cutmix: regularization strategy to train strong classifiers with localizable features</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>, <fpage>6023</fpage>&#x2013;<lpage>6032</lpage>.</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Cisse</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dauphin</surname>
<given-names>Y. N.</given-names>
</name>
<name>
<surname>Lopez-Paz</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Mixup: beyond empirical risk minimization</article-title>. <source>arXiv Preprint arXiv:1710.09412</source>. <pub-id pub-id-type="doi">10.48550/arXiv.1710.09412</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhong</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>del-Blanco</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Berj&#xf3;n</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jaureguizar</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Garc&#xed;a</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>AnimalMotionCLIP: embedding motion in CLIP for animal behavior analysis</article-title>. <source>arXiv Preprint arXiv:2505.00569</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2505.00569</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zushin</surname>
<given-names>P.-J. H.</given-names>
</name>
<name>
<surname>Mukherjee</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J. C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>FDA modernization act 2.0: transitioning beyond animal models with human cells, organoids, and AI/ML-based approaches</article-title>. <source>J. Clin. Investigation</source> <volume>133</volume> (<issue>21</issue>), <fpage>e175824</fpage>. <pub-id pub-id-type="doi">10.1172/JCI175824</pub-id>
<pub-id pub-id-type="pmid">37909337</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1920020/overview">Chang-Ning Liu</ext-link>, Pfizer, United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3340833/overview">Devon Martin</ext-link>, North Carolina State University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3343373/overview">Sierra Boyd</ext-link>, Genentech Inc., United States</p>
</fn>
</fn-group>
</back>
</article>