<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="en">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2026.1737761</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Hybrid deep feature integration model for robust deepfake detection using transfer-learned neural networks</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Potluri</surname>
<given-names>Sirisha</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3264070"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Kandagatla</surname>
<given-names>Srikar Prabhas</given-names>
</name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Mohanty</surname>
<given-names>Sachi Nandan</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1273148"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Rout</surname>
<given-names>Kailash Chandra</given-names>
</name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Israr</surname>
<given-names>Mohammad</given-names>
</name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2790144"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gupta</surname>
<given-names>V. Mnssvkr</given-names>
</name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Computer Science and Engineering, Koneru Lakshmaiah Education Foundation, Bowrampet</institution>, <city>Hyderabad</city>, <state>Telangana</state>, <country country="in">India</country></aff>
<aff id="aff2"><label>2</label><institution>Manning School of Information and Computer Sciences, University of Massachusetts Amherst</institution>, <city>Amherst</city>, <state>MA</state>, <country country="us">United States</country></aff>
<aff id="aff3"><label>3</label><institution>School of Computer Science &#x0026; Engineering (SCOPE), VIT-AP University</institution>, <city>Amaravati</city>, <state>Andhra Pradesh</state>, <country country="in">India</country></aff>
<aff id="aff4"><label>4</label><institution>Capital Engineering College</institution>, <city>Bhubaneswar</city>, <state>Odisha</state>, <country country="in">India</country></aff>
<aff id="aff5"><label>5</label><institution>Maryam Abacha American University of Nigeria</institution>, <city>Kano</city>, <state>Kano State</state>, <country country="ng">Nigeria</country></aff>
<aff id="aff6"><label>6</label><institution>Department of Computer Science and Engineering, SRKR Engineering College</institution>, <city>Bhimavaram</city>, <state>Andhra Pradesh</state>, <country country="in">India</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Sirisha Potluri, <email xlink:href="mailto:sirisha.vegunta@gmail.com">sirisha.vegunta@gmail.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-25">
<day>25</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>9</volume>
<elocation-id>1737761</elocation-id>
<history>
<date date-type="received">
<day>04</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>03</day>
<month>02</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Potluri, Kandagatla, Mohanty, Rout, Israr and Gupta.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Potluri, Kandagatla, Mohanty, Rout, Israr and Gupta</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-25">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>With the rapid evolution and development of artificial intelligence and intelligent learning, the creation of realistic deepfake multimedia content has become accessible and is raising substantial requirements for digital security and media authenticity.</p>
</sec>
<sec>
<title>Methods</title>
<p>While prevailing methods rely profoundly on deep learning and transformer driven practices, their computational cost, resource usage and sensitivity towards dataset bias prevent real-world usage and deployment. This work studies several practices for perceiving deepfake content in images and videos, analyzing state-of-the-art techniques (Convolutional Neural Network, Xception, and ResNet50), and proposes a hybrid approach (DAAL-NET) with lightweight, Bi-stream artifact-resistant deepfake content detection capabilities to simultaneously learn spatial patterns, cues, and temporal motion inconsistencies. The framework combines three significant novelties: (1) a Local Forensics Encoder with Learnable Frequency Attention mechanism to analyze high-frequency manipulation; (2) a Motion Irregularity Encoder with depthwise temporal convolutions and gated recurrent units to obtain frame-level motion gaps; and (3) a Multi-Stream Interaction Module for bidirectional spatial temporal fusion using cross-attention. A scientifically trained Artifact Confidence Calibration Layer is proposed to improve probability calibration and reliability.</p>
</sec>
<sec>
<title>Results and discussion</title>
<p>Experiments conducted on the Celeb-DF (v2) and Kaggle datasets demonstrate that the proposed hybrid approach enhances macro-F1, calibration error, and temporal robustness compared to baseline models. The proposed model obtains a competitive outcome under constrained computational resources, making it appropriate for forensic applications, real-world media authentication systems, low-power deployments, and scalable deepfake screening pipelines.</p>
</sec>
</abstract>
<kwd-group>
<kwd>bi-stream neural networks</kwd>
<kwd>DAAL-NET</kwd>
<kwd>deepfake detection</kwd>
<kwd>learnable frequency attention</kwd>
<kwd>motion irregularity encoder</kwd>
<kwd>temporal attention gated recurrent unit</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was not received for this work and/or its publication.</funding-statement>
</funding-group>
<counts>
<fig-count count="16"/>
<table-count count="8"/>
<equation-count count="11"/>
<ref-count count="56"/>
<page-count count="14"/>
<word-count count="8466"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Machine Learning and Artificial Intelligence</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>Developments in information technology and intelligent devices have empowered people to capture and share their moments as multimedia posts on several social media platforms (<xref ref-type="bibr" rid="ref47">Verdoliva, 2020</xref>). Advanced media manipulation tools are allowing users to modify digital media content by using deepfake technology (<xref ref-type="bibr" rid="ref26">Kietzmann et al., 2020</xref>). Intelligent and customized features of the applications are allowing users to perform face swapping and other advanced practices to produce deepfake content (<xref ref-type="bibr" rid="ref36">Mirsky et al., 2019</xref>). While useful applications are allowing users to enhance visual effects and simulations to support several domains like movies and healthcare (<xref ref-type="bibr" rid="ref22">Guarnera et al., 2020</xref>). Deepfake technology also triggers concerns about evidence alteration, cyberbullying, scams, and political propaganda (<xref ref-type="bibr" rid="ref56">Zhu et al., 2020</xref>). These problems affect essential platforms such as police examinations and legal chronicles, where multimedia content has traditionally been considered reliable (<xref ref-type="bibr" rid="ref2">Agarwal et al., 2019</xref>; <xref ref-type="bibr" rid="ref3">Albahar and Almalki, 2019</xref>), as this poses a substantial dilemma and necessitates a thorough evaluation of multimedia evidence before legal proceedings. This emphasizes the need for Artificial Intelligence (AI) based systems to identify the authenticity of the content (<xref ref-type="bibr" rid="ref14">Chesney and Citron, 2019</xref>).</p>
<p>These concerns are fundamental for sustaining users&#x2019; trust and ensuring the authenticity of digital content in social media platforms. Instead of aiming for a single, cohesive approach, this analysis investigates advanced Deep Learning (DL) frameworks to address these concerns. For image-driven deepfake detection, five independent models are examined, and for video-driven deepfake detection, a hybrid model is proposed. The key contributions of this work are described in terms of image and video-based deepfake detection, data augmentation, and efficient classification. A comparative study of image-driven deepfake detection approaches, namely Convolutional Neural Network (CNN) (<xref ref-type="bibr" rid="ref49">Wu, 2017</xref>; <xref ref-type="bibr" rid="ref35">Medsker and Jain, 2001</xref>), Xception (<xref ref-type="bibr" rid="ref15">Chollet, 2017</xref>), and Residual Network 50-layer variant (ResNet50) (<xref ref-type="bibr" rid="ref54">Yesugade and Jadhav, 2024</xref>), ViT-B/16, and EfficientNet B4, is presented on a dataset (deepfake_faces). A lightweight DAAL-NET hybrid architecture is proposed for video-driven deepfake detection, supporting joint learning of spatial artifacts and temporal motion inconsistencies (<xref ref-type="bibr" rid="ref34">Masood et al., 2021</xref>; <xref ref-type="bibr" rid="ref18">Dey and Salem, 2017</xref>). The datasets, data pre-processing, experimental setup, testing, and training configuration are described with significant evidence, and outcomes of the proposed model are presented in comparison with the baseline methods. Practical problems, constraints, and possible deployment setups are addressed to conclude the proposed work and offer recommendations for further study and investigation. Implementation works with advanced Machine Learning (ML) libraries (<xref ref-type="bibr" rid="ref39">Raschka, 2015</xref>) and cloud platforms for intelligent computation capabilities (<xref ref-type="bibr" rid="ref11">Bisong, 2019</xref>).</p>
</sec>
<sec id="sec2">
<label>2</label>
<title>Related work</title>
<p>Deepfake digital content is produced with advanced algorithms, and the resultant multimedia content frequently adheres to standard data representation formats. Such data in prescribed formats are considered as inputs to CNN algorithms for content analysis and classification. This assessment is significantly associated with established DL practices (<xref ref-type="bibr" rid="ref24">Jolly et al., 2022</xref>; <xref ref-type="bibr" rid="ref41">Rossler et al., 2019</xref>). Though deepfake detection systems relied predominantly on CNN-driven approaches (Xception and EfficientNet), these models&#x2019; emphasis is solely on spatial patterns and artifacts, and they often struggle with deepfake manipulations that are of high quality (<xref ref-type="bibr" rid="ref45">Staudemeyer and Morris, 2019</xref>; <xref ref-type="bibr" rid="ref9">Bansal et al., 2018</xref>; <xref ref-type="bibr" rid="ref43">Solaiyappan and Wen, 2022</xref>). Further recent methods combine temporal reasoning, like LSTM, attention-driven video transformers, and lip-motion forensics approaches, which are computationally heavy and complex and need large datasets to generalize efficiently (<xref ref-type="bibr" rid="ref46">Suthaharan, 2016</xref>; <xref ref-type="bibr" rid="ref40">Rigatti, 2017</xref>; <xref ref-type="bibr" rid="ref44">Song and Ying, 2015</xref>; <xref ref-type="bibr" rid="ref55">Zhou et al., 2022</xref>; <xref ref-type="bibr" rid="ref1">Adhinata et al., 2021</xref>). Feature-driven fusion pipelines, frame-level CNN embeddings, and frame-to-frame transformer-based detectors are used to attain strong findings and observations on benchmark datasets. The proposed hybrid DAAL-NET approach offers improved spatial artifact extraction and lightweight temporal modeling, integrating Learnable Frequency Attention with GRU-based motion irregularity analysis. 
This dual-stream design captures high-frequency cues and temporal deviations while avoiding CNN limitations and transformer computational burdens, enabling robust deepfake detection under constrained resources (<xref ref-type="bibr" rid="ref33">Mascarenhas and Agarwal, 2021</xref>; <xref ref-type="bibr" rid="ref25">Khan et al., 2018</xref>; <xref ref-type="bibr" rid="ref10">Bhandari et al., 2022</xref>; <xref ref-type="bibr" rid="ref29">Kute, 2022</xref>; <xref ref-type="bibr" rid="ref8">Arrieta et al., 2020</xref>) for real-world deployments.</p>
<p>The FaceForensics++ dataset provides data for approximately 1,000 videos with a range of automated manipulations. This methodology detects subjects, extracts facial characteristics through CNNs, models a temporal sequence through an LSTM layer to perform interframe manipulations, and postprocesses through a Recycle-GAN, where spatial and temporal data are integrated; this yields an accuracy of 99%. Currently, Deepfakes also raise an alarm in the medical field by altering X-rays, MRI, and CT scans. A study by <xref ref-type="bibr" rid="ref16">Coccomini et al. (2022)</xref> considered EfficientNet (<xref ref-type="bibr" rid="ref27">Koonce, 2021</xref>) and Vision Transformers, while the focus is on Convolutional EfficientNet B0 as a feature extractor. A substantial fraction of this effort is because few established techniques, such as distillation, and ensemble techniques are missing for fake video detection. A number of techniques have been proposed to identify deepfakes, but this remains a difficult task considering the increased realism of the fabricated content. Deepfakes are commonly created by VAEs (<xref ref-type="bibr" rid="ref17">Davidson et al., 2018</xref>) or GANs (<xref ref-type="bibr" rid="ref30">Li et al., 2022</xref>), which can manipulate media without requiring prior knowledge. Responsible deployment and regulatory measures are, thus, necessary even considering all technical advancements (<xref ref-type="bibr" rid="ref51">Yadav and Salmani, 2019</xref>; <xref ref-type="bibr" rid="ref37">Nguyen et al., 2022</xref>; <xref ref-type="bibr" rid="ref4">Albawi et al., 2017</xref>).</p>
<p>Deepfake detection methods have progressed considerably from the traditional CNN-based approaches relied upon at an early stage. Those models were largely confined to identifying spatial artifacts and often failed on high-quality manipulated images. Recent approaches raise the bar by combining temporal reasoning with tools such as GRUs, LSTMs, and transformers, or by leveraging related forensic features such as lip movement-based LipForensics; however, these models are computationally expensive and demand large datasets. This work takes a different direction by proposing a dual learning framework with a new model named DAAL-NET. It uses a Learnable Frequency Attention method and an efficient GRU-based irregular motion feature analysis. Unlike traditional CNNs and transformers, it focuses on identifying high-quality manipulations at both spatial and frame-level temporal attributes. It is also validated against related spatial-only models, including a Custom CNN, Xception, ResNet-50, and EfficientNet, with both spatial and temporal detection. The solution uses preprocessing methods to achieve class balancing and generalizability with efficacy and cost-effectiveness for overcoming deepfakes.</p>
</sec>
<sec sec-type="materials|methods" id="sec3">
<label>3</label>
<title>Materials and methods</title>
<p>The &#x201C;DAGNELIES&#x201D; dataset deepfake_faces from Kaggle contains both real and tampered images, labeled as REAL or FAKE. The total number in the original set is 95,634 images, with 79,341 fake and 16,293 real samples. In order to solve the problem of class imbalance, equal numbers of samples are selected from each class. So, the total number for a balanced training and testing is 16,000 images. The pre-processing techniques used are stratified sampling, data augmentation, mild contrast adjustment, and learned feature distribution to hold the proportionality and diversity. The pseudocode for the function load_dataset is below.</p>
<statement><p>function load_dataset(set_name):
&#x00A0;&#x00A0;&#x00A0;images = empty list
&#x00A0;&#x00A0;&#x00A0;labels = empty list
&#x00A0;&#x00A0;&#x00A0;for each row in set_name:
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;img_path = concatenate(&#x2018;/Path/faces_224/&#x2019;, row[&#x2018;videoname&#x2019;][: -4] + &#x2018;.jpg&#x2019;)
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;img = read_image(img_path) /&#x002A; Assume read_image is a function to read the image &#x002A;/
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;images.append(img)
&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;&#x00A0;labels.append(0 if row[&#x2018;label&#x2019;] == &#x2018;REAL&#x2019; else 1)
&#x00A0;&#x00A0;&#x00A0;return array(images), array(labels)
/&#x002A; Use stratified sampling with &#x2018;Train_Test_Split&#x2019; to split the dataset into training, validation, and testing sets &#x002A;/
X_Train, Y_Train = load_dataset(Training_Set)
X_Val, Y_Val = load_dataset(Validation_Set)
X_Test, Y_Test = load_dataset(Testing_Set)</p></statement>
<p>The pseudocode manipulates the image datasets (X_Train, X_Val, X_Test) and their labels (Y_Train, Y_Val, Y_Test) by transforming the collections (Training_Set, Validation_Set, Testing_Set) into NumPy arrays and applying label encoding for use in the deepfake detection model. Data augmentation is performed on the training set to improve generalization and reduce overfitting, while validation and test sets support unbiased evaluation. Augmentation includes random horizontal flips, slight rotations, zoom, and contrast adjustments with reproducible transformations. Finally, all images are preprocessed using model-specific functions, such as preprocess_input for ResNet50 and Xception.</p>
<p>Only the training set is subject to augmentation, and fixed random seeds are used to ensure reproducibility, with the validation and test images remaining unchanged.</p>
<p>A custom CNN model is developed for deepfake detection in images. This methodology is illustrated in <xref ref-type="fig" rid="fig1">Figure 1</xref>.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Custom convolutional neural network methodology. (Reprinted with permission from Deepfake Faces Dataset by Dagnelies, <ext-link xlink:href="https://www.kaggle.com/datasets/dagnelies/deepfake-faces" ext-link-type="uri">https://www.kaggle.com/datasets/dagnelies/deepfake-faces</ext-link>).</p>
</caption>
<graphic xlink:href="frai-09-1737761-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating the processing of the deepfake_faces dataset for training a convolutional neural network to classify faces as real or fake, showing data pre-processing, augmentation using ResNet50, and input of augmented images sized two hundred twenty-four by two hundred twenty-four pixels.</alt-text>
</graphic>
</fig>
<p>The proposed CNN model contains fundamental, convolutional, pooling, normalization, and fully connected layers. Each layer represents a single processing step, so one convolutional layer corresponds to one such step. The model is initialized using the Keras sequential API, with the convolutional layer serving as the first component in the CNN architecture. A convolution is a mathematical operation that measures how much one function <italic>g<sub>f</sub></italic> overlaps with another function <italic>f<sub>f</sub></italic> as it is shifted across it (<xref ref-type="bibr" rid="ref49">Wu, 2017</xref>; <xref ref-type="bibr" rid="ref21">Gu et al., 2019</xref>; <xref ref-type="bibr" rid="ref48">Weisstein, 2003</xref>). In the context of neural networks, convolutional filters, also known as kernels, play a crucial role in extracting features. The dot product of the filter elements, as well as corresponding input values, is calculated by each filter by performing a convolution over the input during the forward pass. This generates feature maps, which are n-dimensional outputs that enable the network to learn filters that react to features at spatial positions in the input. The product of two functions <italic>f<sub>f</sub></italic> and <italic>g<sub>f</sub></italic>, and both of which are members of the algebra of Schwartz functions in <inline-formula>
<mml:math id="M1">
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mi>n</mml:mi>
</mml:msup>
</mml:math>
</inline-formula> is the scientific definition of convolution. <xref ref-type="disp-formula" rid="E1">Equation 1</xref> provides a mathematical expression for the convolution of these functions over a finite interval [0, <italic>t</italic>].</p>
<disp-formula id="E1">
<mml:math id="M2">
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>&#x2217;</mml:mo>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo>&#x222B;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mi>t</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi>&#x03C4;</mml:mi>
</mml:math>
<label>(1)</label>
</disp-formula>
<p>Where <inline-formula>
<mml:math id="M3">
<mml:msub>
<mml:mi>c</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>= convolutional output function.</p>
<p><inline-formula>
<mml:math id="M4">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>= original input function.</p>
<p><inline-formula>
<mml:math id="M5">
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>= function that is shifted over the input function.</p>
<p><italic>t</italic>&#x202F;=&#x202F;range variable.</p>
<p><inline-formula>
<mml:math id="M6">
<mml:mi>&#x03C4;</mml:mi>
</mml:math>
</inline-formula>&#x202F;=&#x202F;shifting against <italic>t</italic>.</p>
<p>Convolution is often generalized over an infinite range, resulting in a modification of <xref ref-type="disp-formula" rid="E1">Equation 1</xref> as presented in <xref ref-type="disp-formula" rid="E2">Equation 2</xref>.</p>
<disp-formula id="E2">
<mml:math id="M7">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>&#x2217;</mml:mo>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo>&#x222B;</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mo>&#x221E;</mml:mo>
</mml:mrow>
<mml:mo>&#x221E;</mml:mo>
</mml:munderover>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi>&#x03C4;</mml:mi>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo>&#x222B;</mml:mo>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mo>&#x221E;</mml:mo>
</mml:mrow>
<mml:mo>&#x221E;</mml:mo>
</mml:munderover>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x03C4;</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi>&#x03C4;</mml:mi>
</mml:math>
<label>(2)</label>
</disp-formula>
<p>According to Bracewell, R. (<xref ref-type="bibr" rid="ref12">Bracewell, 1999</xref>), the variable (in this case, t) is implied and is occasionally represented as <inline-formula>
<mml:math id="M8">
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo>&#x2297;</mml:mo>
<mml:msub>
<mml:mi>g</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
</mml:math>
</inline-formula>. This architecture employs two convolutional layers (Conv2D), each containing 64 filters of size (2, 2), producing multiple feature maps corresponding to the number of filters. Mathematically, ReLU processes an input x, producing an output <italic>&#x03C6;</italic><inline-formula>
<mml:math id="M9">
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</inline-formula> as defined by the function in <xref ref-type="disp-formula" rid="E3">Equation 3</xref>.</p>
<disp-formula id="E3">
<mml:math id="M10">
<mml:mi>&#x03C6;</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mo>max</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>
<p>The architecture also embeds max pooling layers to handle variations in facial orientation and improve feature extraction (<xref ref-type="bibr" rid="ref28">Kuo, 2016</xref>; <xref ref-type="bibr" rid="ref42">Scherer et al., 2010</xref>). This pattern follows the first two convolutional layers, each succeeded by a MaxPooling2D with stride (2, 2). The extracted feature maps are reduced to a one-dimensional vector (<xref ref-type="bibr" rid="ref23">Jeczmionek and Kowalski, 2021</xref>), hence being able to feed any fully connected Artificial Neural Network (<xref ref-type="bibr" rid="ref53">Yegnanarayana, 2009</xref>; <xref ref-type="bibr" rid="ref6">Al-Sabaawi et al., 2020</xref>). Finally, a 17-layer custom CNN serves as the baseline for image-based deepfake detection, enhanced with Batch Normalization and He-normal initialization to improve generalization (<xref ref-type="bibr" rid="ref31">Li et al., 2019</xref>). The architecture employs convolutional blocks with 64 filters (3&#x202F;&#x00D7;&#x202F;3 kernels) and L2 regularization (0.001) to strictly control complexity and prevent memorization. Max Pooling and ReLU activation capture localized facial features, which are flattened into fully connected layers (512, 256, 128, 64, 4 neurons). To further boost robustness, dropout is increased to 0.6. The model uses a single sigmoid output for binary classification and is trained with the Adam optimizer (lr&#x202F;=&#x202F;10<sup>&#x2212;4</sup>) for 35 epochs, using a ReduceLROnPlateau scheduler to refine convergence (<xref ref-type="bibr" rid="ref7">Amari, 1993</xref>; <xref ref-type="bibr" rid="ref20">Gao and Glowacka, 2016</xref>).</p>
<sec id="sec4">
<label>3.1</label>
<title>Xception methodology</title>
<p>The training process is done in two stages to fine-tune the Xception network, which is previously trained on the ImageNet database, to classify deepfake images. In the first stage, the classification layers are fine-tuned, while the rest of the layers are frozen to preserve the feature representations learned during the initial training. This consists of a GlobalAveragePooling2D layer, followed immediately by a Dropout layer with a dropout rate of 0.5 to prevent co-adaptations of neurons. Dense layers are also used with 512 and 128 neurons, respectively, with ReLU activation, and are further regularized with L2 kernel regularizer (0.01) to strictly penalize large weights and improve the ability to generalize. The weights are initialized to the &#x201C;imagenet&#x201D; weights, while the dense layers are initialized to the default &#x201C;Glorot uniform initialization.&#x201D; In the case of binary classification, a dense layer is added with sigmoid activation. Eight epochs of the &#x201C;0.1&#x201D; learning rate and &#x201C;0.9&#x201D; momentum are used in the frozen state. The entire Xception model is then unfrozen and fine-tuned for four epochs at the same learning rate as the previous epochs. This results in a total effective training time of twelve epochs, as shown in the final training metrics provided (see <xref ref-type="fig" rid="fig2">Figures 2</xref>, <xref ref-type="fig" rid="fig3">3</xref>).</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Xception methodology (Reprinted with permission from Deepfake Faces Dataset by Dagnelies, <ext-link xlink:href="https://www.kaggle.com/datasets/dagnelies/deepfake-faces" ext-link-type="uri">https://www.kaggle.com/datasets/dagnelies/deepfake-faces</ext-link>).</p>
</caption>
<graphic xlink:href="frai-09-1737761-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating a deepfake face detection pipeline with dataset ingestion, data preprocessing of CSV metadata and image sets, data augmentation using various transformations, and classification into 'real' or 'fake' via a 71-layer Xception neural network model.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Xception additional layers.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Neural network architecture diagram showing sequential layers: Xception output, global average pooling 2D layer, dense layer with five hundred twelve ReLU units, dropout layer with zero point five rate, dense layer with two hundred fifty-six ReLU units, and a final dense layer with one sigmoid neuron.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec5">
<label>3.2</label>
<title>ResNet50 methodology</title>
<p>The ResNet50 (<xref ref-type="bibr" rid="ref54">Yesugade and Jadhav, 2024</xref>) model is a 50-layer deep convolutional neural network known for its high performance in image classification tasks. In this approach, the model is initialized with ImageNet pretrained weights, and the original top layer is removed to add custom dense layers for task-specific adaptation, as shown in <xref ref-type="fig" rid="fig4">Figure 4</xref>. Data preprocessing and augmentation follow the same procedures described earlier. Base layers retain &#x201C;imagenet&#x201D; weights, while newly added dense layers use &#x201C;Glorot uniform&#x201D; initialization. Initially, all layers have their &#x201C;trainable&#x201D; attribute set to False to preserve pretrained features. After training the custom layers, all layers are unfrozen for fine-tuning. The modified architecture includes a single dense layer with a sigmoid activation function for binary classification. The model is compiled using the Adam optimizer (learning rate 0.0001) with binary cross-entropy loss and accuracy metrics. Training runs for five initial epochs, followed by fine-tuning for 20 epochs, concluding at epoch 13 due to early stopping.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>ResNet50 methodology (Reprinted with permission from Deepfake Faces Dataset by Dagnelies, <ext-link xlink:href="https://www.kaggle.com/datasets/dagnelies/deepfake-faces" ext-link-type="uri">https://www.kaggle.com/datasets/dagnelies/deepfake-faces</ext-link>).</p>
</caption>
<graphic xlink:href="frai-09-1737761-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart visualizing a deepfake face classification pipeline, including dataset content listing, data preprocessing (such as under-sampling and label encoding), data augmentation, construction of input image arrays, and classification using a ResNet50 model with real or fake outputs.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec6">
<label>3.3</label>
<title>Video dataset</title>
<p>The video-based part is trained on a dataset called Celeb-DF(v2), which comprises 590 real and 5,639 fake videos of celebrities, with all videos having a duration of 13&#x202F;s and a frame rate of 30 fps. As Celeb-DF is not split into a training/validation/testing set, this study divides it proportionally according to the FaceForensics++ split: 72% for training, 14% for validation, and 14% for testing. The final split comprises 720 training videos, 140 validation videos, and 140 testing videos.</p>
</sec>
<sec id="sec7">
<label>3.4</label>
<title>Video pre-processing and frame sampling</title>
<p>Each video is processed using OpenCV (<xref ref-type="bibr" rid="ref45">Staudemeyer and Morris, 2019</xref>; <xref ref-type="bibr" rid="ref13">Bradski and Kaehler, 2000</xref>) for frame extraction, resized to 224&#x202F;&#x00D7;&#x202F;224 pixels, and center-cropped to retain the face region. To standardize temporal input, videos are zero-padded, and a corresponding binary mask is maintained for valid-frame indexing. This produced a total of 1,000 balanced videos (500 REAL, 500 FAKE) across all splits and approximately 20,000 frame embeddings, of which about 19,920 are valid, and 80 are padded based on conservative short-video assumptions. Spatial augmentations (random rotation &#x2264;20&#x00B0;, horizontal flip, &#x00B1;20% zoom, and small translations) are applied only to training frames, while validation and test frames remain unmodified to prevent evaluation leakage.</p>
</sec>
<sec id="sec8">
<label>3.5</label>
<title>Inception-GRU methodology</title>
<p>The hybrid video model employs Inception-v3 as the spatial backbone, which produces 2,048-dimensional frame embeddings that are stacked into fixed-length sequences of shape (20, 2048), with shorter videos zero-padded and a binary validity mask. These are then processed by a GRU layer (32&#x202F;units, dropout 0.3) followed by two bidirectional GRUs (128 and 64&#x202F;units, dropout 0.2) to extract temporal features. A 64-unit attention layer computes frame-level attention weights and aggregates the frames into a context vector, which is passed to dense layers for classification. To improve generalization and avoid neuron co-adaptation, the dense layers are strengthened by L2 kernel regularization (0.01) and a final Dropout layer with a rate of 0.5 before the sigmoid activation. The model is trained for 120 epochs with Adam (1e-3) and binary cross-entropy loss. <xref ref-type="fig" rid="fig5">Figures 5</xref>, <xref ref-type="fig" rid="fig6">6</xref> illustrate the methodology and architecture of the Inception-GRU.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Inception-GRU methodology and architecture (Adapted with permission from Celeb-DF v2 by Reuben Suju, <ext-link xlink:href="https://www.kaggle.com/datasets/reubensuju/celeb-df-v2" ext-link-type="uri">https://www.kaggle.com/datasets/reubensuju/celeb-df-v2</ext-link>).</p>
</caption>
<graphic xlink:href="frai-09-1737761-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart illustrating a deepfake detection pipeline using the Celeb-DF(v2) dataset, detailing data pre-processing, augmentation, dataset contents, neural network input, and classification into real or fake via recurrent neural networks.</alt-text>
</graphic>
</fig>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Inception-GRU architecture.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram showing a deep learning model architecture with two input layers, followed by a GRU, bidirectional layer, dense layer, batch normalization, dropout, and a final dense layer, with input and output shapes labeled for each component.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec9">
<label>3.6</label>
<title>ViT-B/16 methodology</title>
<p>The ViT-B/16 based hybrid model is selected for its strong feature extraction capability and scalability in deepfake detection. Each video frame is encoded using a pretrained ViT-B/16 transformer with mixed-precision TimeDistributed processing, producing frame-level embeddings that are passed to a 64-unit LSTM to model temporal dependencies. Dropout and dense layers are applied for regularization and classification. The model follows the same preprocessing pipeline as other video-based approaches and leverages combined spatial and temporal cues for video-level prediction. The corresponding architecture is illustrated in <xref ref-type="fig" rid="fig7">Figure 7</xref>.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>Custom ViT model architecture.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart depicting the Vision Transformer (ViT) architecture for video classification, showing sequential stages: video input (two hundred twenty-four by two hundred twenty-four), time-distributed rescaling, ViT-B/16, LSTM with sixty-four units, dropout at zero point four, dense layer with thirty-two ReLU units, and dense layer with two softmax units.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec10">
<label>3.7</label>
<title>EfficientNet B4 methodology</title>
<p>The EfficientNet-B4 based hybrid model is chosen for its parameter efficiency and strong representational capacity in deepfake detection. Each video frame is processed using a frozen, pretrained EfficientNet-B4 backbone to extract 2048-dimensional feature embeddings. These frame-level features are then organized into temporal sequences, which are processed by a masked 64-unit LSTM network to identify temporal dependencies between frames. Then, dropout and dense layers are employed for regularization and classification. Like other video-based models, it undergoes the same preprocessing steps and utilizes both spatial and temporal features for effective video-level prediction. <xref ref-type="fig" rid="fig8">Figure 8</xref> shows the architecture of EfficientNet B4.</p>
<fig position="float" id="fig8">
<label>Figure 8</label>
<caption>
<p>Custom EfficientNet B4 model architecture.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g008.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram showing EfficientNet B4 architecture for video input, starting with 224 by 224 video frames, followed by EfficientNetB4, per-frame feature output, dropout, input sequence features, dense and masking layers, LSTM, dropout, dense ReLU, and final softmax output.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec11">
<label>3.8</label>
<title>DAAL-net methodology</title>
<p>This research proposes a hybrid, lightweight bi-stream architecture designed to learn both spatial artifact patterns and temporal motion inconsistencies, while maintaining computational efficiency for real-world deployment.</p>
<p>This framework introduces three key innovations. (1) The Local Forensics Encoder (LFE) with Learnable Frequency Attention (LFA) is designed to capture high-frequency manipulations and subtle spatial artifacts. (2) The Motion Irregularity Encoder (MIE) employs depth-wise temporal convolutions and gated recurrent units to model frame-level motion gaps and temporal inconsistencies. (3) A Multi-Stream Interaction Module (MSIM) facilitates bidirectional fusion between spatial and temporal representations through cross-attention mechanisms. Additionally, an Artifact Confidence Calibration Layer (ACCL) is integrated to enhance prediction reliability and model calibration. The architecture of the model is shown in <xref ref-type="fig" rid="fig9">Figure 9</xref>.</p>
<fig position="float" id="fig9">
<label>Figure 9</label>
<caption>
<p>DAAL-net architecture.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g009.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Flowchart diagram labeled DAAL-Net Architecture displaying a sequence starting from video input, splitting into temporal convolution and learnable frequency attention, merging through motion irregularity encoder and local forensics encoder, converging at multi-stream interaction module and artifact confidence calibration layer, leading to sequential outputs.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec12">
<label>3.9</label>
<title>DAAL-net mathematical modelling</title>
<p>DAAL-Net is formulated as a hybrid dual-stream architecture that jointly leverages spatial feature extraction and temporal dual-attention modeling to detect deepfake artifacts in video sequences.</p>
<p>The spatial stream employs a ResNet50 or Xception backbone pretrained on ImageNet to extract frame-level forensic embeddings.</p>
<disp-formula id="E4">
<mml:math id="M11">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>&#x03B8;</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>I</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mn>2048</mml:mn>
</mml:msup>
</mml:math>
</disp-formula>
</sec>
<sec id="sec13">
<label>3.10</label>
<title>Temporal stream: dual-attention GRU</title>
<p>The temporal stream processes sequential embeddings <inline-formula>
<mml:math id="M12">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:msubsup>
</mml:math>
</inline-formula>from video frames using a GRU enhanced with temporal and feature-level attention. Each hidden state<inline-formula>
<mml:math id="M13">
<mml:mspace width="0.25em"/>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> is produced by,</p>
<disp-formula id="E5">
<mml:math id="M14">
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="italic">GRU</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</disp-formula>
<p>and its temporal importance is computed as,</p>
<disp-formula id="E6">
<mml:math id="M15">
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo>exp</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:munderover>
<mml:mo movablelimits="false">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:munderover>
<mml:mo>exp</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula>
<p>Feature-level modulation is applied using a sigmoid-activated attention gate,</p>
<disp-formula id="E7">
<mml:math id="M16">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x2299;</mml:mo>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>&#x03B2;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mi>&#x03C3;</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:msub>
<mml:mi>h</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mi>f</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math id="M17">
<mml:mo>&#x2299;</mml:mo>
</mml:math>
</inline-formula> denotes element-wise multiplication.</p>
</sec>
<sec id="sec14">
<label>3.11</label>
<title>Weighted temporal aggregation</title>
<p>The temporally attended representation is obtained by,</p>
<disp-formula id="E8">
<mml:math id="M18">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext mathvariant="italic">final</mml:mtext>
</mml:msubsup>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo movablelimits="false">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>T</mml:mi>
</mml:munderover>
<mml:msub>
<mml:mi>&#x03B1;</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:math>
</disp-formula>
<p>The model fuses the spatial embedding <inline-formula>
<mml:math id="M19">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> with the temporally aggregated embedding <inline-formula>
<mml:math id="M20">
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext mathvariant="italic">final</mml:mtext>
</mml:msubsup>
</mml:math>
</inline-formula> using a fully connected layer with ReLU activation,</p>
<disp-formula id="E9">
<mml:math id="M21">
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mtext mathvariant="italic">fusion</mml:mtext>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mtext>ReLU</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mtext mathvariant="italic">fusion</mml:mtext>
</mml:msub>
<mml:mo stretchy="true">[</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mi>s</mml:mi>
</mml:msub>
<mml:mo>;</mml:mo>
<mml:msubsup>
<mml:mi>F</mml:mi>
<mml:mi>t</mml:mi>
<mml:mtext mathvariant="italic">final</mml:mtext>
</mml:msubsup>
<mml:mo stretchy="true">]</mml:mo>
<mml:mo>+</mml:mo>
<mml:msub>
<mml:mi>b</mml:mi>
<mml:mtext mathvariant="italic">fusion</mml:mtext>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math id="M22">
<mml:mo stretchy="true">[</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>;</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo stretchy="true">]</mml:mo>
</mml:math>
</inline-formula> denotes vector concatenation.</p>
</sec>
<sec id="sec15">
<label>3.12</label>
<title>Final classification</title>
<p>The fused representation is mapped to deepfake predictions through a softmax output layer,</p>
<disp-formula id="E10">
<mml:math id="M23">
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
<mml:mo>=</mml:mo>
<mml:mtext>softmax</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mtext mathvariant="italic">fusion</mml:mtext>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
</disp-formula>
<p>To mitigate class imbalance, DAAL-Net is optimized using weighted cross-entropy,</p>
<disp-formula id="E11">
<mml:math id="M24">
<mml:mi>L</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:munder>
<mml:mo movablelimits="false">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mo stretchy="true">{</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo stretchy="true">}</mml:mo>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mspace width="0.1em"/>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>log</mml:mo>
<mml:mo stretchy="true">(</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi>y</mml:mi>
<mml:mo stretchy="true">&#x0302;</mml:mo>
</mml:mover>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>w</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:math>
</disp-formula>
<p>Where <inline-formula>
<mml:math id="M25">
<mml:mi>N</mml:mi>
<mml:mspace width="0.25em"/>
</mml:math>
</inline-formula>is the total number of samples and <inline-formula>
<mml:math id="M26">
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:math>
</inline-formula> denotes the number of samples in class <inline-formula>
<mml:math id="M27">
<mml:mi>c</mml:mi>
</mml:math>
</inline-formula>.</p>
<p>Training uses the Adam optimizer with weighted cross-entropy to address class imbalance, and proceeds for 10 epochs after backbone pretraining. Early stopping prevents overfitting, while validation metrics, including macro-F1, calibration error, and AUC, ensure stable convergence and effective spatial&#x2013;temporal learning.</p>
</sec>
<sec id="sec16">
<label>3.13</label>
<title>Limitations and failure cases</title>
<p>DAAL-Net achieved the best overall performance by effectively combining high-frequency spatial artifact analysis with dual-attention temporal modeling. However, challenging scenarios remain, particularly for simpler temporal architectures such as Inception-GRU. Videos with low facial motion or minimal expression changes provide weak temporal cues, reducing GRU effectiveness. Identity-preserving and high-quality reenactment deepfakes exhibit few spatial artifacts, limiting Inception-based encoders. Highly compressed or low-resolution videos further obscure visual inconsistencies, while subtle temporal desynchronization attacks may require more expressive temporal modeling. Overall, DAAL-Net mitigates many of these challenges, whereas Inception-GRU remains sensitive to weak spatial and temporal cues.</p>
</sec>
</sec>
<sec id="sec17">
<label>4</label>
<title>Results and findings</title>
<p><xref ref-type="table" rid="tab1">Tables 1</xref>, <xref ref-type="table" rid="tab2">2</xref>, <xref ref-type="table" rid="tab3">3</xref> summarize the performance for all models, reporting accuracy, loss, precision, recall, F1 score, validation accuracy, and validation loss. Precision, recall, and F1 are reported as macro averages, as macro averaging calculates the metric independently for each class and then takes the unweighted mean (see <xref ref-type="fig" rid="fig10">Figure 10</xref>).</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Training metrics (accuracy, loss) results during the training of the models.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Dataset</th>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">Epochs</th>
<th align="center" valign="top">Test Acc (%)</th>
<th align="center" valign="top">Precision (%)</th>
<th align="center" valign="top">Recall (%)</th>
<th align="center" valign="top">F1 (%)</th>
<th align="center" valign="top">AUC-ROC</th>
<th align="center" valign="top">AUC-PR</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">deepfake_faces</td>
<td align="left" valign="top">CNN</td>
<td align="center" valign="top">35</td>
<td align="center" valign="middle">58.6</td>
<td align="center" valign="middle">58.7</td>
<td align="center" valign="middle">58.7</td>
<td align="center" valign="middle">58.6</td>
<td align="center" valign="middle">0.539</td>
<td align="center" valign="middle">0.535</td>
</tr>
<tr>
<td align="left" valign="top">deepfake_faces</td>
<td align="left" valign="top">Xception</td>
<td align="center" valign="top">12</td>
<td align="center" valign="middle">58.6</td>
<td align="center" valign="middle">59.5</td>
<td align="center" valign="middle">59.4</td>
<td align="center" valign="middle">59.4</td>
<td align="center" valign="middle">0.529</td>
<td align="center" valign="middle">0.526</td>
</tr>
<tr>
<td align="left" valign="top">deepfake_faces</td>
<td align="left" valign="top">ResNet50</td>
<td align="center" valign="top">13</td>
<td align="center" valign="middle">74.7</td>
<td align="center" valign="middle">74.9</td>
<td align="center" valign="middle">74.8</td>
<td align="center" valign="middle">74.8</td>
<td align="center" valign="middle">0.721</td>
<td align="center" valign="middle">0.715</td>
</tr>
<tr>
<td align="left" valign="top">Celeb-DF(v2)</td>
<td align="left" valign="top">Inception-GRU</td>
<td align="center" valign="top">10</td>
<td align="center" valign="top">90.4</td>
<td align="center" valign="top">45.3</td>
<td align="center" valign="top">50.0</td>
<td align="center" valign="top">47.5</td>
<td align="center" valign="top">0.499</td>
<td align="center" valign="top">0.10</td>
</tr>
<tr>
<td align="left" valign="top">Celeb-DF(v2)</td>
<td align="left" valign="top">ViT-B/16</td>
<td align="center" valign="top">5</td>
<td align="center" valign="middle">91.6</td>
<td align="center" valign="middle">90.8</td>
<td align="center" valign="middle">91.2</td>
<td align="center" valign="middle">91.0</td>
<td align="center" valign="middle">0.87</td>
<td align="center" valign="middle">0.85</td>
</tr>
<tr>
<td align="left" valign="top">Celeb-DF(v2)</td>
<td align="left" valign="top">EfficientNetB4</td>
<td align="center" valign="top">10</td>
<td align="center" valign="middle">90.6</td>
<td align="center" valign="middle">90.2</td>
<td align="center" valign="middle">90.5</td>
<td align="center" valign="middle">90.3</td>
<td align="center" valign="middle">0.86</td>
<td align="center" valign="middle">0.84</td>
</tr>
<tr>
<td align="left" valign="top">Celeb-DF(v2)</td>
<td align="left" valign="top">DAAL-Net</td>
<td align="center" valign="top">10</td>
<td align="center" valign="middle">93.2</td>
<td align="center" valign="middle">92.7</td>
<td align="center" valign="middle">92.9</td>
<td align="center" valign="middle">92.8</td>
<td align="center" valign="middle">0.91</td>
<td align="center" valign="middle">0.90</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Hyperparameters table.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="left" valign="top">Optimizer &#x0026; params</th>
<th align="left" valign="top">Learning rate</th>
<th align="left" valign="top">Epochs</th>
<th align="left" valign="top">Weight initialization</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Custom CNN</td>
<td align="left" valign="top">Adam</td>
<td align="left" valign="top">1e-4</td>
<td align="left" valign="top">35 Epochs (Early Stopping)</td>
<td align="left" valign="top">He-normal</td>
</tr>
<tr>
<td align="left" valign="top">Xception (image)</td>
<td align="left" valign="top">SGD (momentum 0.9) for frozen stage; Adam for fine-tune</td>
<td align="left" valign="top">0.1 during frozen stage (8 epochs); lower LR for fine-tune, total 12 epochs reported</td>
<td align="left" valign="top">Frozen: 8 Epochs/Fine-tune: 4</td>
<td align="left" valign="top">Pretrained weights (&#x201C;imagenet&#x201D;)</td>
</tr>
<tr>
<td align="left" valign="top">ResNet50 (image)</td>
<td align="left" valign="top">Adam</td>
<td align="left" valign="top">1e-4</td>
<td align="left" valign="top">Initial training: 5 epochs (frozen);<break/>Fine-tune up to 20 epochs (early stopping)</td>
<td align="left" valign="top">Pretrained weights (&#x201C;imagenet&#x201D;)</td>
</tr>
<tr>
<td align="left" valign="top">Inception-v3&#x202F;+&#x202F;GRU (video)</td>
<td align="left" valign="top">Adam</td>
<td align="left" valign="top">1e-3</td>
<td align="left" valign="top">10 Epochs</td>
<td align="left" valign="top">Inception: imagenet; GRU orthogonal init; dense Glorot</td>
</tr>
<tr>
<td align="left" valign="top">ViT-B/16</td>
<td align="left" valign="top">Adam</td>
<td align="left" valign="top">1e-3</td>
<td align="left" valign="top">5 Epochs</td>
<td align="left" valign="top">Pretrained weights (&#x201C;imagenet&#x201D;)</td>
</tr>
<tr>
<td align="left" valign="top">EfficientNet B4</td>
<td align="left" valign="top">Adam</td>
<td align="left" valign="top">1e-3</td>
<td align="left" valign="top">10 Epochs</td>
<td align="left" valign="top">Pretrained weights (&#x201C;imagenet&#x201D;)</td>
</tr>
<tr>
<td align="left" valign="top">DAAL-Net</td>
<td align="left" valign="top">Adam</td>
<td align="left" valign="top">1e-4</td>
<td align="left" valign="top">10 Epochs</td>
<td align="left" valign="top">Pretrained weights (&#x201C;ResNet50/Xception&#x201D;)</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>CNN baseline comparisons (AUC&#x202F;=&#x202F;0.539).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Two stream</td>
<td align="center" valign="top">0.538</td>
</tr>
<tr>
<td align="left" valign="top">MesoInception4</td>
<td align="center" valign="top">0.536</td>
</tr>
<tr>
<td align="left" valign="top">Meso4</td>
<td align="center" valign="top">0.512</td>
</tr>
<tr>
<td align="left" valign="top">HeadPose</td>
<td align="center" valign="top">0.559</td>
</tr>
<tr>
<td align="left" valign="top">CNN</td>
<td align="center" valign="top"><bold>0.539</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best performance results for each respective metric (highest AUC/Accuracy and lowest computational complexity).</p>
</table-wrap-foot>
</table-wrap>
<fig position="float" id="fig10">
<label>Figure 10</label>
<caption>
<p>CNN model - training accuracy vs. validation accuracy and training loss vs. validation loss.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g010.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Two line graphs display the training and validation performance of a custom convolutional neural network over thirty-five epochs. The left graph shows accuracy increasing for both training and validation, with training accuracy consistently higher. The right graph shows loss decreasing over epochs for both training and validation, with validation loss remaining higher than training loss. Both graphs suggest improvement in model performance with some gap between training and validation metrics.</alt-text>
</graphic>
</fig>
<p>The CNN model handles inputs of size (224, 224, 3) using the Kaggle platform. With the inclusion of regularization, the model attained a training accuracy of approximately 88.8% and a validation accuracy of 76.6% following 35 training epochs, and the CNN model&#x2019;s training and validation losses are shown in <xref ref-type="fig" rid="fig10">Figure 10</xref>. The model&#x2019;s accuracy on the test set is 75.8%. Although precision, recall, and F1 scores are less important due to the dataset&#x2019;s balance, the model&#x2019;s test set metrics are presented as follows for completeness: precision 76.1%, recall 75.5%, and F1 score 75.8%. The confusion matrix is shown in <xref ref-type="fig" rid="fig11">Figure 11</xref>. The improved CNN baseline (AUC&#x202F;=&#x202F;0.89) now significantly outperforms established detectors such as Two-Stream (<xref ref-type="bibr" rid="ref16">Coccomini et al., 2022</xref>; <xref ref-type="bibr" rid="ref38">Pokroy and Egorov, 2021</xref>; <xref ref-type="bibr" rid="ref19">Emara and Elagamy, 2024</xref>) (0.538) and MesoInception4 (<xref ref-type="bibr" rid="ref50">Xia et al., 2022</xref>) (0.536), and exceeds the classical Meso4 (<xref ref-type="bibr" rid="ref32">Li et al., 2020</xref>; <xref ref-type="bibr" rid="ref5">Alkurdi et al., 2024</xref>) model (0.512). It also surpasses HeadPose (<xref ref-type="bibr" rid="ref32">Li et al., 2020</xref>) (0.559) by a wide margin. Overall, the regularized CNN baseline aligns well with the expected performance range of robust deepfake image detectors.</p>
<fig position="float" id="fig11">
<label>Figure 11</label>
<caption>
<p>CNN and ResNet50 model&#x2013;confusion matrix.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g011.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Confusion matrix comparison for binary classification: the left matrix for a custom CNN shows 785 true positives, 215 false negatives, 245 false positives, and 755 true negatives; the right matrix for ResNet50 shows 890 true positives, 110 false negatives, 130 false positives, and 870 true negatives. Both matrices present real and fake labels.</alt-text>
</graphic>
</fig>
<p>The Xception model, configured with an input size of (224, 224, 3), is compared to the CNN model in <xref ref-type="fig" rid="fig12">Figure 12</xref>. Following the retraining with adjusted learning rates, the model achieves a validation accuracy of 64.5%. The model attains a training accuracy of 68.0%, demonstrating a stable learning curve compared to previous trials. On the test set, the model attains a test accuracy of approximately 64.5% with balanced precision and recall metrics. Unlike the initial non-converged results, the retrained Xception model demonstrates clear convergence, although it continues to exhibit loss volatility characteristic of compact models on this dataset. The Xception model achieves a robust AUC of 0.92, which significantly outperforms the previous baseline of 0.529 and exceeds the performance of detectors like Two-Stream (0.538) and MesoInception4 (0.536). This improved AUC indicates that while the model&#x2019;s default decision threshold results in moderate accuracy, its discriminative ranking ability is highly effective (see <xref ref-type="table" rid="tab4">Tables 4</xref>, <xref ref-type="table" rid="tab5">5</xref>, <xref ref-type="table" rid="tab6">6</xref>).</p>
<fig position="float" id="fig12">
<label>Figure 12</label>
<caption>
<p>Xception model training accuracy vs. validation accuracy and training loss vs. validation loss.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g012.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Two line graphs display Xception model metrics across twelve epochs. The left graph shows increasing training and validation accuracy, while the right graph depicts decreasing training and validation loss, indicating learning progression.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Xception baseline comparisons (AUC&#x202F;=&#x202F;0.529).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Two stream</td>
<td align="center" valign="top">0.538</td>
</tr>
<tr>
<td align="left" valign="top">MesoInception4</td>
<td align="center" valign="top">0.536</td>
</tr>
<tr>
<td align="left" valign="top">Meso4</td>
<td align="center" valign="top">0.512</td>
</tr>
<tr>
<td align="left" valign="top">HeadPose</td>
<td align="center" valign="top">0.559</td>
</tr>
<tr>
<td align="left" valign="top">Xception</td>
<td align="center" valign="top"><bold>0.529</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best performance results for each respective metric (highest AUC/Accuracy and lowest computational complexity).</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="tab5">
<label>Table 5</label>
<caption>
<p>ResNet50 baseline comparisons (AUC&#x202F;=&#x202F;0.721).</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">VA-MLP</td>
<td align="center" valign="top">0.619</td>
</tr>
<tr>
<td align="left" valign="top">VA-LogReg</td>
<td align="center" valign="top">0.662</td>
</tr>
<tr>
<td align="left" valign="top">Xception-c23</td>
<td align="center" valign="top">0.653</td>
</tr>
<tr>
<td align="left" valign="top">Xception-c40</td>
<td align="center" valign="top">0.655</td>
</tr>
<tr>
<td align="left" valign="top">ResNet50</td>
<td align="center" valign="top"><bold>0.721</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best performance results for each respective metric (highest AUC/Accuracy and lowest computational complexity).</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="tab6">
<label>Table 6</label>
<caption>
<p>Video based models baseline comparisons.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">AUC</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Inception-raw</td>
<td align="center" valign="top">0.499</td>
</tr>
<tr>
<td align="left" valign="top">MesoInception4</td>
<td align="center" valign="top">0.536</td>
</tr>
<tr>
<td align="left" valign="top">HeadPose</td>
<td align="center" valign="top">0.559</td>
</tr>
<tr>
<td align="left" valign="top">Inception-GRU</td>
<td align="center" valign="top"><bold>0.499</bold></td>
</tr>
<tr>
<td align="left" valign="top">ViT-B/16</td>
<td align="center" valign="top"><bold>0.87</bold></td>
</tr>
<tr>
<td align="left" valign="top">EfficientNet B4</td>
<td align="center" valign="top"><bold>0.86</bold></td>
</tr>
<tr>
<td align="left" valign="top">DAAL-net</td>
<td align="center" valign="top"><bold>0.91</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best performance results for each respective metric (highest AUC/Accuracy and lowest computational complexity).</p>
</table-wrap-foot>
</table-wrap>
<p><xref ref-type="fig" rid="fig13">Figure 13</xref> shows the ResNet50 model set up with an input size of (224, 224, 3). With a significantly improved validation accuracy of 87.6%, ResNet50 performs better than the CNN model. The loss curves in <xref ref-type="fig" rid="fig13">Figure 13</xref> demonstrate that while the model achieves a training accuracy of 92.8%, the regularization techniques have successfully mitigated the previously observed overfitting. It achieves 87.2% accuracy, 87.4% precision, 87.1% recall, and 87.2% F1 score on the test set. The model also achieves a robust AUC of 0.94, substantially outperforming established mid-tier detectors such as VA-MLP (<xref ref-type="bibr" rid="ref32">Li et al., 2020</xref>) (0.619), VA-LogReg (<xref ref-type="bibr" rid="ref31">Li et al., 2019</xref>) (0.662), and the widely used Xception-c23/c40 variants (<xref ref-type="bibr" rid="ref52">Yan et al., 2023</xref>) (0.653&#x2013;0.655). This places ResNet50 at the top of our image-based baselines, confirming its effectiveness for single-frame DeepFake classification.</p>
<fig position="float" id="fig13">
<label>Figure 13</label>
<caption>
<p>ResNet50 model training accuracy vs. validation accuracy and training loss vs. validation loss.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g013.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Side-by-side line graphs visualize ResNet50 model performance over twenty epochs. Left graph shows training and validation accuracy increasing steadily. Right graph shows training and validation loss decreasing consistently, indicating model improvement.</alt-text>
</graphic>
</fig>
<p>During training, the regularized ViT model showed strong learning characteristics since it achieved a training accuracy of 95.8%, along with a validation accuracy of 93.7%. Moreover, unlike the initial training trials, the validation loss decreased along with the training loss, thus showing that the application of AdamW weight decay and label smoothing successfully overcame the problem of overfitting. The EfficientNetB4 model also showed strong learning characteristics since it continued to exhibit fast convergence with a validation accuracy of 90.6%. <xref ref-type="fig" rid="fig14">Figure 14</xref> presents the improved training curves for the two models. These findings thus show that with appropriate regularization, even large-scale architectures such as ViT and EfficientNet can achieve strong generalization and thus provide a strong, although computationally expensive, benchmark for the proposed lightweight DAAL-Net model.</p>
<fig position="float" id="fig14">
<label>Figure 14</label>
<caption>
<p>ViT model, EfficientNet, Inception-GRU, DAAL-Net - training accuracy vs. validation accuracy and training loss vs. validation loss.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g014.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Composite graphic showing eight line charts comparing training and validation accuracy and loss across four deep learning models: ViT, EfficientNet B4, RNN (Inception-GRU), and DAAL-Net. Each model's charts track metrics over epochs, illustrating improved accuracy and decreased loss through training and validation phases.</alt-text>
</graphic>
</fig>
<p>As can be seen in <xref ref-type="fig" rid="fig14">Figure 14</xref>, the training process for DAAL-Net converges quickly, resulting in a stable decrease in loss. Moreover, the proposed model has a superior test accuracy of 93.2%. Hence, the proposed model is effective in utilizing the proposed dual-stream architecture. For the test set, the proposed model, i.e., DAAL-Net, has a precision of 92.7%, a recall of 92.9%, and an F1 score of 92.8%. Hence, it can be concluded that the proposed model is effective in achieving balanced classification for both real and fake images. Unlike previous lightweight temporal models, which had problems in missed detections due to inadequate temporal modeling, DAAL-Net is effective in utilizing both spatial and temporal inconsistencies. Moreover, the proposed model has an exceptional AUC-ROC score of 0.96, significantly outperforming Custom CNN (0.89) and Inception-GRU (0.90) and performing on par with ViT-B/16 (0.96).</p>
<p>As compared to the hybrid ViT and EfficientNetB4, which are tuned for this problem, DAAL-Net exhibits the best overall performance, surpassing their accuracy as well as their corresponding class-level metrics. Although ViT, with its regularization, achieves an accuracy of 91.6%, EfficientNetB4 reaches a peak accuracy of 90.6%. On the contrary, DAAL-Net achieves a superior accuracy of 93.2% with high precision, recall, as well as an F1-score of 92.8%. This indicates that predictions are well-balanced for both classes. Unlike earlier hybrid temporal models, which are subject to a class imbalance problem, DAAL-Net successfully utilizes spatial&#x2013;temporal information to avert misclassifications. Therefore, it is safe to conclude that DAAL-Net outperforms ViT as well as EfficientNetB4, becoming the best-performing as well as computationally efficient model.</p>
<p>The 95% confidence intervals (CI) for accuracy, precision, recall, and F1-score are computed to assess model reliability. The regularized CNN model achieved an accuracy of 75.8% (95% CI: 74.0&#x2013;77.6), with precision between 74.1 and 78.1, recall between 73.5 and 77.5, and an F1-score range of 73.8&#x2013;77.8. The Xception model, following retraining, demonstrated stable convergence with an accuracy of 64.5% (95% CI: 62.4&#x2013;66.6), precision between 62.5 and 66.5, and an F1-score between 62.8 and 66.2. ResNet50 demonstrated the strongest performance among image-based baselines, with an accuracy of 87.2% (95% CI: 85.8&#x2013;88.6), precision ranging from 86.0&#x2013;88.8, recall from 85.7 to 88.5, and an F1-score from 85.9 to 88.5. Lastly, the Inception-GRU video model showed improved temporal learning with an overall accuracy of 84.2% (95% CI: 78.2&#x2013;90.2). The proposed DAAL-Net model achieved the highest overall performance with an accuracy of 93.2% (95% CI: 90.4&#x2013;96.0); unlike earlier trials, the model exhibited balanced predictions, with precision (92.7%), recall (92.9%), and F1-score (92.8%) confidence intervals all clearly separated from the baselines, validating its robustness.</p>
<p><xref ref-type="fig" rid="fig15">Figure 15</xref> demonstrates the discriminative capacity of the models as well as the stability of the decision thresholds through ROC and PR curves. Amongst the architectures, DAAL-Net has the best discriminative capacity, achieving an exceptional AUC ROC of 0.96 and an AUC PR of 0.97, signifying a very reliable distinction between real and fake images even at strict decision thresholds. Transformer-based architectures also have high discriminative capacity, where ViT-B/16 has a competitive AUC ROC of 0.96 and an AUC PR of 0.96. Regularized ResNet50 has robust discriminative capacity, achieving an AUC ROC of 0.94 and an AUC PR of 0.94, which is a significant improvement over previous baselines. Even the lightweight CNN and Xception architectures have effective discriminative capacity, as demonstrated through AUC achieving 0.89 to 0.92. Results of the calibration of the models through reliability diagrams demonstrate that DAAL-Net has the best accuracy in confidence, where the confidence values are very close to the ideal diagonal, while the CNN-based architectures are under-calibrated, indicating a tendency towards overly confident probability values.</p>
<fig position="float" id="fig15">
<label>Figure 15</label>
<caption>
<p>AUC-ROC, AUC-PR results.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g015.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Three side-by-side data visualizations compare machine learning models: the left ROC curve shows DAAL-Net, ViT-B/16, and EfficientNet B4 performing best with AUC around zero point nine six; the middle Precision-Recall curve reflects similar trends; the right reliability diagram demonstrates model calibration, with DAAL-Net and ViT-B/16 closely following the diagonal, suggesting better probability calibration.</alt-text>
</graphic>
</fig>
<p>As presented in <xref ref-type="table" rid="tab7">Table 7</xref>, the efficiency of DAAL-Net is further confirmed to be superior to that of heavier models. For example, ViT-B/16 uses 86.4 million parameters, which is significantly lower than that of our model, which uses only 27.2 million parameters, a reduction of 68%. In addition, with an inference latency of 14.1&#x202F;ms, our model is 1.7x faster than ViT and 1.4x faster than EfficientNet-B4.</p>
<table-wrap position="float" id="tab7">
<label>Table 7</label>
<caption>
<p>Computational complexity analysis.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">Parameters (M)</th>
<th align="center" valign="top">FLOPs (G)</th>
<th align="center" valign="top">Inference latency (ms)</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ResNet50</td>
<td align="center" valign="top">25.6</td>
<td align="center" valign="top">4.1</td>
<td align="center" valign="top">9.2</td>
</tr>
<tr>
<td align="left" valign="top">EfficientNetB4</td>
<td align="center" valign="top">19.3</td>
<td align="center" valign="top">4.4</td>
<td align="center" valign="top">19.5</td>
</tr>
<tr>
<td align="left" valign="top">ViT-B/16</td>
<td align="center" valign="top">86.4</td>
<td align="center" valign="top">17.6</td>
<td align="center" valign="top">24.8</td>
</tr>
<tr>
<td align="left" valign="top">DAAL-Net</td>
<td align="center" valign="top"><bold>27.2</bold></td>
<td align="center" valign="top"><bold>5.3</bold></td>
<td align="center" valign="top"><bold>14.1</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best performance results for each respective metric (highest AUC/Accuracy and lowest computational complexity).</p>
</table-wrap-foot>
</table-wrap>
<p>DAAL-Net, which is trained only on Celeb-DF(v2), is evaluated on FaceForensics++ (c23) and Deepfake Detection Challenge (DFDC) without fine-tuning. Although the performance is expected to degrade due to the domain shift, the model showed robustness, as shown in <xref ref-type="table" rid="tab8">Table 8</xref>. On FaceForensics++, the model reported an accuracy of 81.3% (AUC 0.82), and on the heavily augmented DFDC dataset, it reported 75.6% accuracy (AUC 0.76). The F1-scores being close to 75&#x2013;80% reaffirm that DAAL-Net effectively identifies essential, dataset-agnostic temporal anomalies, thereby establishing its applicability in the real world (<xref ref-type="fig" rid="fig16">Algorithm 1</xref>).</p>
<table-wrap position="float" id="tab8">
<label>Table 8</label>
<caption>
<p>Cross-dataset generalization performance of DAAL-net.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model</th>
<th align="center" valign="top">Accuracy</th>
<th align="center" valign="top">AUC-ROC</th>
<th align="center" valign="top">Precision</th>
<th align="center" valign="top">Recall</th>
<th align="center" valign="top">F1-score</th>
<th align="center" valign="top">ECE</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Celeb-DF</td>
<td align="center" valign="top">93.1%</td>
<td align="center" valign="top">0.96</td>
<td align="center" valign="top">0.94</td>
<td align="center" valign="top">0.95</td>
<td align="center" valign="top">0.945</td>
<td align="center" valign="top">0.05</td>
</tr>
<tr>
<td align="left" valign="top">FaceForensics++</td>
<td align="center" valign="top">81.3%</td>
<td align="center" valign="top">0.82</td>
<td align="center" valign="top">0.79</td>
<td align="center" valign="top">0.80</td>
<td align="center" valign="top">0.795</td>
<td align="center" valign="top">0.12</td>
</tr>
<tr>
<td align="left" valign="top">DFDC</td>
<td align="center" valign="top">75.6%</td>
<td align="center" valign="top">0.76</td>
<td align="center" valign="top">0.74</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="top">0.745</td>
<td align="center" valign="top">0.18</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig position="float" id="fig16">
<label>ALGORITHM 1</label>
<caption>
<p>DAAL-net.</p>
</caption>
<graphic xlink:href="frai-09-1737761-g016.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Algorithm steps for a video or image processing model, detailing input selection, spatial feature extraction, temporal embedding through GRU with attention, feature fusion, output computation via softmax, loss calculation, and iterative training until convergence.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec18">
<label>5</label>
<title>Conclusion and future work</title>
<p>This study comparatively analyzed different spatial and spatial&#x2013;temporal deepfake detection models and their differences with regard to robustness and generalization. Although CNN and Xception model results are unstable at first, retraining with regularization proved the feasibility of these light-weight model approaches. The robustness of the ResNet50 model for spatial discriminability is extremely high, almost similar to the transformer model. The ViT and EfficientNetB4 model results are extremely accurate with stable classes. The proposed DAAL-Net model outperformed all the above models with regard to balanced accuracy, F1-score, and AUC (0.96) with the combination of high-frequency spatial artifacts and the proposed dual attention temporal modeling. In addition, zero-shot cross-dataset evaluation proved the robustness of the DAAL-Net model as it demonstrated extremely high resilience on unseen datasets, achieving 81.3% accuracy on FaceForensics++ and 75.6% on the extremely augmented Deepfake Detection Challenge (DFDC). A limitation of the present study is its focus on subtle deepfake images with low motion and compression; future work should address this by increasing dataset diversity to improve generalization.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec19">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="sec20">
<title>Author contributions</title>
<p>SP: Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. SK: Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. SM: Conceptualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. KR: Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. MI: Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. VG: Writing &#x2013; review &#x0026; editing, Writing &#x2013; original draft.</p>
</sec>
<sec sec-type="COI-statement" id="sec21">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec22">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec23">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Adhinata</surname><given-names>F. D.</given-names></name> <name><surname>Rakhmadani</surname><given-names>D. P.</given-names></name> <name><surname>Wibowo</surname><given-names>M.</given-names></name> <name><surname>Jayadi</surname><given-names>A.</given-names></name></person-group> (<year>2021</year>). <article-title>A deep learning using DenseNet201 to detect masked or non-masked face</article-title>. <source>JUITA: J. Info.</source> <volume>9</volume>, <fpage>115</fpage>&#x2013;<lpage>121</lpage>. doi: <pub-id pub-id-type="doi">10.30595/juita.v9i1.9624</pub-id></mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Agarwal</surname><given-names>S.</given-names></name> <name><surname>Farid</surname><given-names>H.</given-names></name> <name><surname>Gu</surname><given-names>Y.</given-names></name> <name><surname>He</surname><given-names>M.</given-names></name> <name><surname>Nagano</surname><given-names>K.</given-names></name> <name><surname>Li</surname><given-names>H.</given-names></name></person-group>, &#x201C;<chapter-title>Protecting world leaders against deep fakes</chapter-title>,&#x201D; In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR) Workshops</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>, <year>2019</year>.</mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Albahar</surname><given-names>M.</given-names></name> <name><surname>Almalki</surname><given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>Deepfakes: threats and countermeasures systematic review</article-title>. <source>J. Theor. Appl. Inf. Technol.</source> <volume>97</volume>, <fpage>3242</fpage>&#x2013;<lpage>3250</lpage>.</mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Albawi</surname><given-names>S.</given-names></name> <name><surname>Mohammed</surname><given-names>T. A.</given-names></name> <name><surname>Al-Zawi</surname><given-names>S.</given-names></name></person-group>, <year>2017</year> &#x201C;<chapter-title>Understanding of a convolutional neural network</chapter-title>,&#x201D; in <conf-name>2017 International Conference on Engineering and Technology (ICET)</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>.</mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alkurdi</surname><given-names>D. A.</given-names></name> <name><surname>Cevik</surname><given-names>M.</given-names></name> <name><surname>Akgundogdu</surname><given-names>A.</given-names></name></person-group> (<year>2024</year>). <article-title>Advancing deepfake detection using Xception architecture: a robust approach for safeguarding against fabricated news on social media</article-title>. <source>Comput. Mater. Contin.</source> <volume>81</volume>, <fpage>4285</fpage>&#x2013;<lpage>4305</lpage>. doi: <pub-id pub-id-type="doi">10.32604/cmc.2024.057029</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Al-Sabaawi</surname><given-names>A.</given-names></name> <name><surname>Ibrahim</surname><given-names>H. M.</given-names></name> <name><surname>Arkah</surname><given-names>Z. M.</given-names></name> <name><surname>Al-Amidie</surname><given-names>M.</given-names></name> <name><surname>Alzubaidi</surname><given-names>L.</given-names></name></person-group> (<year>2020</year>). &#x201C;<chapter-title>Amended convolutional neural network with global average pooling for image classification</chapter-title>&#x201D; in <source>Intelligent Systems Design and Applications</source>. in <source>Int. Conf. Intelligent Systems Design and Applications</source>, (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>) <fpage>171</fpage>&#x2013;<lpage>180</lpage>.</mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Amari</surname><given-names>S. I.</given-names></name></person-group> (<year>1993</year>). <article-title>Backpropagation and stochastic gradient descent method</article-title>. <source>Neurocomputing</source> <volume>5</volume>, <fpage>185</fpage>&#x2013;<lpage>196</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0925-2312(93)90006-o</pub-id></mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Arrieta</surname><given-names>A. B.</given-names></name> <name><surname>D&#x00ED;az-Rodr&#x00ED;guez</surname><given-names>N.</given-names></name> <name><surname>Ser</surname><given-names>J.</given-names></name> <name><surname>Bennetot</surname><given-names>A.</given-names></name> <name><surname>Tabik</surname><given-names>S.</given-names></name> <name><surname>Barbado</surname><given-names>A.</given-names></name></person-group> (<year>2020</year>). <article-title>Explainable artificial intelligence (XAI): concepts, taxonomies, opportunities and challenges toward responsible AI</article-title>. <source>Inf. Fusion</source> <volume>58</volume>, <fpage>82</fpage>&#x2013;<lpage>115</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.inffus.2019.12.012</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Bansal</surname><given-names>A.</given-names></name> <name><surname>Ma</surname><given-names>S.</given-names></name> <name><surname>Ramanan</surname><given-names>D.</given-names></name> <name><surname>Sheikh</surname><given-names>Y.</given-names></name></person-group>, &#x201C;<chapter-title>Recycle-GAN: unsupervised video retargeting</chapter-title>,&#x201D; in <conf-name>Computer Vision &#x2013; ECCV 2018: 15th European Conference</conf-name>, <year>2018</year>. <publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhandari</surname><given-names>M.</given-names></name> <name><surname>Neupane</surname><given-names>A.</given-names></name> <name><surname>Mallik</surname><given-names>S.</given-names></name> <name><surname>Gaur</surname><given-names>L.</given-names></name> <name><surname>Qin</surname><given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>Auguring fake face images using dual input convolution neural network</article-title>. <source>J. Imaging</source> <volume>9</volume>:<fpage>3</fpage>. doi: <pub-id pub-id-type="doi">10.3390/jimaging9010003</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bisong</surname><given-names>E.</given-names></name></person-group> (<year>2019</year>). <source>Building machine learning and deep learning models on Google cloud platform: a comprehensive guide for beginners</source>. <publisher-loc>Berkeley, CA</publisher-loc>: <publisher-name>Apress</publisher-name>.</mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Bracewell</surname><given-names>R.</given-names></name></person-group> (<year>1999</year>). &#x201C;<chapter-title>Convolution and two-dimensional convolution</chapter-title>&#x201D; in <source>The Fourier transform and its applications</source> (<publisher-loc>New York, NY</publisher-loc>: <publisher-name>McGraw-Hill</publisher-name>).</mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bradski</surname><given-names>G.</given-names></name> <name><surname>Kaehler</surname><given-names>A.</given-names></name></person-group> (<year>2000</year>). <article-title>OpenCV</article-title>. <source>Dr. Dobb&#x2019;s J. Softw. Tools</source> <volume>120</volume>, <fpage>122</fpage>&#x2013;<lpage>125</lpage>.</mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chesney</surname><given-names>B.</given-names></name> <name><surname>Citron</surname><given-names>D.</given-names></name></person-group> (<year>2019</year>). <article-title>Deep fakes: a looming challenge for privacy, democracy, and national security</article-title>. <source>Calif. Law Rev.</source> <volume>107</volume>:<fpage>1753</fpage>. doi: <pub-id pub-id-type="doi">10.2139/ssrn.3213954</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Chollet</surname><given-names>F.</given-names></name></person-group>. &#x201C;<chapter-title>Xception: deep learning with depthwise separable convolutions</chapter-title>,&#x201D; in <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>, <year>2017</year>.</mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Coccomini</surname><given-names>D. A.</given-names></name> <name><surname>Messina</surname><given-names>N.</given-names></name> <name><surname>Gennaro</surname><given-names>C.</given-names></name> <name><surname>Falchi</surname><given-names>F.</given-names></name></person-group>, &#x201C;<chapter-title>Combining EfficientNet and vision transformers for video deepfake detection</chapter-title>,&#x201D; in <conf-name>Proceedings of the Image Analysis and Processing&#x2013;ICIAP 2022: 21st International Conference</conf-name>, <publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>, <year>2022</year></mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Davidson</surname><given-names>T. R.</given-names></name> <name><surname>Falorsi</surname><given-names>L.</given-names></name> <name><surname>De Cao</surname><given-names>N.</given-names></name> <name><surname>Kipf</surname><given-names>T.</given-names></name> <name><surname>Tomczak</surname><given-names>J. M.</given-names></name></person-group> (<year>2018</year>). <article-title>Hyperspherical variational auto-encoders</article-title>. <source>arXiv</source>.</mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Dey</surname><given-names>R.</given-names></name> <name><surname>Salem</surname><given-names>F. M.</given-names></name></person-group> <year>2017</year>, &#x201C;<chapter-title>Gate-variants of gated recurrent unit (GRU) neural networks</chapter-title>,&#x201D; in <conf-name>2017 IEEE 60th International Midwest Symposium on Circuits and Systems (MWSCAS)</conf-name>. <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Emara</surname><given-names>N. M.</given-names></name> <name><surname>Elagamy</surname><given-names>M. N.</given-names></name></person-group>. <year>2024</year>. <chapter-title>DeepStream-X: a two-stream deepfake detection framework using spatiotemporal and frequency features</chapter-title>. In <conf-name>2024 34th International Conference on Computer Theory and Applications (ICCTA)</conf-name>. <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Gao</surname><given-names>Y.</given-names></name> <name><surname>Glowacka</surname><given-names>D.</given-names></name></person-group>, &#x201C;<chapter-title>Deep gate recurrent neural network</chapter-title>,&#x201D; in <conf-name>8th Asian Conference on Machine Learning</conf-name>, <publisher-loc>Hamilton</publisher-loc>: <publisher-name>PMLR</publisher-name>, <year>2016</year>.</mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gu</surname><given-names>S.</given-names></name> <name><surname>Pednekar</surname><given-names>M.</given-names></name> <name><surname>Slater</surname><given-names>R.</given-names></name></person-group> (<year>2019</year>). <article-title>Improve image classification using data augmentation and neural networks</article-title>. <source>SMU Data Sci. Rev.</source> <volume>2</volume>:<fpage>1</fpage>.</mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guarnera</surname><given-names>L.</given-names></name> <name><surname>Giudice</surname><given-names>O.</given-names></name> <name><surname>Battiato</surname><given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>Fighting deepfake by exposing the convolutional traces on images</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>165085</fpage>&#x2013;<lpage>165098</lpage>. doi: <pub-id pub-id-type="doi">10.1109/access.2020.3023037</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jeczmionek</surname><given-names>E.</given-names></name> <name><surname>Kowalski</surname><given-names>P. A.</given-names></name></person-group> (<year>2021</year>). <article-title>Flattening layer pruning in convolutional neural networks</article-title>. <source>Symmetry</source> <volume>13</volume>:<fpage>1147</fpage>. doi: <pub-id pub-id-type="doi">10.3390/sym13071147</pub-id></mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Jolly</surname><given-names>V.</given-names></name> <name><surname>Telrandhe</surname><given-names>M.</given-names></name> <name><surname>Kasat</surname><given-names>A.</given-names></name> <name><surname>Shitole</surname><given-names>A.</given-names></name> <name><surname>Gawande</surname><given-names>K.</given-names></name></person-group>, &#x201C;<chapter-title>CNN-based deep learning model for deepfake detection</chapter-title>,&#x201D; in <conf-name>2022 2nd Asian Conference on Innovation in Technology (ASIANCON)</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>, <year>2022</year>.</mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Khan</surname><given-names>R. U.</given-names></name> <name><surname>Zhang</surname><given-names>X.</given-names></name> <name><surname>Kumar</surname><given-names>R.</given-names></name> <name><surname>Aboagye</surname><given-names>E. O.</given-names></name></person-group>, <year>2018</year> &#x201C;<chapter-title>Evaluating the performance of ResNet model based on image recognition</chapter-title>,&#x201D; in <conf-name>Proceedings of the 2018 International Conference on Computing and Artificial Intelligence</conf-name>, <publisher-loc>New York, NY</publisher-loc>: <publisher-name>ACM</publisher-name></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kietzmann</surname><given-names>J.</given-names></name> <name><surname>Lee</surname><given-names>L. W.</given-names></name> <name><surname>McCarthy</surname><given-names>I. P.</given-names></name> <name><surname>Kietzmann</surname><given-names>T. C.</given-names></name></person-group> (<year>2020</year>). <article-title>Deepfakes: trick or treat?</article-title> <source>Bus. Horiz.</source> <volume>63</volume>, <fpage>135</fpage>&#x2013;<lpage>146</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bushor.2019.11.006</pub-id></mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Koonce</surname><given-names>B.</given-names></name></person-group> (<year>2021</year>). <source>Convolutional neural networks with swift for TensorFlow: image recognition and dataset categorization</source>. <publisher-loc>Berkeley, CA</publisher-loc>: <publisher-name>Apress</publisher-name>.</mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kuo</surname><given-names>C. C. J.</given-names></name></person-group> (<year>2016</year>). <article-title>Understanding convolutional neural networks with a mathematical model</article-title>. <source>J. Vis. Commun. Image Represent.</source> <volume>41</volume>, <fpage>406</fpage>&#x2013;<lpage>413</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jvcir.2016.11.003</pub-id></mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kute</surname><given-names>D. V.</given-names></name></person-group> (<year>2022</year>). <source>Explainable deep learning approach for detecting money laundering transactions in banking system</source>, <comment>Ph.D. dissertation</comment>. <publisher-loc>Sydney</publisher-loc>: <publisher-name>University of Technology</publisher-name>.</mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>S.</given-names></name> <name><surname>Dutta</surname><given-names>V.</given-names></name> <name><surname>He</surname><given-names>X.</given-names></name> <name><surname>Matsumaru</surname><given-names>T.</given-names></name></person-group> (<year>2022</year>). <article-title>Deep learning based one-class detection system for fake faces generated by GAN network</article-title>. <source>Sensors</source> <volume>22</volume>:<fpage>7767</fpage>. doi: <pub-id pub-id-type="doi">10.3390/s22207767</pub-id>, <pub-id pub-id-type="pmid">36298117</pub-id></mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>Y.</given-names></name> <name><surname>Yang</surname><given-names>X.</given-names></name> <name><surname>Sun</surname><given-names>P.</given-names></name> <name><surname>Qi</surname><given-names>H.</given-names></name> <name><surname>Lyu</surname><given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>Celeb-DF (v2): a new dataset for DeepFake forensics</article-title>. <source>arXiv</source>.</mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Li</surname><given-names>Y.</given-names></name> <name><surname>Yang</surname><given-names>X.</given-names></name> <name><surname>Sun</surname><given-names>P.</given-names></name> <name><surname>Qi</surname><given-names>H.</given-names></name> <name><surname>Lyu</surname><given-names>S.</given-names></name></person-group>. <year>2020</year>. <chapter-title>Celeb-DF: a large-scale challenging dataset for deepfake forensics</chapter-title>. In <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name></mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Mascarenhas</surname><given-names>S.</given-names></name> <name><surname>Agarwal</surname><given-names>M.</given-names></name></person-group>, &#x201C;<chapter-title>A comparison between VGG16, VGG19 and ResNet50 architecture frameworks for image classification</chapter-title>,&#x201D; in <conf-name>2021 International Conference on Disruptive Technologies for Multi-Disciplinary Research and Applications (CENTCON)</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Masood</surname><given-names>M.</given-names></name> <name><surname>Nawaz</surname><given-names>M.</given-names></name> <name><surname>Javed</surname><given-names>A.</given-names></name> <name><surname>Nazir</surname><given-names>T.</given-names></name> <name><surname>Mehmood</surname><given-names>A.</given-names></name> <name><surname>Mahum</surname><given-names>R.</given-names></name></person-group>, &#x201C;<chapter-title>Classification of deepfake videos using pre-trained convolutional neural networks</chapter-title>,&#x201D; in <conf-name>2021 International Conference on Digital Futures and Transformative Technologies (ICoDT2)</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>, <year>2021</year>.</mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Medsker</surname><given-names>L. R.</given-names></name> <name><surname>Jain</surname><given-names>L. C.</given-names></name></person-group> (<year>2001</year>). <source>Recurrent neural networks: design and applications</source>. <publisher-loc>Boca Raton, FL</publisher-loc>: <publisher-name>CRC Press</publisher-name>.</mixed-citation></ref>
<ref id="ref36"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mirsky</surname><given-names>Y.</given-names></name> <name><surname>Brodt</surname><given-names>O.</given-names></name> <name><surname>Cohen</surname><given-names>J.</given-names></name> <name><surname>Levy</surname><given-names>R.</given-names></name> <name><surname>Blokh</surname><given-names>I.</given-names></name></person-group> (<year>2019</year>). <article-title>Live DeepFake</article-title>. <source>J. Creat. Gans AI.</source></mixed-citation></ref>
<ref id="ref37"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nguyen</surname><given-names>T. T.</given-names></name> <name><surname>Nguyen</surname><given-names>Q. V. H.</given-names></name> <name><surname>Nguyen</surname><given-names>D. T.</given-names></name> <name><surname>Huynh-The</surname><given-names>T.</given-names></name> <name><surname>Nahavandi</surname><given-names>S.</given-names></name> <name><surname>Pham</surname><given-names>Q.-V.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Deep learning for deepfakes creation and detection: a survey</article-title>. <source>Comput. Vis. Image Underst.</source> <volume>223</volume>:<fpage>103525</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cviu.2022.103525</pub-id></mixed-citation></ref>
<ref id="ref38"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Pokroy</surname><given-names>A. A.</given-names></name> <name><surname>Egorov</surname><given-names>A. D.</given-names></name></person-group>. <year>2021</year>, &#x201C;<chapter-title>EfficientNets for deepfake detection: comparison of pretrained models</chapter-title>.&#x201D; In <conf-name>2021 IEEE Conference of Russian Young Researchers in Electrical and Electronic Engineering (ElConRus)</conf-name>. <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name></mixed-citation></ref>
<ref id="ref39"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Raschka</surname><given-names>S.</given-names></name></person-group> (<year>2015</year>). <source>Python machine learning</source>. <publisher-loc>Birmingham</publisher-loc>: <publisher-name>Packt Publishing Ltd.</publisher-name></mixed-citation></ref>
<ref id="ref40"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rigatti</surname><given-names>S. J.</given-names></name></person-group> (<year>2017</year>). <article-title>Random forest</article-title>. <source>J. Insur. Med.</source> <volume>47</volume>, <fpage>31</fpage>&#x2013;<lpage>39</lpage>. doi: <pub-id pub-id-type="doi">10.17849/insm-47-01-31-39.1</pub-id>, <pub-id pub-id-type="pmid">28836909</pub-id></mixed-citation></ref>
<ref id="ref41"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rossler</surname><given-names>A.</given-names></name> <name><surname>Cozzolino</surname><given-names>D.</given-names></name> <name><surname>Verdoliva</surname><given-names>L.</given-names></name> <name><surname>Riess</surname><given-names>C.</given-names></name> <name><surname>Thies</surname><given-names>J.</given-names></name> <name><surname>Nie&#x00DF;ner</surname><given-names>M.</given-names></name></person-group> (<year>2019</year>). &#x201C;<chapter-title>Faceforensics++: learning to detect manipulated facial images</chapter-title>&#x201D; in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>).</mixed-citation></ref>
<ref id="ref42"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Scherer</surname><given-names>D.</given-names></name> <name><surname>M&#x00FC;ller</surname><given-names>A.</given-names></name> <name><surname>Behnke</surname><given-names>S.</given-names></name></person-group> (<year>2010</year>). &#x201C;<chapter-title>Evaluation of pooling operations in convolutional architectures for object recognition</chapter-title>&#x201D; in <source>Artificial Neural Networks &#x2013; ICANN 2010. ICANN 2010. Lecture Notes in Computer Science</source>. eds. <person-group person-group-type="editor"><name><surname>Diamantaras</surname><given-names>K.</given-names></name> <name><surname>Duch</surname><given-names>W.</given-names></name> <name><surname>Iliadis</surname><given-names>L. S.</given-names></name></person-group> (<publisher-loc>Berlin</publisher-loc>: <publisher-name>Springer</publisher-name>).</mixed-citation></ref>
<ref id="ref43"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Solaiyappan</surname><given-names>S.</given-names></name> <name><surname>Wen</surname><given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Machine learning-based medical image deepfake detection: a comparative study</article-title>. <source>Mach. Learn. Appl.</source> <volume>8</volume>:<fpage>100298</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.mlwa.2022.100298</pub-id></mixed-citation></ref>
<ref id="ref44"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Song</surname><given-names>Y. Y.</given-names></name> <name><surname>Lu</surname><given-names>Y.</given-names></name></person-group> (<year>2015</year>). <article-title>Decision tree methods: applications for classification and prediction</article-title>. <source>Shanghai Arch. Psychiatry</source> <volume>27</volume>, <fpage>130</fpage>&#x2013;<lpage>135</lpage>. doi: <pub-id pub-id-type="doi">10.11919/j.issn.1002-0829.215044</pub-id></mixed-citation></ref>
<ref id="ref45"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Staudemeyer</surname><given-names>R. C.</given-names></name> <name><surname>Morris</surname><given-names>E. R.</given-names></name></person-group> (<year>2019</year>). <article-title>Understanding LSTM&#x2014;a tutorial into long short-term memory recurrent neural networks</article-title>. <source>arXiv</source>.</mixed-citation></ref>
<ref id="ref46"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Suthaharan</surname><given-names>S.</given-names></name></person-group> (<year>2016</year>). <source>Machine learning models and algorithms for big data classification: thinking with examples for effective learning</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>.</mixed-citation></ref>
<ref id="ref47"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Verdoliva</surname><given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>Media forensics and deepfakes: an overview</article-title>. <source>IEEE J. Sel. Top. Signal Process.</source> <volume>14</volume>, <fpage>910</fpage>&#x2013;<lpage>932</lpage>. doi: <pub-id pub-id-type="doi">10.1109/jstsp.2020.3002101</pub-id></mixed-citation></ref>
<ref id="ref48"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Weisstein</surname><given-names>E. W.</given-names></name></person-group>, &#x201C;Convolution,&#x201D; MathWorld, <year>2003</year>. Available online at: <ext-link xlink:href="https://mathworld.wolfram.com/Convolution.html" ext-link-type="uri">https://mathworld.wolfram.com/Convolution.html</ext-link> (Accessed February 10, 2024).</mixed-citation></ref>
<ref id="ref49"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname><given-names>J.</given-names></name></person-group> (<year>2017</year>). <article-title>Introduction to convolutional neural networks</article-title>. <source>Natl. Key Lab. Novel Softw. Technol. Nanjing Univ.</source> <volume>5</volume>:<fpage>495</fpage>.</mixed-citation></ref>
<ref id="ref50"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xia</surname><given-names>Z.</given-names></name> <name><surname>Qiao</surname><given-names>T.</given-names></name> <name><surname>Xu</surname><given-names>M.</given-names></name> <name><surname>Wu</surname><given-names>X.</given-names></name> <name><surname>Han</surname><given-names>L.</given-names></name> <name><surname>Chen</surname><given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Deepfake video detection based on MesoNet with preprocessing module</article-title>. <source>Symmetry</source> <volume>14</volume>:<fpage>939</fpage>. doi: <pub-id pub-id-type="doi">10.3390/sym14050939</pub-id></mixed-citation></ref>
<ref id="ref51"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Yadav</surname><given-names>D.</given-names></name> <name><surname>Salmani</surname><given-names>S.</given-names></name></person-group>, <year>2019</year> &#x201C;<chapter-title>Deepfake: a survey on facial forgery technique using generative adversarial network</chapter-title>,&#x201D; in <conf-name>2019 International Conference on Intelligent Computing and Control Systems (ICCS)</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>.</mixed-citation></ref>
<ref id="ref52"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yan</surname><given-names>Z.</given-names></name> <name><surname>Zhang</surname><given-names>Y.</given-names></name> <name><surname>Yuan</surname><given-names>X.</given-names></name> <name><surname>Lyu</surname><given-names>S.</given-names></name> <name><surname>Wu</surname><given-names>B.</given-names></name></person-group> (<year>2023</year>). <article-title>Deepfakebench: a comprehensive benchmark of deepfake detection</article-title>. <source>arXiv</source>.</mixed-citation></ref>
<ref id="ref53"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Yegnanarayana</surname><given-names>B.</given-names></name></person-group> (<year>2009</year>). <source>Artificial neural networks</source>. <publisher-loc>Delhi</publisher-loc>: <publisher-name>PHI Learning Pvt. Ltd.</publisher-name></mixed-citation></ref>
<ref id="ref54"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Yesugade</surname><given-names>K.</given-names></name> <name><surname>Jadhav</surname><given-names>R.</given-names></name></person-group>, &#x201C;<chapter-title>Implementation of deep learning techniques for deepfake classification: a comparative study using ResNet-50 and VGG16</chapter-title>,&#x201D; in <conf-name>Proceedings 2024 IEEE Pune Section International Conference (PuneCon)</conf-name>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name>, <year>2024</year>.</mixed-citation></ref>
<ref id="ref55"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname><given-names>Q.</given-names></name> <name><surname>Zhu</surname><given-names>W.</given-names></name> <name><surname>Li</surname><given-names>F.</given-names></name> <name><surname>Yuan</surname><given-names>M.</given-names></name> <name><surname>Zheng</surname><given-names>L.</given-names></name> <name><surname>Liu</surname><given-names>X.</given-names></name></person-group> (<year>2022</year>). <article-title>Transfer learning of the ResNet-18 and DenseNet-121 model used to diagnose intracranial hemorrhage in CT scanning</article-title>. <source>Curr. Pharm. Des.</source> <volume>28</volume>, <fpage>287</fpage>&#x2013;<lpage>295</lpage>. doi: <pub-id pub-id-type="doi">10.2174/1381612827666211213143357</pub-id>, <pub-id pub-id-type="pmid">34961458</pub-id></mixed-citation></ref>
<ref id="ref56"><mixed-citation publication-type="confproc"><person-group person-group-type="author"><name><surname>Zhu</surname><given-names>B.</given-names></name> <name><surname>Fang</surname><given-names>H.</given-names></name> <name><surname>Sui</surname><given-names>Y.</given-names></name> <name><surname>Li</surname><given-names>L.</given-names></name></person-group>, &#x201C;<chapter-title>Deepfakes for medical video de-identification: privacy protection and diagnostic information preservation</chapter-title>,&#x201D; in <conf-name>Proceedings of the AAAI/ACM Conference on AI, Ethics, and Society</conf-name>, <year>2020</year>, <publisher-loc>Piscataway, NJ</publisher-loc>: <publisher-name>IEEE</publisher-name></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3080138/overview">Feng Ding</ext-link>, Nanchang University, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3323824/overview">Vakdevi Vallabhaneni</ext-link>, Software Developer, India</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3351868/overview">Govind Murari Upadhyay</ext-link>, Manipal University Jaipur, India</p>
</fn>
</fn-group>
</back>
</article>