<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="review-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Big Data</journal-id>
<journal-title>Frontiers in Big Data</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Big Data</abbrev-journal-title>
<issn pub-type="epub">2624-909X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdata.2024.1400024</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Big Data</subject>
<subj-group>
<subject>Review</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Deepfake: definitions, performance metrics and standards, datasets, and a meta-review</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Altuncu</surname> <given-names>Enes</given-names></name>
<uri xlink:href="http://loop.frontiersin.org/people/2683908/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Franqueira</surname> <given-names>Virginia N. L.</given-names></name>
<uri xlink:href="http://loop.frontiersin.org/people/2823905/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Li</surname> <given-names>Shujun</given-names></name>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2668177/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff><institution>Institute of Cyber Security for Society (iCSS) &#x00026; School of Computing, University of Kent</institution>, <addr-line>Canterbury</addr-line>, <country>United Kingdom</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Nikolaos Pitropakis, Edinburgh Napier University, United Kingdom</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Riccardo Spolaor, Shandong University, China</p>
<p>Christos Chrysoulas, Edinburgh Napier University, United Kingdom</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Shujun Li <email>S.J.Li&#x00040;kent.ac.uk</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>04</day>
<month>09</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>7</volume>
<elocation-id>1400024</elocation-id>
<history>
<date date-type="received">
<day>12</day>
<month>03</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>08</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Altuncu, Franqueira and Li.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Altuncu, Franqueira and Li</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Recent advancements in AI, especially deep learning, have contributed to a significant increase in the creation of new realistic-looking synthetic media (video, image, and audio) and manipulation of existing media, which has led to the creation of the new term &#x0201C;deepfake.&#x0201D; Based on both the research literature and resources in English, this paper gives a comprehensive overview of deepfake, covering multiple important aspects of this emerging concept, including (1) different definitions, (2) commonly used performance metrics and standards, and (3) deepfake-related datasets. In addition, the paper also reports a meta-review of 15 selected deepfake-related survey papers published since 2020, focusing not only on the mentioned aspects but also on the analysis of key challenges and recommendations. We believe that this paper is the most comprehensive review of deepfake in terms of the aspects covered.</p></abstract>
<kwd-group>
<kwd>deepfake</kwd>
<kwd>survey</kwd>
<kwd>definition</kwd>
<kwd>datasets</kwd>
<kwd>standards</kwd>
<kwd>performance metrics</kwd>
</kwd-group>
<contract-sponsor id="cn001">Defence Science and Technology Laboratory<named-content content-type="fundref-id">10.13039/100010418</named-content></contract-sponsor>
<counts>
<fig-count count="1"/>
<table-count count="7"/>
<equation-count count="17"/>
<ref-count count="136"/>
<page-count count="23"/>
<word-count count="18895"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Cybersecurity and Privacy</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Recent advancements in AI and machine learning have increased the capability to produce more realistic media, e.g., video, image, and audio. In particular, state-of-the-art deep learning methods have enabled the generation of &#x0201C;deepfakes,&#x0201D; manipulated or synthetic media the realness of which is not easily recognisable by the human eye. Although deepfake is a relatively new phenomenon (having first appeared at the end of 2017), its growth has been remarkable. According to the 2019 and 2020 Deeptrace (now, Sensity) reports on the state of deepfake (Ajder et al., <xref ref-type="bibr" rid="B2">2019</xref>), the number of deepfake videos on the English-speaking internet grew from 7,964 (December 2018) to 14,678 (July 2019) to 85,047 (December 2020), representing a 968% increase from 2018 to 2020. By 2024, the number of available tools for deepfake generation has reached over 10,000 (Sensity, <xref ref-type="bibr" rid="B88">2024</xref>).</p>
<p>In this work, we review the existing deepfake-related research ecosystem in terms of various aspects, including performance metrics, standards, and datasets. Furthermore, we provide a meta-review of 15 selected deepfake-related survey papers which covers several additional aspects other than the mentioned ones in a systematic manner, such as performance comparison, key challenges, and recommendations.</p>
<p>Despite being a hugely popular term, there is a lack of consensus on the definition of &#x0201C;deepfake&#x0201D; and the boundary between deepfakes and non-deepfakes is not clear cut. For this survey, we adopt a relatively more inclusive approach to cover all forms of manipulated or synthetic media that are considered deepfakes in a broader sense. We also cover closely related topics including biometrics and multimedia forensics, since deepfakes are often used to launch presentation attacks against biometrics-based authentication systems and detection of deepfakes can be considered part of multimedia forensics. A more detailed discussion on different definitions of &#x0201C;deepfake&#x0201D; is given next.</p>
<sec>
<title>1.1 Definitions of the term deepfake</title>
<p>As its name implies, the term &#x0201C;deepfake&#x0201D; is derived from the combination of &#x0201C;deep&#x0201D; [referring to <italic>deep learning</italic> (DL)] and &#x0201C;fake.&#x0201D; It is normally used to refer to the manipulation of existing media (image, video, and/or audio) or the generation of new (synthetic) media using DL-based approaches. The most commonly discussed deepfake data are fake face images, fake speech forgeries, and fake videos that combine both fake images and fake speech forgeries. While having &#x0201C;fake&#x0201D; in the word indicates manipulated or synthesised media, there are plenty of benign applications of the deepfake technology, e.g., for entertainment and creative arts. With this respect, another term &#x0201C;deep synthesis&#x0201D; has been proposed as a more neutral-sounding alternative (Tencent, <xref ref-type="bibr" rid="B97">2020</xref>). This new term, however, has not been widely adopted.</p>
<p>In addition to the lack of a universal definition, as mentioned already, the boundary between deepfakes and non-deep fakes is actually not clear-cut. There are at least two important aspects we should consider, one on detection of and the other on creation of deepfakes.</p>
<p>First, the detection of deepfakes often follows very similar approaches to the detection of traditional fakes generated without using DL techniques. Advanced detection methods have also started leveraging DL to improve their performance, but they do not necessarily need to know how a target media is created (deep or not). To some extent, one could argue that detecting deepfakes does not involve developing deepfake-specific methods (even though some researchers choose to do so), but a more robust and universal detector that can handle any (deep or not) fake media. This can be seen in two closely related topics: biometrics and multimedia forensics. For biometrics, there is a trend of using deep learning techniques to generate fake biometric signals (e.g., face images and videos) for biometric spoofing or presentation attacks. For multimedia forensics, deepfake-based forgeries have become a new threat to the traditional problem of &#x0201C;forgery detection.&#x0201D; For both topics, the detection of biometric spoofing and multimedia forgeries has evolved to consider both deep and non-deep fakes.</p>
<p>Second, one may argue that the word &#x0201C;deep&#x0201D; in &#x0201C;deepfake&#x0201D; does not necessarily refer to the use of &#x0201C;deep learning&#x0201D;, but any &#x0201C;deep&#x0201D; (i.e., sophisticated) technology that creates a very believable fake media. For instance, Brady (<xref ref-type="bibr" rid="B9">2020</xref>) considered deepfake as audio-visual manipulation using &#x0201C;a spectrum of technical sophistication ... and techniques.&#x0201D; They also introduced two new terms, <italic>Shallowfake</italic> and <italic>Cheapfake</italic>, referring to &#x0201C;low-level manipulation of audio-visual media created with (easily) accessible software [or no software] to speed, slow, restage or re-contextualise content.&#x0201D; This broader understanding of &#x0201C;deepfake&#x0201D; has also been adopted by lawmakers for new legislation combating malicious deepfakes. For instance, the following two United States acts define &#x0201C;deepfakes&#x0201D; as follows:</p>
<list list-type="bullet">
<list-item><p>2018 Malicious Deep Fake Prohibition Act<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref>:</p>
<p>&#x000A7;1041.(b).(2): &#x0201C;<italic>the term &#x02018;deep fake&#x02019; means an audiovisual record created or altered in a manner that the record would falsely appear to a reasonable observer to be an authentic record of the actual speech or conduct of an individual</italic>.&#x0201D;</p>
</list-item>
<list-item><p>2019 DEEP FAKES Accountability Act<xref ref-type="fn" rid="fn0002"><sup>2</sup></xref>:</p>
<p>&#x000A7;1041.(n).(3): &#x0201C;<italic>The term &#x02018;deep fake&#x02019; means any video recording, motion-picture film, sound recording, electronic image, or photograph, or any technological representation of speech or conduct substantially derivative thereof</italic></p>
<p><italic>(A) which appears to authentically depict any speech or conduct of a person who did not in fact engage in such speech or conduct; and</italic></p>
<p><italic>(B) the production of which was substantially dependent upon technical means, rather than the ability of another person to physically or verbally impersonate such person</italic>.&#x0201D;</p>
</list-item>
</list>
<p>As we can see from the above legal definitions of &#x0201C;deepfake,&#x0201D; the use of DL as a technology is not mentioned at all. The focus here is on &#x0201C;authenticity&#x0201D;, &#x0201C;impersonation&#x0201D; and (any) &#x0201C;technical means.&#x0201D;</p>
</sec>
<sec>
<title>1.2 Scope and contribution</title>
<p>Based on the above discussion on definitions of deepfake, we can see it is not always straightforward or meaningful to differentiate deepfakes from non-deep fakes. In addition, for our focus on performance evaluation and comparison, the boundary between deepfakes and non-deep fakes is even more blurred. This is because DL is just a special (deeper) form of machine learning (ML), and as a result, DL and non-deep ML methods share many common concepts, metrics and procedures.</p>
<p>Despite the fact that deepfake may be understood in a much broader sense, in this work, we have a sufficiently narrow focus to avoid covering too many topics. We, therefore, decided to define the scope of this survey as follows:</p>
<list list-type="bullet">
<list-item><p>For metrics and standards, we chose to include all commonly used ones for evaluating general ML methods and those specifically defined for evaluating deepfake creation or detection methods.</p></list-item>
<list-item><p>For datasets, we considered those related to fake media covered in the deepfake-related survey papers and those with an explicit mention of the term &#x0201C;deepfake&#x0201D; or a comparable term.</p></list-item>
<list-item><p>For the meta-review, we considered only survey papers whose authors explicitly referred to the term &#x0201C;deepfakes&#x0201D; in the metadata (title, abstract, and keywords).</p></list-item>
</list>
<p>In this paper, we aim to make the following contributions:</p>
<list list-type="bullet">
<list-item><p>We discuss existing definitions of the term &#x0201C;deepfake&#x0201D; and propose a more inclusive definition.</p></list-item>
<list-item><p>We present an overview of the available deepfake-related standards and metrics for evaluating deepfake generation or detection, which have been generally overlooked by previous surveys. The covered metrics include general AI metrics as well as several deepfake-specific metrics for objective and subjective evaluation.</p></list-item>
<list-item><p>We comprehensively cover a wide range of deepfake datasets, considering different modalities&#x02014;image, video, audio, and text. We believe this paper offers the most comprehensive review of deepfake-related datasets so far.</p></list-item>
<list-item><p>We provide a meta-review of 15 deepfake survey papers to draw some high-level insights for monitoring the future development of deepfake-related technologies and their applications.</p></list-item>
</list>
</sec>
<sec>
<title>1.3 Paper organisation</title>
<p>The rest of the paper is organised as follows. In Section 2, we mention how we collected the survey papers covered in this paper. Then, Section 3 reviews existing deepfake-related performance metrics and standards, followed by Section 4 covering deepfake datasets. In Section 5, we provide a meta-review of the survey papers collected. Finally, the paper concludes with Section 6.</p></sec>
</sec>
<sec sec-type="methods" id="s2">
<title>2 Methodology</title>
<p>Research papers covered in this survey (i.e., the deepfake-related survey papers) were identified via systematic searches on the Scopus scientific database. The following search query was used to perform the searches on Scopus:</p>
<disp-quote><p>(deepfake* OR deep-fake* OR &#x0201C;deep fake*&#x0201D;) AND (review OR survey OR overview OR systemati* OR SoK)</p></disp-quote>
<p>The searches returned 117 survey papers in English, published between 2020 and 2024 (inclusive). Out of these papers, 15 papers were selected for consideration in the meta-review. During the selection process, all the papers were carefully reviewed, and only the ones having a substantial comparative angle, e.g., those with performance comparison and/or covering different datasets, tools, challenges, competitions, metrics, etc., were included. Furthermore, for the papers with similar coverage, those published in more reputable venues (e.g., higher-ranked journals or more well-known conferences) and/or more cited by other studies were preferred. Finally, we ensured that the final set of papers covers publications from each year between 2020 and 2024.</p>
<p>Deepfake-related datasets were compiled based on the selected survey papers and identified deepfake-related challenges, competitions, and benchmarks. Relevant standards were identified mainly via research papers covered in this survey, the co-authors&#x00027; personal knowledge, and Google Web searches. For performance metrics, we covered those commonly used based on relevant standards, the survey papers, and the identified challenges, competitions, and benchmarks.</p></sec>
<sec id="s3">
<title>3 Deepfake-related performance metrics and standards</title>
<p>In this survey, we focus on performance evaluation and comparison of deepfake generation and detection methods. The metrics used for such performance evaluations are at the core of our discussions. In this section, we review the performance metrics that are commonly used to evaluate deepfake generation and detection algorithms. Note that all metrics covered in this section are also commonly used for evaluating the performance of similar systems that are not for generating or detecting deepfakes. Therefore, this section can be seen as a very brief tutorial on general performance metrics.</p>
<p>In the last subsection, we also briefly discuss how the related performance metrics are covered in formal standards. By &#x0201C;formal standards,&#x0201D; we refer to standards defined following a formal procedure, often by one or more established standardisation bodies such as the International Organization for Standardization (ISO)<xref ref-type="fn" rid="fn0003"><sup>3</sup></xref> and the International Electrotechnical Commission (IEC).<xref ref-type="fn" rid="fn0004"><sup>4</sup></xref> Note that we consider a broad range of documents defined to be standards by standardisation bodies, e.g., International Telecommunication Union (ITU)<xref ref-type="fn" rid="fn0005"><sup>5</sup></xref> recommendations and ISO technical reports (TRs).</p>
<sec>
<title>3.1 The confusion matrix</title>
<p>Deepfake detection is primarily a binary classification problem. A binary classifier takes an input that is <italic>actually positive</italic> or <italic>actually negative</italic> and outputs a binary value denoting it to be <italic>predicted positive</italic> or <italic>predicted negative</italic>. For example, a deepfake detection system will take a suspected image as the input that may be <italic>actually fake</italic> or <italic>actually real</italic> and output <italic>predicted fake</italic> or <italic>predicted real</italic>.</p>
<p>A fundamental tool used in evaluating a binary classifier is the <bold>confusion matrix</bold> that summarises the success and failure of the classification model. On one axis are the two <italic>actual</italic> values and on the other axis are the two <italic>predicted</italic> values. The classification is <italic>successful/correct/true</italic> (true positive and true negative) when the actual and the predicted values match. It is <italic>failed/incorrect/false</italic> (false positive and false negative) when the actual and predicted values do not match. <xref ref-type="table" rid="T1">Table 1</xref> shows the confusion matrix for a binary deepfake classifier (detector). The two cells in green, TP (the number of <bold>true positives</bold>) and TN (the number of <bold>true negatives</bold>), indicate correct prediction results, and the two cells in red, FN (the number of <bold>false negatives</bold>), and FP (the number of <bold>false positives</bold>), indicate two different types of errors when making incorrect prediction results.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Confusion matrix for a binary classifier for detecting deepfake.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th/>
<th valign="top" align="center"><bold>Fake (predicted)</bold></th>
<th valign="top" align="center"><bold>Real (predicted)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Fake (actual)</td>
<td valign="top" align="center" style="background-color:#bfffbf">TP</td>
<td valign="top" align="center" style="background-color:#ffbfbf">FN</td>
</tr> <tr>
<td valign="top" align="left">Real (actual)</td>
<td valign="top" align="center" style="background-color:#ffbfbf">FP</td>
<td valign="top" align="center" style="background-color:#bfffbf">TN</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>3.2 Precision and recall</title>
<p>Based on the four fundamental values introduced in Section 3.1, i.e., TP, TN, FP, and FN, we define two important performance metrics for a binary classifier&#x02014;<bold>precision</bold> and <bold>recall</bold>.</p>
<p>Precision of a binary classifier is defined as the fraction of <italic>actually positive</italic> samples among all the <italic>predicted positives</italic>. In the confusion matrix, it is the fraction of true positives in the first column. It is formally defined in <xref ref-type="disp-formula" rid="E1">Equation 1</xref>.</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">precision</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>When the &#x0201C;natural&#x0201D; ratio between positive and negative samples is significantly different from that in the test set, it is often useful to adjust the weight of the false positives, which leads to the <bold>weighted precision</bold> (wP) defined in <xref ref-type="disp-formula" rid="E2">Equation 2</xref>, where &#x003B1;&#x0003E;0 is a weight determined by the ratio between the negative and positive samples.</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">wP</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Recall of a binary classifier is the fraction of <italic>predicted positive</italic> samples among the <italic>actually positive</italic> samples, as shown in <xref ref-type="disp-formula" rid="E3">Equation 3</xref>. In the confusion matrix, it is the fraction of true positives in the first row.</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">recall</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Let us consider an example binary classifier that predicts if an image from a database containing both deepfake and real (authentic) images is fake or not. Precision of the classifier is the fraction of correctly classified images among all images classified as deepfake. On the other hand, recall is the fraction of deepfake images identified by the classifier, among all deepfake images in the database.</p>
</sec>
<sec>
<title>3.3 True and false positive rates</title>
<p>Focusing on predicted positive samples, we can also define two metrics: <bold>true positive rate</bold> (TPR), also called <bold>correct detection rate</bold> (CDR), as the fraction of the predicted positive samples among the actually positive samples and <bold>false positive rate</bold> (FPR), also called <bold>false alarm rate</bold> (FAR), as the fraction of the predicted positive samples among the actually negative samples, as shown in <xref ref-type="disp-formula" rid="E4">Equations 4</xref>, <xref ref-type="disp-formula" rid="E5">5</xref>. In the confusion matrix, TPR is the fraction of predicted positive samples in the first row and FPR is the fraction of predicted positive samples in the second row. Note that TPR is basically a different name for <bold>recall</bold> (<xref ref-type="disp-formula" rid="E3">Equation 3</xref>).</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">TPR</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E5"><label>(5)</label><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">FPR</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">TN</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>3.4 True and false negative rates</title>
<p>Similar to true and false positive rates, we can define two other rates focusing on negative predicted results: <bold>true negative rate</bold> (TNR) indicating the fraction of the predicted negative samples among the actually negative samples, and <bold>false negative rate</bold> (FNR) indicating the fraction of the predicted negative samples among the actually positive samples, as shown in <xref ref-type="disp-formula" rid="E6">Equations 6</xref>, <xref ref-type="disp-formula" rid="E7">7</xref>.</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">TNR</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TN</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TN</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E7"><label>(7)</label><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">FNR</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>3.5 Sensitivity and specificity</title>
<p>In some applications of binary classifiers, especially in biology and medicine, the TPR and the TNR are more commonly used, and they are often called <bold>sensitivity</bold> (TPR) and <bold>specificity</bold> (TNR). The focus of these two terms is on the two types of correctness of the predicted results. These are less used in deepfake-related research; hence, we will not refer to them in the remainder of this paper.</p>
</sec>
<sec>
<title>3.6 Equal error rate</title>
<p>Focusing on error rates means that we need to consider the FPR and the FNR. These two rates normally conflict with each other so that reducing one rate normally leads to an increase in the other. Therefore, rather than trying to reduce both error rates at the same time, which is normally impossible, the more realistic task in practical applications is to find the right balance so that they are both below an acceptable threshold.</p>
<p>In some applications, such as biometrics, people are particularly interested in establishing the so-called <bold>equal error rate</bold> (EER) or <bold>crossover error rate</bold> (CER), the point where the FPR and the FNR are equal. The EER/CER is not necessarily a good metric for some applications, especially when the two types of errors are of different levels of importance, e.g., for detecting critical deepfakes (e.g., fake news that can influence how people cast their votes) we can often tolerate more false positives (false alarms) than false negatives (missed alarms).</p>
</sec>
<sec>
<title>3.7 Accuracy and F-score</title>
<p>In addition to the EER/CER, there are also other metrics that try to reflect both types of errors, in order to give a more balanced indication of the overall performance of a binary classifier. The two most commonly used are <bold>accuracy</bold> and <bold>F-score</bold> (also called <bold>F-measure</bold>). Both metrics can be defined based on the four fundamental values (TP, TN, FP, and FN).</p>
<p>Accuracy of a binary classifier is defined as the fraction of <italic>correctly predicted</italic> samples (true positives and true negatives) among the total number of samples that have been classified, as shown in <xref ref-type="disp-formula" rid="E8">Equation 8</xref>.</p>
<disp-formula id="E8"><label>(8)</label><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">accuracy</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">TN</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">TN</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The F-score of a binary classifier is actually a family of metrics. Its general form can be described based on a parameter &#x003B2; as defined in <xref ref-type="disp-formula" rid="E9">Equation 9</xref>.</p>
<disp-formula id="E9"><label>(9)</label><mml:math id="M9"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>&#x003B2;</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x000B7;</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">precision</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mtext class="textrm" mathvariant="normal">recall</mml:mtext></mml:mrow><mml:mrow><mml:msup><mml:mrow><mml:mi>&#x003B2;</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup><mml:mo>&#x000B7;</mml:mo><mml:mtext class="textrm" mathvariant="normal">precision</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">recall</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The most widely used variant of all F-scores is the so-called <bold>F1-score</bold>, which is effectively the F-score with &#x003B2; &#x0003D; 1. More precisely, it is defined as shown in <xref ref-type="disp-formula" rid="E10">Equation 10</xref>.</p>
<disp-formula id="E10"><label>(10)</label><mml:math id="M10"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x000B7;</mml:mo><mml:mfrac><mml:mrow><mml:mtext class="textrm" mathvariant="normal">precision</mml:mtext><mml:mo>&#x000B7;</mml:mo><mml:mtext class="textrm" mathvariant="normal">recall</mml:mtext></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">precision</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">recall</mml:mtext></mml:mrow></mml:mfrac><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mtext class="textrm" mathvariant="normal">TP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FP</mml:mtext><mml:mo>&#x0002B;</mml:mo><mml:mtext class="textrm" mathvariant="normal">FN</mml:mtext></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>3.8 Receiver operating characteristic curve and area under curve</title>
<p><bold>Receiver operating characteristic</bold> (ROC) curves are commonly used to measure the performance of binary classifiers that output a score (or probability) of prediction.</p>
<p>Consider the following. Let <italic>S</italic> be the set of all test samples and let the output scores <italic>f</italic>(<italic>s</italic>) (for all <italic>s</italic> &#x02208; <italic>S</italic>) lie in the interval [<italic>a, b</italic>] on the real line. Let <italic>t</italic> &#x02208; [<italic>a, b</italic>] be a prediction threshold for the model, and assume that the classifier works as follows for all <italic>s</italic> &#x02208; <italic>S</italic>:</p>
<disp-formula id="E11"><label>(11)</label><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">class</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">positive</mml:mtext><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mtext class="textrm" mathvariant="normal">if</mml:mtext><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x02265;</mml:mo><mml:mi>t</mml:mi><mml:mo>,</mml:mo><mml:mtext class="textrm" mathvariant="normal">and</mml:mtext></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">negative</mml:mtext><mml:mo>,</mml:mo></mml:mtd><mml:mtd><mml:mtext class="textrm" mathvariant="normal">otherwise</mml:mtext><mml:mo>.</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Considering <xref ref-type="disp-formula" rid="E11">Equation 11</xref>, it is easy to see that, for <italic>t</italic> &#x0003D; <italic>a</italic>, all the samples will be classified as positive, leading to FN &#x0003D; TN &#x0003D; 0 so TPR &#x0003D; FPR &#x0003D; 1; while for <italic>t</italic> &#x0003D; <italic>b</italic>, all the samples will be classified as negative, leading to FP &#x0003D; TP &#x0003D; 0 so TPR &#x0003D; FPR &#x0003D; 0. For other threshold values between <italic>a</italic> and <italic>b</italic>, the values of TPR and FPR will normally be between 0 and 1. By changing <italic>t</italic> from <italic>a</italic> to <italic>b</italic> continuously, we can normally get a continuous curve that describes how the TPR and FPR values change from (1,1) to (0,0) on the 2D plane. This curve is the ROC curve of the binary classifier.</p>
<p>For a random classifier, assuming that <italic>f</italic>(<italic>s</italic>) distributes uniformly on [<italic>a, b</italic>] for the test set, we can mathematically derive its ROC curve being the TPR &#x0003D; FPR line, whose area under the ROC curve (AUC) is 0.5. For a binary classifier that performs better than a random predictor, we can also mathematically prove that its AUC is always higher than 0.5, with 1 being the best possible value. Note that no binary classifier can have an AUC below 0.5, since one can simply flip the prediction result to get a better predictor with an AUC of 1 &#x02212; AUC. The relationship between the ROC and the AUC is graphically illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>A representative ROC curve showing how TPR and FPR change w.r.t. the (hidden) threshold <italic>t</italic>. The area under the (ROC) curve (AUC) is shown in grey.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fdata-07-1400024-g0001.tif"/>
</fig>
</sec>
<sec>
<title>3.9 Log loss</title>
<p>Another widely used performance metric for binary classifiers that can return a probability score for the predicted label is <bold>log loss</bold>. For a binary classification with a true label <italic>y</italic> &#x02208; {0, 1} and an estimated probability <italic>p</italic> &#x0003D; Pr(<italic>y</italic> &#x0003D; 1), the log loss per sample is the negative log-likelihood of the classifier given the true label, defined as shown in <xref ref-type="disp-formula" rid="E12">Equation 12</xref>.</p>
<disp-formula id="E12"><label>(12)</label><mml:math id="M12"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mo class="qopname">log</mml:mo></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>y</mml:mi><mml:mo>,</mml:mo><mml:mi>p</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>y</mml:mi><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>p</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Given a testing set with <italic>n</italic> samples, the log loss score of a binary classifier can be calculated using <xref ref-type="disp-formula" rid="E13">Equation 13</xref>, where <italic>y</italic><sub><italic>i</italic></sub> is 1 if the <italic>i</italic>-th sample is true and 0 if false, and &#x00177;<sub><italic>i</italic></sub> is the predicted probability of <italic>y</italic><sub><italic>i</italic></sub> &#x0003D; 1.</p>
<disp-formula id="E13"><label>(13)</label><mml:math id="M13"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">LL</mml:mtext><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo class="qopname">log</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>3.10 Extension to multi-class classifiers</title>
<p>All metrics that are defined based on the four basic values TP, TN, FP, and FN can be easily extended to <bold>multi-class classification</bold> by considering the prediction to be true or false individually with respect to each class. For example, if the system is classifying animals (cats, dogs, horses, lions, tigers, etc.), then a true positive prediction of an image to be of a cat, would simultaneously be true negative predictions for the remaining classes (dogs, horses, lions, tigers, etc.). If an image of a cat is incorrectly predicted to be that of a dog, it would be a false negative with respect to a cat, a false positive with respect to a dog, and a true negative with respect to all other classes.</p>
</sec>
<sec>
<title>3.11 Deception success rate</title>
<p><bold>Deception success rate</bold> (DSR) aims to measure to what extent a deepfake detection model can be fooled by the generated deepfakes. It was used for the evaluation of the Audio Deep Synthesis Detection (ADD) 2022 Challenge (Yi et al., <xref ref-type="bibr" rid="B115">2022a</xref>) and defined as follows:</p>
<disp-formula id="E14"><label>(14)</label><mml:math id="M14"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">DSR</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>A</mml:mi><mml:mo>*</mml:mo><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>In <xref ref-type="disp-formula" rid="E14">Equation 14</xref>, W refers to the total number of incorrect detection samples by all the detection models under the condition of achieving each individual EER performance, A is the count of all the evaluation samples, and N is the number of detection models.</p>
</sec>
<sec>
<title>3.12 Perceptual quality assessment metrics</title>
<p>By definition, the main goal of deepfakes is to make it hard or impossible for human consumers (listeners or viewers) to distinguish fake media from real media. Therefore, when evaluating the quality of deepfake media, the quality perceived by human consumers of the media is key. This calls for a subjective assessment of the perceptual quality of the deepfake media as the &#x0201C;gold standard.&#x0201D; The most widely used subjective perceptual quality assessment (PQA) metric for audio-visual signals is <bold>mean opinion score</bold> (MOS), which has been widely used by the signal processing and multimedia communication communities, including digital TV and other multimedia-related consumer applications. As its name implies, MOS is calculated by averaging the subjective scores given by a number of human judges, normally following a numerical scale between 1 and 5 or between 0 and 100. MOS has been used in some deepfake-related challenges and also for evaluating and comparing the quality (realness/naturalness) of deepfake datasets (see Section 4.7).</p>
<p>As a general subjective PQA metric, MOS has been standardised by the ITU.<xref ref-type="fn" rid="fn0006"><sup>6</sup></xref> There are also ITU standards defining more specific subjective Video Quality Assessment (VQA) metrics and the standard procedures one should follow to conduct VQA user studies, e.g., ITU-T Recommendation P.910 &#x0201C;Subjective video quality assessment methods for multimedia applications.&#x0201D;<xref ref-type="fn" rid="fn0007"><sup>7</sup></xref> Note that the ITU standards focus more on traditional perceptual quality, i.e., how good a signal looks or sounds, even if it looks or sounds not real (e.g., too smooth). On the other hand, for deepfakes, the focus is rather different because what matters is the realness and naturalness of the created media, i.e., how real and natural it looks or sounds, even if it is of low quality. To some extent, we can also consider realness and naturalness as a special aspect of perceptual quality.</p>
<p>One major problem of subjective PQA metrics like MOS is the need to recruit human judges and to have a well-controlled physical testing environment and protocol, which are not easy for many applications. To help reduce the efforts and costs of conducting PQA-related user studies, various objective PQA metrics have been proposed, where the term &#x0201C;objective&#x0201D; refers to the fact that such metrics are human-free, i.e., automatically calculated following a computational algorithm or process. Depending on whether a reference exists, such objective PQA metrics can be largely split into three categories: full-reference (FR) metrics (when the original &#x0201C;perfect-quality&#x0201D; signal is available as the reference), reduced-reference (RR) metrics (when some features of the original &#x0201C;perfect-quality&#x0201D; signal are available as the reference), and no-reference (NR) metrics (when the original signal is unavailable or such an original signal does not exist). For deepfakes, normally NR or RR metrics are more meaningful because the &#x0201C;fake&#x0201D; part of the word means that part of the whole data does not exist in the real world, hence a full reference cannot be obtained. RR metrics are still relevant because deepfakes are often produced for a target&#x00027;s specific attributes (e.g., face and voice), where the reduced reference will be such attributes. NR metrics will be useful to estimate the realness and naturalness of a deepfake, simulating how a human judge would rate it in a controlled subjective PQA user study.</p>
<p>PQA is a very active research area and many PQA metrics have been proposed, some of which have been widely used in real-world products and services, e.g., <bold>mean squared error</bold> (MSE), <bold>peak signal-to-noise ratio</bold> (PSNR), and <bold>structural similarity index measure</bold> (SSIM) for FR PQA of digital images and videos defined as in <xref ref-type="disp-formula" rid="E15">Equations 15</xref>&#x02013;<xref ref-type="disp-formula" rid="E17">17</xref>, respectively, where <inline-formula><mml:math id="M15"><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the reference (the original signal), <inline-formula><mml:math id="M16"><mml:mi>Y</mml:mi><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the signal whose visual quality is assessed, <italic>n</italic> is the number of pixels in <italic>X</italic> and <italic>Y</italic>, <italic>L</italic> is the maximum possible pixel value of <italic>X</italic> and <italic>Y</italic> (e.g., 255 for 8-bit grey-scale images), <inline-formula><mml:math id="M17"><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mi>L</mml:mi></mml:mrow><mml:mo 
stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> and <inline-formula><mml:math id="M18"><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mi>L</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> are two stabilising parameters (<italic>k</italic><sub>1</sub> &#x0003D; 0.01 and <italic>k</italic><sub>2</sub> &#x0003D; 0.03 by default). For more about PQA metrics for different types of multimedia signals, we refer readers to some relevant surveys (Akhtar and Falk, <xref ref-type="bibr" rid="B3">2017</xref>; Pal and Triyason, <xref ref-type="bibr" rid="B78">2018</xref>; Zhai and Min, <xref ref-type="bibr" rid="B123">2020</xref>).</p>
<disp-formula id="E15"><label>(15)</label><mml:math id="M19"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">MSE</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>Y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>-</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E16"><label>(16)</label><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">PSNR</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>Y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mn>10</mml:mn><mml:msub><mml:mrow><mml:mo class="qopname">log</mml:mo></mml:mrow><mml:mrow><mml:mn>10</mml:mn></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msup><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:mtext class="textrm" mathvariant="normal">MSE</mml:mtext></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E17"><label>(17)</label><mml:math id="M21"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mtext class="textrm" mathvariant="normal">SSIM</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:mi>Y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:msub><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003BC;</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo 
stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msubsup><mml:mrow><mml:mi>&#x003C3;</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msubsup><mml:mo>&#x0002B;</mml:mo><mml:msub><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>3.13 Multimodal alignment assessment metrics</title>
<p>With the increasing amount of research on leveraging multimodalities in deepfake generation (e.g., text-to-image, text-to-video, and audio-to-video), new evaluation metrics became needed to assess to what extent the covered modalities are aligned. The development of such metrics is challenging as they require a comprehensive and fine-grained cross-modal understanding (Huang et al., <xref ref-type="bibr" rid="B37">2023</xref>).</p>
<p>While multimodal alignment assessment (MAA) metrics have been used to assess different tasks (e.g., image-caption alignment), they can also be considered an alternative to PQA metrics for deepfake evaluation. For instance, with the introduction of state-of-the-art text-to-image models, such as DALL-E,<xref ref-type="fn" rid="fn0008"><sup>8</sup></xref> which enables easier generation of deepfake images from natural language descriptions, text-to-image alignment metrics can be utilised to assess whether the generated image aligns with the provided description. In this respect, several metrics based on vision-language models, which can simultaneously learn from images/videos and texts [e.g., BLIP (Li J. et al., <xref ref-type="bibr" rid="B52">2022</xref>) and CLIP (Radford et al., <xref ref-type="bibr" rid="B82">2021</xref>)], have been proposed. Some examples of such metrics include <bold>ClipScore</bold> (Hessel et al., <xref ref-type="bibr" rid="B36">2021</xref>), <bold>disentangled BLIP-VQA</bold> (Huang et al., <xref ref-type="bibr" rid="B37">2023</xref>), and <bold>BLIP-CLIP</bold> (Chefer et al., <xref ref-type="bibr" rid="B12">2023</xref>) for text-to-image generation, as well as <bold>X-CLIP</bold> (Ni et al., <xref ref-type="bibr" rid="B75">2022</xref>) for text-to-video generation.</p>
<p>In addition to vision-language models, MAA metrics have also made use of multimodal large language models by generating a <italic>chain-of-thought</italic> (Wei et al., <xref ref-type="bibr" rid="B107">2022</xref>) through prompts. For example, <bold>X-IQE</bold> (Chen et al., <xref ref-type="bibr" rid="B13">2023</xref>) leverages MiniGPT-4 (Zhu et al., <xref ref-type="bibr" rid="B134">2024</xref>) with prompts prepared with the help of art professionals to evaluate the fidelity, alignment, and aesthetics of the generated images. Another example is <bold>T2VScore-A</bold> (Wu et al., <xref ref-type="bibr" rid="B108">2024</xref>), which uses GPT-3.5 to identify questions, choices, and answers to evaluate alignment between a generated video and a given text prompt. It is then measured based on the accuracy of visual question answering.</p>
<p>While existing MAA metrics mostly focus on the semantic alignment of modalities, temporal alignment is also worth considering for the MAA task. In this respect, <bold>AV-Align</bold> (Yariv et al., <xref ref-type="bibr" rid="B113">2024</xref>) is a metric for assessing temporal alignment between audio-video pairs. It is based on energy peaks of both modalities, i.e., the highest mean magnitude of optical flow for video frames and the onset of the audio waveform.</p>
</sec>
<sec>
<title>3.14 More about standards</title>
<p>Many of the basic performance metrics described in this section have been widely used by deepfake researchers as de facto standards, e.g., EER, log loss and MOS have been widely used in deepfake-related challenges. Also, the combination of precision, recall and F1-score has been widely used to assess the performance of binary classifiers. While there have been a number of ITU standards on PQA to date, there do not seem to be many standardisation efforts on the performance metrics for the evaluation of binary classifiers. This was the case until at least 2017 when ISO and IEC jointly set up the ISO/IEC JTC 1/SC 42,<xref ref-type="fn" rid="fn0009"><sup>9</sup></xref> a standardisation subcommittee (SC) focusing on AI under ISO/IEC JTC 1,<xref ref-type="fn" rid="fn0010"><sup>10</sup></xref> the joint technical committee for standardising &#x0201C;information technology.&#x0201D;</p>
<p>One recent effort that ISO/IEC JTC 1/SC 42 made is to produce the ISO/IEC TR 24029-1:2021 &#x0201C;Artificial Intelligence (AI)&#x02014;Assessment of the robustness of neural networks&#x02014;Part 1: Overview,&#x0201D;<xref ref-type="fn" rid="fn0011"><sup>11</sup></xref> a technical report (TR) that systematically covers many commonly used performance assessment concepts, methods, and metrics. Although the technical report has &#x0201C;neural networks&#x0201D; in its title, most performance assessment concepts, methods, and metrics included are common ones for all supervised machine learning models.</p>
<p>In terms of performance metrics, two other ongoing work items of the ISO/IEC JTC 1/SC 42 that deserve attention are as follows:</p>
<list list-type="bullet">
<list-item><p>ISO/IEC DTS (Draft Technical Specification) 4213 &#x0201C;Information technology&#x02014;Artificial Intelligence&#x02014;Assessment of machine learning classification performance&#x0201D;<xref ref-type="fn" rid="fn0012"><sup>12</sup></xref></p></list-item>
<list-item><p>ISO/IEC AWI (Approved Work Item) TS (Technical Specification) 5471 &#x0201C;Artificial intelligence&#x02014;Quality evaluation guidelines for AI systems&#x0201D;<xref ref-type="fn" rid="fn0013"><sup>13</sup></xref></p></list-item>
</list>
<p>While the ISO/IEC JTC 1/SC 42 was created very recently, another standardisation subcommittee under ISO/IEC JTC 1 has a much longer history of nearly 20 years: the ISO/IEC JTC 1/SC 37<xref ref-type="fn" rid="fn0014"><sup>14</sup></xref> that focuses on biometrics-related technology. This standardisation subcommittee is highly relevant for deepfake since deepfake faces can be used to spoof biometrics-based user authentication systems. In this context, the following three standards are of particular relevance:</p>
<p><bold>ISO/IEC 19795-1:2021 &#x0201C;Information technology&#x02014;Biometric performance testing and reporting&#x02014;Part 1: Principles and framework:&#x0201D;</bold><xref ref-type="fn" rid="fn0015"><sup>15</sup></xref> This standard covers general metrics about evaluating biometric systems. Two major metrics in this context are <bold>false accept rate</bold> (FAR) and <bold>false reject rate</bold> (FRR), which refer to the standard FPR and FNR, respectively. This standard also deprecates the use of single-number metrics including the EER and AUC (which were widely used in biometrics-related research in the past).</p>
<p><bold>ISO/IEC 30107-1:2016 &#x0201C;Information technology&#x02014;Biometric presentation attack detection&#x02014;Part 1: Framework:&#x0201D;</bold><xref ref-type="fn" rid="fn0016"><sup>16</sup></xref> This standard defines a general framework about <bold>presentation attack detection</bold> (PAD) mechanisms, where the term &#x0201C;<bold>presentation attack</bold>&#x0201D; refers to the &#x0201C;<italic>presentation of an artefact or of human characteristics to a biometric capture subsystem in a fashion intended to interfere with system policy</italic>.&#x0201D; It focuses on biometric recognition systems, where a PAD mechanism is a binary classifier trying to predict presentation attacks (also called attack presentations, e.g., fake faces) as positive and bona fide (real) presentations as negative.</p>
<p><bold>ISO/IEC 30107-3:2017 &#x0201C;Information technology&#x02014;Biometric presentation attack detection&#x02014;Part 3: Testing and reporting:&#x0201D;</bold><xref ref-type="fn" rid="fn0017"><sup>17</sup></xref> This standard defines a number of special performance metrics for evaluating PAD mechanisms standardised in the ISO/IEC 30107-1:2016. Three such metrics look at error rates: <bold>attack presentation classification error rate</bold> (APCER) referring to the standard FPR, <bold>normal/bona fide presentation classification error rate</bold> (NPCER/BPCER) referring to the standard FNR, and <bold>average classification error rate</bold> (ACER) that is defined as the average of the APCER and the NPCER/BPCER. Such metrics have been used in biometrics-related challenges such as Face Anti-spoofing (Presentation Attack Detection) Challenges.<xref ref-type="fn" rid="fn0018"><sup>18</sup></xref> When deepfake images or videos are used to spoof a biometric system, such standardised metrics will become relevant.</p>
</sec>
<sec>
<title>3.15 Discussion: performance metrics and standards</title>
<p>This section provides a comprehensive summary of performance metrics used for evaluating and benchmarking deepfake generators and detectors. It is rare that all such metrics are used for a specific application. Instead, one or several are chosen based on specific needs. For a deepfake detection system as a binary classifier, many researchers have chosen to use overall metrics such as accuracy, AUC, EER and log loss, but the combination of precision, recall and F1-score is also common. However, there is a growing interest in using more deepfake-focused metrics, including PQA and MAA metrics, especially for evaluating deepfake generation. Other than general evaluation metrics, some deepfake-related challenges and competitions have introduced their own specific metrics, such as DSR. Furthermore, there exist several metrics specific to certain deepfake-related tasks, including code generation, animation generation, and synthetic data generation (Bandi et al., <xref ref-type="bibr" rid="B8">2023</xref>). The use of different performance metrics can make the comparison of different reported results more difficult, so we hope the expected new ISO/IEC standards, particularly ISO/IEC 4213, will help.</p>
<p>It is worth mentioning that, in addition to evaluating the performance of deepfake detectors, the introduced performance metrics for evaluating binary classifiers can also be used to evaluate the performance of deepfake generation methods by considering how deepfake detectors fail. For instance, organisers of the Voice Conversion Challenge 2018 and 2020 used this approach to benchmark how well voice conversion (VC) systems can generate high-quality fake speech samples.</p>
<p>Another point we would like to mention is that for deepfake videos there are two levels of performance metrics: those at the frame level (metrics of each frame), and those at the video level (metrics for the whole video). Generally speaking, the latter can be obtained by averaging the former for all frames, potentially following an adaptive weighting scheme, so that more important (key) frames will be counted more.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Deepfake-related datasets</title>
<p>In this section, we cover all deepfake-related datasets we identified from the meta-review of deepfake-related survey papers, deepfake-related challenges, three online collections of deepfake-related datasets on GitHub,<xref ref-type="fn" rid="fn0019"><sup>19</sup></xref><sup>&#x02013;</sup><xref ref-type="fn" rid="fn0021"><sup>21</sup></xref> and the co-authors&#x00027; personal collections. We explain the datasets covered in five categories: deepfake image datasets, deepfake video datasets, deepfake audio/speech datasets, deepfake text datasets, and hybrid deepfake datasets (mainly mixed image and video datasets).</p>
<p>Note that many datasets of real (authentic) media were also used by deepfake researchers for two purposes. First, any detectors would need both fake and real media to demonstrate their performance. Second, real media have also been used to train deepfake generators as the training set. In this section, we include only datasets containing deepfake media, some of which contain both deepfake and real media.</p>
<p>Some datasets, especially those created for deepfake-related challenges and competitions, have separate subsets for training and evaluation (testing) purposes. The split is necessary for such challenges and competitions, but not very useful for people who just want to use such datasets. Therefore, in this section when introducing such datasets we will ignore that level of detail and focus on the total number of samples, including the numbers of real and fake samples.</p>
<sec>
<title>4.1 Deepfake image datasets</title>
<p><xref ref-type="table" rid="T2">Table 2</xref> shows basic information about the image datasets covered.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Deepfake-related image datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Size</bold></th>
<th valign="top" align="left"><bold>Year</bold></th>
<th valign="top" align="left"><bold>Generation method</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">SwapMe &#x00026; FaceSwap</td>
<td valign="top" align="left">4,310 images</td>
<td valign="top" align="left">2017</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">Fake Faces in the Wild (FFW)</td>
<td valign="top" align="left">53,000 images</td>
<td valign="top" align="left">2018</td>
<td valign="top" align="left">Fake YouTube videos</td>
</tr> <tr>
<td valign="top" align="left">generated.photos datasets</td>
<td valign="top" align="left">2.7 million images</td>
<td valign="top" align="left">Since 2018</td>
<td valign="top" align="left">StyleGAN</td>
</tr> <tr>
<td valign="top" align="left">MesoNet Deepfake Dataset</td>
<td valign="top" align="left">19,509 images</td>
<td valign="top" align="left">2018</td>
<td valign="top" align="left">Face extraction from forged videos</td>
</tr> <tr>
<td valign="top" align="left">100K-Generated-Images</td>
<td valign="top" align="left">100,000 images</td>
<td valign="top" align="left">2019</td>
<td valign="top" align="left">A GAN generator</td>
</tr> <tr>
<td valign="top" align="left">Ding et al.&#x00027;s swapped face dataset</td>
<td valign="top" align="left">420,053 images</td>
<td valign="top" align="left">2019</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">iFakeFaceDB</td>
<td valign="top" align="left">87,000 images</td>
<td valign="top" align="left">2019</td>
<td valign="top" align="left">StyleGAN &#x0002B; GANprintR</td>
</tr> <tr>
<td valign="top" align="left">Faces-HQ</td>
<td valign="top" align="left">40,000 images</td>
<td valign="top" align="left">2019-20</td>
<td valign="top" align="left">Collection from other datasets</td>
</tr> <tr>
<td valign="top" align="left">CelebA-Spoof</td>
<td valign="top" align="left">625,537 images</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">Face spoofing</td>
</tr> <tr>
<td valign="top" align="left">Diverse Fake Face Dataset (DFFD)</td>
<td valign="top" align="left">299,039 images</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">Multiple facial manipulation methods</td>
</tr> <tr>
<td valign="top" align="left">DiffusionForensics</td>
<td valign="top" align="left">615,200 images</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Pretrained diffusion models</td>
</tr> <tr>
<td valign="top" align="left">DeepFakeFace (DFF)</td>
<td valign="top" align="left">120,000 images</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Diffusion models &#x0002B; Face manipulation methods</td>
</tr></tbody>
</table>
</table-wrap>
<p><bold>SwapMe and FaceSwap dataset</bold> (Zhou et al., <xref ref-type="bibr" rid="B132">2017</xref>): This dataset contains 4,310 images, including 2,300 real images and 2,010 fake images created using FaceSwap<xref ref-type="fn" rid="fn0022"><sup>22</sup></xref> and the SwapMe iOS app (now discontinued). The fake images were generated by tampering with one face in each authentic image with face swapping. The selected images cover diverse events, genders, ages, and races.</p>
<p><bold>Fake Faces in the Wild (FFW) dataset</bold> (Khodabakhsh et al., <xref ref-type="bibr" rid="B44">2018</xref>): This dataset contains 131,500 face images, including 78,500 bona fide images extracted from 150 videos in the FaceForensics dataset and 53,000 fake images extracted from 150 fake videos collected from YouTube. The fake images involve both tampered images and those generated using CGI.</p>
<p><bold>generated.photos datasets</bold><xref ref-type="fn" rid="fn0023"><sup>23</sup></xref>: These are a number of commercial datasets provided by Generated Media, Inc., with up to nearly 2.7 million synthetic face images generated by StyleGAN. A free edition with 10,000 128 &#x000D7; 128 synthetic images is made available for academic research. The website also provides an interactive face generator<xref ref-type="fn" rid="fn0024"><sup>24</sup></xref> and an API.<xref ref-type="fn" rid="fn0025"><sup>25</sup></xref> The generated.photos datasets have a good diversity: five age groups (infants, children, youth, adults, middle-aged), two genders (male and female), four ethnicities (white, black, Latino, Asian), four eye colours (brown, grey, blue, green), four hair colours (brown, black, blond, grey), three hair lengths (short, medium, long), facial expressions, three head poses (front facing, left facing, right facing), two emotions (joy and neutral), and two face styles (natural, beautified).<xref ref-type="fn" rid="fn0026"><sup>26</sup></xref></p>
<p><bold>MesoNet Deepfake dataset</bold> (Afchar et al., <xref ref-type="bibr" rid="B1">2018</xref>): This dataset includes 19,457 face images,<xref ref-type="fn" rid="fn0027"><sup>27</sup></xref> including 7,948 deepfake images generated from 175 forged videos collected online and 11,509 real face images collected from various online sources. The face images were extracted from the collected videos by utilising a visual object detection method, and around 50 faces per scene were obtained.</p>
<p><bold>100K-Generated-Images</bold> (Karras et al., <xref ref-type="bibr" rid="B42">2019</xref>): This dataset includes 100,000 synthesised face, bedroom, car and cat images by a GAN generator trained based on real images in the FFHQ<xref ref-type="fn" rid="fn0028"><sup>28</sup></xref> and LSUN<xref ref-type="fn" rid="fn0029"><sup>29</sup></xref> datasets (three object types&#x02014;bedrooms, cars and cats&#x02014;for the latter). Note that the name &#x0201C;100K-Generated-Images&#x0201D; was not a proper one as the authors (Karras et al., <xref ref-type="bibr" rid="B42">2019</xref>) just used this to name a sub-folder of their Google Drive shared space, but it was used in one of the survey papers (Tong et al., <xref ref-type="bibr" rid="B100">2020</xref>).</p>
<p><bold>Ding et al. (</bold><xref ref-type="bibr" rid="B20"><bold>2020</bold></xref><bold>)&#x00027;s swapped face dataset</bold>: This dataset contains 420,053 images of celebrities, including 156,930 real ones downloaded using Google Image API and 263,123 fake face-swapped ones created using two different methods (Nirkin&#x00027;s method and Auto-Encoder-GAN). While the former method consisted of multiple techniques pipelined, the latter was fully automated based on a CNN architecture.</p>
<p><bold>iFakeFaceDB</bold> (Neves et al., <xref ref-type="bibr" rid="B73">2020</xref>): This dataset includes 87,000 224 &#x000D7; 224 face images, generated by processing some StyleGAN-generated synthetic images using the GAN-fingerprint Removal approach (GANprintR) proposed by Neves et al. (<xref ref-type="bibr" rid="B73">2020</xref>). It replaces the <bold>FSRemovalDB</bold> dataset, which contains 150,000 face images generated using an earlier version of GANprintR.</p>
<p><bold>Faces-HQ</bold> (Durall et al., <xref ref-type="bibr" rid="B23">2019</xref>): This dataset includes 40,000 high-resolution images, half real and half deepfake. The images were collected from four sources: the CelebA-HQ dataset,<xref ref-type="fn" rid="fn0030"><sup>30</sup></xref> the Flickr-Faces-HQ dataset (see text footnote <xref ref-type="fn" rid="fn0028"><sup>28</sup></xref>), the 100K-Faces dataset<xref ref-type="fn" rid="fn0031"><sup>31</sup></xref> (not available any longer, see the description of generated.photos datasets), and <ext-link ext-link-type="uri" xlink:href="https://thispersondoesnotexist.com/">thispersondoesnotexist.com</ext-link>.</p>
<p><bold>CelebA-Spoof</bold> (Zhang Y. et al., <xref ref-type="bibr" rid="B128">2020</xref>): This dataset includes 625,537 synthesised face images of 10,177 celebrities, with 43 rich attributes on face, illumination, environment and spoof types. The real images were selected from the CelebA dataset.<xref ref-type="fn" rid="fn0032"><sup>32</sup></xref> The 43 attributes include 40 for real images, covering all facial components and accessories (e.g., skin, nose, eyes, eyebrows, lip, hair, hat, and eyeglass), and 3 for fake images, covering spoof types, environments and illumination conditions.</p>
<p><bold>Diverse fake face dataset</bold> (Dang et al., <xref ref-type="bibr" rid="B15">2020</xref>): This dataset contains 299,039 images, including 58,703 real images sampled from three datasets [FFHQ (see text footnote <xref ref-type="fn" rid="fn0028"><sup>28</sup></xref>), CelebA (see text footnote <xref ref-type="fn" rid="fn0032"><sup>32</sup></xref>), and FaceForensics&#x0002B;&#x0002B;<xref ref-type="fn" rid="fn0033"><sup>33</sup></xref>] and 240,336 fake ones in four main facial manipulation types (identity swap, expression swap, attribute manipulation, and entire synthesis). The images cover two genders (male and female), a wide age group (the majority between 21 and 50 years old), and both low- and high-quality levels.</p>
<p><bold>DiffusionForensics</bold> (Wang et al., <xref ref-type="bibr" rid="B105">2023a</xref>,<xref ref-type="bibr" rid="B106">b</xref>): This dataset contains 615,200 diffusion-generated images, sourced from LSUN-Bedroom (see text footnote <xref ref-type="fn" rid="fn0029"><sup>29</sup></xref>), ImageNet (Deng et al., <xref ref-type="bibr" rid="B18">2009</xref>), and CelebA-HQ (see text footnote <xref ref-type="fn" rid="fn0030"><sup>30</sup></xref>) datasets. The dataset covers images that belong to one of the following image generation methods&#x02014;unconditional, conditional, and text-to-image generation, and contains three fields&#x02014;source image, reconstructed image, and the image referring to the error between these images. Moreover, 11 different generators were leveraged for the generation of the fake images.</p>
<p><bold>DeepFakeFace (DFF)</bold> (Song et al., <xref ref-type="bibr" rid="B90">2023a</xref>,<xref ref-type="bibr" rid="B91">b</xref>): This dataset contains 30,000 real and 90,000 fake celebrity images. The real images were retrieved from the IMDB-Wiki (Rothe et al., <xref ref-type="bibr" rid="B86">2015</xref>) dataset, and the fake ones were generated by utilising three different methods&#x02014;Stable Diffusion v1.5 and Stable Diffusion Inpainting diffusion models, as well as the InsightFace toolbox containing several face manipulation algorithms.</p>
</sec>
<sec>
<title>4.2 Deepfake video datasets</title>
<p><xref ref-type="table" rid="T3">Table 3</xref> shows basic information about the video datasets covered.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Deepfake-related video datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Size</bold></th>
<th valign="top" align="left"><bold>Year</bold></th>
<th valign="top" align="left"><bold>Language/ethnicity</bold></th>
<th valign="top" align="left"><bold>Generation method</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">DeepfakeTIMIT</td>
<td valign="top" align="left">620 videos</td>
<td valign="top" align="left">2018</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">GAN-based face swapping</td>
</tr> <tr>
<td valign="top" align="left">FaceForensics</td>
<td valign="top" align="left">1,004 videos</td>
<td valign="top" align="left">2018</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Face manipulation</td>
</tr> <tr>
<td valign="top" align="left">UADFV</td>
<td valign="top" align="left">98 videos</td>
<td valign="top" align="left">2018</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Multiple GANs</td>
</tr> <tr>
<td valign="top" align="left">FaceForensics&#x0002B;&#x0002B;</td>
<td valign="top" align="left">5,000 videos</td>
<td valign="top" align="left">2019</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Multiple face manipulation methods</td>
</tr> <tr>
<td valign="top" align="left">Deep Fakes Dataset</td>
<td valign="top" align="left">142 videos</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Collection</td>
</tr> <tr>
<td valign="top" align="left">Celeb-DF v1</td>
<td valign="top" align="left">1,203 videos</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">Multiple ethnic groups</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">Celeb-DF v2</td>
<td valign="top" align="left">6,229 videos</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">Multiple ethnic groups</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">DFD</td>
<td valign="top" align="left">3,363 videos</td>
<td valign="top" align="left">2019</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">DeeperForensics-1.0</td>
<td valign="top" align="left">60,000 videos</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">26 nationalities</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">DFDC (full)</td>
<td valign="top" align="left">128,154 videos</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Face/Audio swapping &#x0002B; GANs</td>
</tr> <tr>
<td valign="top" align="left">WildDeepfake</td>
<td valign="top" align="left">7,314 face sequences</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Online collection</td>
</tr> <tr>
<td valign="top" align="left">FFIW10<italic>K</italic></td>
<td valign="top" align="left">20,000 videos</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">Multilingual</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">KoDF</td>
<td valign="top" align="left">237,942 videos</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">Korean subjects</td>
<td valign="top" align="left">Face swapping &#x0002B; Face reenactment</td>
</tr> <tr>
<td valign="top" align="left">VideoForensicsHQ</td>
<td valign="top" align="left">1,737 videos</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Deep Video Portraits (DVP)</td>
</tr> <tr>
<td valign="top" align="left">DF-W</td>
<td valign="top" align="left">1,869 videos</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Online collection</td>
</tr> <tr>
<td valign="top" align="left">FMFCC-V</td>
<td valign="top" align="left">82,392 videos</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">Asian subjects</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">DF-Platter</td>
<td valign="top" align="left">133,260 videos</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">Indian subjects</td>
<td valign="top" align="left">Face swapping</td>
</tr> <tr>
<td valign="top" align="left">AV-Deepfake1M</td>
<td valign="top" align="left">1,146,760 videos</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Face reenactment &#x0002B; Text-to-speech</td>
</tr> <tr>
<td valign="top" align="left">DFDM</td>
<td valign="top" align="left">6,450 videos</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Face swapping</td>
</tr></tbody>
</table>
</table-wrap>
<p><bold>DeepfakeTIMIT</bold> (Korshunov and Marcel, <xref ref-type="bibr" rid="B47">2019</xref>): This dataset contains 620 deepfake face videos, generated by GAN-based face swapping without manipulation of audio, covering 32 subjects and two quality levels (high and low). The videos in the dataset are recordings of people facing the camera and reciting predetermined short English sentences.</p>
<p><bold>FaceForensics (FF)</bold> (R&#x000F6;ssler et al., <xref ref-type="bibr" rid="B84">2018</xref>): This dataset contains 1,004 face videos with over 500,000 frames, covering various quality levels and two types of facial manipulation using the Face2Face approach. This dataset is now replaced by the larger FaceForensics&#x0002B;&#x0002B; dataset (see below).</p>
<p><bold>FaceForensics&#x0002B;&#x0002B; (FF&#x0002B;&#x0002B;)</bold> (R&#x000F6;ssler et al., <xref ref-type="bibr" rid="B85">2019</xref>): This dataset contains 5,000 face videos with over 1.8 million manipulated frames, including 1,000 real videos (with 509,914 frames) downloaded from YouTube, and 4,000 fake videos created using four face manipulation methods (Deepfakes, Face2Face, FaceSwap and NeuralTextures). The videos cover two genders (male and female), and three quality levels (VGA/480p, HD/720p, and FHD/1080p).</p>
<p><bold>UADFV dataset</bold> (Li et al., <xref ref-type="bibr" rid="B54">2018</xref>): This dataset contains 98 face videos: half (49) are real ones downloaded from YouTube, and the other half are fake ones generated using the FakeApp mobile application (which is now discontinued). The video dataset was created to demonstrate a deepfake video detection method based on the detection of eye-blinking behaviours, so all the videos contain at least one eye-blinking event. All fake videos were created by swapping the original face in each of the real videos with the face of the actor Nicolas Cage,<xref ref-type="fn" rid="fn0034"><sup>34</sup></xref> thus, only one subject is represented.</p>
<p><bold>Deep fakes dataset</bold> (Ciftci et al., <xref ref-type="bibr" rid="B14">2020</xref>): This dataset contains 142 &#x0201C;in the wild&#x0201D; deepfake portrait videos, collected from a range of online sources including news articles, online forums, mobile apps, and research presentations. The videos are diverse, covering the source generative model, resolution, compression, illumination, aspect ratio, frame rate, motion, pose, cosmetics, occlusion, content, and context.</p>
<p><bold>Celeb-DF v1</bold><xref ref-type="fn" rid="fn0035"><sup>35</sup></xref>: This dataset contains 1,203 face videos of celebrities, including 408 real videos collected from YouTube with subjects of different ages, ethnic groups and genders, and 795 deepfake videos synthesised from these real videos.</p>
<p><bold>Celeb-DF v2</bold> (Li Y. et al., <xref ref-type="bibr" rid="B56">2020</xref>): This dataset contains 6,229 face videos of celebrities, including 590 real videos collected from YouTube with subjects of different ages, ethnic groups and genders, and 5,639 deepfake videos synthesised from these real videos. The deepfake videos were generated by swapping faces for each pair of the 59 celebrities, using an improved DeepFake synthesis algorithm.</p>
<p><bold>DeepFake detection dataset</bold> (Dufour and Gully, <xref ref-type="bibr" rid="B22">2019</xref>): This dataset contains 3,363 face videos, covering 28 subjects, gender, and skin colour. It was created as a joint effort between two units of Google, Inc.: Google AI<xref ref-type="fn" rid="fn0036"><sup>36</sup></xref> and JigSaw.<xref ref-type="fn" rid="fn0037"><sup>37</sup></xref> The deepfake videos were generated by using publicly available face-swapping methods although no more details were provided.</p>
<p><bold>DeeperForensics-1.0</bold> (Jiang et al., <xref ref-type="bibr" rid="B40">2020</xref>): This dataset contains 60,000 indoor face videos (with 17.6 million frames) generated by face swapping, covering 100 subjects, four skin tones (white, black, yellow, and brown), two genders (male and female), different age groups (20&#x02013;45), 26 nationalities, 7 different angles, 8 face expressions, and different head poses.</p>
<p><bold>DFDC (Deepfake Detection Challenge) full dataset</bold> (Dolhansky et al., <xref ref-type="bibr" rid="B21">2020</xref>): This dataset contains 128,154 face videos of 960 subjects, including 23,654 real videos from 3,426 paid actors and 104,500 deepfake videos created using eight different methods (DF-128, DF-256, MM/NN face swap, NTH, FSGAN, StyleGAN, refinement, and audio swap).</p>
<p><bold>WildDeepfake</bold> (Zi et al., <xref ref-type="bibr" rid="B136">2020</xref>): This dataset contains 7,314 face sequences extracted from 707 deepfake videos that were collected completely from the Internet. It covers diverse scenes, multiple persons in each scene and rich facial expressions. Different from other deepfake video datasets, WildDeepfake contains only face sequences, not the full videos. This places the dataset somewhere between an image dataset and a video one. We decided to keep it in the video category since the selection process was still more video-focused.</p>
<p><bold>FFIW10<italic>K</italic> (Face Forensics in the Wild) dataset</bold> (Zhou et al., <xref ref-type="bibr" rid="B133">2021</xref>): This dataset contains 10,000 real and 10,000 high-quality forgery videos, with video- and face-level annotations. The dataset focuses on a more challenging case for forgery detection: each video involves one to 15 individuals, but only some (a minority of) faces are manipulated. The deepfake videos were generated by utilising three different face-swapping methods&#x02014;two learning-based methods, DeepFaceLab and FSGAN, as well as a graphic-based method, FaceSwap.</p>
<p><bold>Korean DeepFake Detection Dataset</bold> (Kwon et al., <xref ref-type="bibr" rid="B48">2021</xref>): This dataset contains 237,942 videos of paid subjects (395 Koreans and 8 Southeast Asians), including 62,166 real videos and 175,776 fake ones created using six methods, including three face-swapping methods (i.e., FaceSwap, DeepFaceLab, and FSGAN), one video-driven face reenactment method [i.e., First Order Motion Model (FOMM)], and two audio-driven face reenactment methods [i.e., Audio-driven Talking Face HeadPose (ATFHP) and Wav2Lip]. The videos cover a balanced gender ratio and a wide range of age groups.</p>
<p><bold>VideoForensicsHQ</bold> (Fox et al., <xref ref-type="bibr" rid="B26">2021</xref>): This dataset contains 1,737 videos with 1,666,816 frames, including 1,339,843 real frames and 326,973 fake frames generated using Deep Video Portraits (DVP) (Kim et al., <xref ref-type="bibr" rid="B46">2018</xref>), i.e., a method that enables the transfer of the head pose, facial expression, and eye motion of the source while preserving the target&#x00027;s identity and appearance. The original videos were obtained from three sources: the dataset used by Kim et al. (<xref ref-type="bibr" rid="B45">2019</xref>), the Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS) (Livingstone and Russo, <xref ref-type="bibr" rid="B58">2018</xref>), and YouTube. Most videos have a resolution of 1280 &#x000D7; 720.</p>
<p><bold>DF-W</bold> (Pu et al., <xref ref-type="bibr" rid="B79">2021a</xref>,<xref ref-type="bibr" rid="B80">b</xref>): This dataset contains 1,869 real-world deepfake videos collected from two online video portals: YouTube (1,062) and Bilibili<xref ref-type="fn" rid="fn0038"><sup>38</sup></xref> (807). The authors also collected the same number of real videos from 6 research community datasets for the results reported in their paper (Pu et al., <xref ref-type="bibr" rid="B79">2021a</xref>), but they chose not to release such videos as part of DF-W.</p>
<p><bold>FMFCC-V</bold> (Li G. et al., <xref ref-type="bibr" rid="B50">2022a</xref>,<xref ref-type="bibr" rid="B51">b</xref>): This dataset contains 38,102 deepfake videos and 44,290 pristine videos, corresponding to over 23 million frames. It was created by a group of Chinese researchers from the State Key Laboratory of Information Security, Institute of Information Engineering, Chinese Academy of Sciences, for the accompanying video track of the first Fake Media Forensics Challenge of the China Society of Image and Graphics. The source videos were collected from 83 paid Asian (likely all Chinese) individuals. Then, the synthesised videos were generated by leveraging four face-swapping methods&#x02014;Faceswap, Faceswap-GAN, DeepFaceLab, and Recycle-GAN.</p>
<p><bold>DF-Platter</bold> (Narayan et al., <xref ref-type="bibr" rid="B71">2023a</xref>,<xref ref-type="bibr" rid="B72">b</xref>): This dataset contains 764 real videos of 454 Indian individuals, and 133,260 deepfake videos generated using three state-of-the-art synthesis methods: FSGAN (Nirkin et al., <xref ref-type="bibr" rid="B76">2019</xref>), FaceSwap, and FaceShifter (Li L. et al., <xref ref-type="bibr" rid="B53">2020</xref>). The videos were collected in the wild, particularly from YouTube, considering many diversity factors, such as gender, orientation, skin tone, face size (counted in pixels), lighting conditions, background, and the presence of occlusion. Each video is approximately 20 seconds long.</p>
<p><bold>AV-Deepfake1M</bold> (Cai et al., <xref ref-type="bibr" rid="B10">2023a</xref>,<xref ref-type="bibr" rid="B11">b</xref>): This dataset contains over 1 million videos (286,721 real and 860,039 fake), corresponding to 1,886 hours of audio-visual data, generated from 2,068 unique subjects. The dataset covers different video manipulation techniques, including fake audio over fake visuals, fake audio over real visuals, and real audio over fake visuals. The fake audios were generated by an identity-dependent text-to-speech method, VITS, while the TalkLip model was used for face reenactment to generate lip-synchronised fake visual frames.</p>
<p><bold>Deepfakes from different models</bold> (Jia et al., <xref ref-type="bibr" rid="B38">2022a</xref>,<xref ref-type="bibr" rid="B39">b</xref>): This dataset contains 6,450 face-swap deepfake videos generated by five different Autoencoder models based on the Faceswap<xref ref-type="fn" rid="fn0039"><sup>39</sup></xref> software (i.e., Faceswap, Lightweight, IAE, Dfaker, and DFL-H128). For the generation of deepfake videos, real videos in the Celeb-DF dataset were used. The dataset is available upon request.</p>
</sec>
<sec>
<title>4.3 Deepfake audio/speech datasets</title>
<p><xref ref-type="table" rid="T4">Table 4</xref> shows basic information about the audio/speech datasets covered.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Deepfake-related audio datasets (<bold>VC</bold>: Voice Conversion, <bold>TTS</bold>: Text-to-speech, <bold>AS</bold>: Audio Splicing, <bold>SA</bold>: Spoofing Attack, <bold>VS</bold>: Voice Synthesis, <bold>SE</bold>: Speech Enhancement, <bold>CMF</bold>: Copy-Move Forgery).</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Size</bold></th>
<th valign="top" align="left"><bold>Year</bold></th>
<th valign="top" align="left"><bold>Language</bold></th>
<th valign="top" align="left"><bold>Generation method</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">VCC 2016 dataset</td>
<td valign="top" align="left">3,078 utterances</td>
<td valign="top" align="left">2016</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">VC</td>
</tr> <tr>
<td valign="top" align="left">VCC 2018 dataset</td>
<td valign="top" align="left">2,582 utterances</td>
<td valign="top" align="left">2018</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">VC</td>
</tr> <tr>
<td valign="top" align="left">VCC 2020 dataset</td>
<td valign="top" align="left">3,505 utterances</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">English, Finnish, German, Chinese</td>
<td valign="top" align="left">VC</td>
</tr> <tr>
<td valign="top" align="left">ASVspoof 2019 dataset (LA task)</td>
<td valign="top" align="left">121,461 utterances</td>
<td valign="top" align="left">2019</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">VS, VC</td>
</tr> <tr>
<td valign="top" align="left">ASVspoof 2021 dataset (LA task)</td>
<td valign="top" align="left">164,640 utterances</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">VC, TTS, Hybrid SA</td>
</tr> <tr>
<td valign="top" align="left">ASVspoof 2021 dataset (DF task)</td>
<td valign="top" align="left">593,253 utterances</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">SA</td>
</tr> <tr>
<td valign="top" align="left">ReMASC</td>
<td valign="top" align="left">54,712 recordings</td>
<td valign="top" align="left">2019</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">VS</td>
</tr> <tr>
<td valign="top" align="left">FSD</td>
<td valign="top" align="left">650 songs</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Chinese</td>
<td valign="top" align="left">VC, VS</td>
</tr> <tr>
<td valign="top" align="left">DECRO</td>
<td valign="top" align="left">118,382 utterances</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">English, Chinese</td>
<td valign="top" align="left">SA</td>
</tr> <tr>
<td valign="top" align="left">WaveFake</td>
<td valign="top" align="left">134,268 audios</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">English, Japanese</td>
<td valign="top" align="left">TTS</td>
</tr> <tr>
<td valign="top" align="left">HAD</td>
<td valign="top" align="left">160,836 audios</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">Chinese</td>
<td valign="top" align="left">TTS, AS</td>
</tr> <tr>
<td valign="top" align="left">ADD 2022 dataset</td>
<td valign="top" align="left">154,949 audios</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">Chinese</td>
<td valign="top" align="left">VS, VC</td>
</tr> <tr>
<td valign="top" align="left">CMFD</td>
<td valign="top" align="left">5,600 audios</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">English, Chinese</td>
<td valign="top" align="left">CMF</td>
</tr> <tr>
<td valign="top" align="left">In-the-Wild</td>
<td valign="top" align="left">31,779 audios</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">Online collection</td>
</tr> <tr>
<td valign="top" align="left">SceneFake</td>
<td valign="top" align="left">84,480 audios</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">SE</td>
</tr> <tr>
<td valign="top" align="left">EmoFake</td>
<td valign="top" align="left">88,200 audios</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">VC</td>
</tr> <tr>
<td valign="top" align="left">PartialSpoof</td>
<td valign="top" align="left">121,461 audios</td>
<td valign="top" align="left">2021&#x02013;2022</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">VC, TTS, AS</td>
</tr> <tr>
<td valign="top" align="left">CFAD</td>
<td valign="top" align="left">331,600 audios</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Chinese</td>
<td valign="top" align="left">TTS, AS</td>
</tr> <tr>
<td valign="top" align="left">ADD 2023 dataset</td>
<td valign="top" align="left">517,068 utterances</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Chinese</td>
<td valign="top" align="left">VC, TTS, AS</td>
</tr> <tr>
<td valign="top" align="left">MLAAD</td>
<td valign="top" align="left">163.9 hours of synthetic voice</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">23 languages</td>
<td valign="top" align="left">TTS</td>
</tr></tbody>
</table>
</table-wrap>
<p>Voice conversion (VC) is a technology that can be used to modify an audio or speech sample so that it appears as if spoken by a different (target) person than the original (source) speaker. Obviously, it can be used to generate deepfake audio/speech samples. The biennial Voice Conversion Challenge<xref ref-type="fn" rid="fn0040"><sup>40</sup></xref> that started in 2016 is a major challenge series on VC. Datasets released from this challenge series are very different from other deepfake datasets: the deepfake data is not included in the original dataset created by the organisers of each challenge, but in the participant submissions (which are retargeted/fake utterances produced by VC systems built by participants). The challenge datasets also include the evaluation (listening-based) results of all submissions. Some fake utterances may be produced by DL-based VC systems, so we consider all datasets from this challenge series relevant for the purpose of this survey.</p>
<p><bold>Voice conversion challenge 2016 database</bold> (Toda et al., <xref ref-type="bibr" rid="B98">2016</xref>): The original dataset created by the challenge organisers was derived from the DAPS (Device and Produced Speech) Dataset (Mysore, <xref ref-type="bibr" rid="B70">2015</xref>). It contains 216 utterances (162 for training and 54 for testing) per speaker from 10 speakers. Participating teams (17) developed their own VC systems for all 25 source-target speaker pairs and then submitted generated utterances for evaluation. At least six participating teams used DL-related techniques (LSTM, DNN) in their VC systems (see Table 2 of the result analysis paper<xref ref-type="fn" rid="fn0041"><sup>41</sup></xref>), so the submitted utterances can certainly be considered deepfakes.</p>
<p><bold>Voice conversion challenge 2018 database</bold> (Lorenzo-Trueba et al., <xref ref-type="bibr" rid="B61">2018</xref>): The original dataset created by the challenge organisers was also based on the DAPS dataset. It contains 116 utterances (81 for training and 35 for testing) per speaker from 12 speakers in two different tasks (called Hub and Spoke). Participating teams (23 in total, all for Hub and 11 for Spoke) developed their own VC systems for all 16 source-target speaker pairs and then submitted generated utterances for evaluation. Compared to the 2016 challenge, more participating teams used DL-related techniques (e.g., WaveNet, LSTM, DNN, CycleGAN, DRM &#x02013; deep relational models, and ARBM &#x02013; adaptive restricted Boltzmann machines) in their VC systems.</p>
<p><bold>Voice conversion challenge 2020 database</bold> (Yi et al., <xref ref-type="bibr" rid="B119">2020</xref>): This dataset is based on the Effective Multilingual Interaction in Mobile Environments (EMIME) dataset,<xref ref-type="fn" rid="fn0042"><sup>42</sup></xref> a bilingual (Finnish/English, German/English, and Mandarin/English) database. It contains 145 utterances (120 for training and 25 for testing) per speaker from 14 speakers for two different tasks (with 4 &#x000D7; 4 and 4 &#x000D7; 6 source-target speaker pairs, respectively). Participating teams (33 in total, out of which 31 for Task 1 and 28 for Task 2) developed their own VC systems for all source-target speaker pairs and then submitted generated utterances for evaluation. Compared to the 2018 challenge, DL-based VC systems were overwhelmingly used by almost all participating teams (i.e., WaveNet and WaveGAN were among the most used DL-based building blocks).</p>
<p>A major set of deepfake speech datasets was created for the <bold>ASVspoof</bold> (Automatic Speaker Verification Spoofing and Countermeasures) Challenge<xref ref-type="fn" rid="fn0043"><sup>43</sup></xref> (2015&#x02013;2021, held biennially). The datasets for the 2019 and 2021 challenges contain speech data that can be considered deepfakes.</p>
<p><bold>ASVspoof 2019 challenge database</bold> (Wang et al., <xref ref-type="bibr" rid="B104">2020</xref>): This dataset is based on the Voice Cloning Toolkit (VCTK) corpus,<xref ref-type="fn" rid="fn0044"><sup>44</sup></xref> a multi-speaker English speech database captured from 107 speakers (46 males and 61 females). Two attack scenarios were considered: logical access (LA) involving spoofed (synthetic or converted) speech, and physical access (PA) involving replay attacks of previously recorded bona fide recordings. For our purpose in this survey, the LA scenario is more relevant. The LA part of the dataset includes 12,483 bona fide (real) utterances and 108,978 spoofed utterances. Some of the spoofed speech data for the LA scenario were produced using a generative model involving DL-based techniques such as long short-term memory (LSTM),<xref ref-type="fn" rid="fn0045"><sup>45</sup></xref> WaveNet (Oord et al., <xref ref-type="bibr" rid="B102">2016</xref>), WaveRNN (Kalchbrenner et al., <xref ref-type="bibr" rid="B41">2018</xref>), and WaveCycleGAN2 (Tanaka et al., <xref ref-type="bibr" rid="B95">2019</xref>). Note that the challenge organisers did not use the term &#x0201C;deepfake&#x0201D; explicitly, despite the fact that the DL-generated spoofed speech data can be considered as deepfakes.</p>
<p><bold>ASVspoof 2021 challenge&#x02014;logical access database</bold> (Delgado et al., <xref ref-type="bibr" rid="B16">2021a</xref>; Liu et al., <xref ref-type="bibr" rid="B57">2023</xref>): This dataset contains 16,492 bona fide and 148,148 spoofed speech data for the logical access (LA) task. The bona fide speech data in the dataset were sourced from the ASVspoof 2019 LA evaluation database, and the spoofed data were generated by leveraging 13 voice conversion, text-to-speech, and hybrid spoofing attack algorithms.</p>
<p><bold>ASVspoof 2021 challenge&#x02014;speech deepfake database</bold> (Delgado et al., <xref ref-type="bibr" rid="B17">2021b</xref>; Liu et al., <xref ref-type="bibr" rid="B57">2023</xref>): In 2021, the challenge included an explicitly defined track on deepfake, but the task description suggests that the organisers of the challenge considered a broader definition of the term &#x0201C;deepfake&#x0201D; by looking at spoofing human listeners rather than ASV (Automatic Speaker Verification) systems. The dataset includes 20,637 bona fide and 572,616 spoofed speech data. Other than the ASVspoof 2019 LA evaluation database, the bona fide speech data in this dataset were also sourced from VCC 2018 and VCC 2020 datasets. When it comes to spoofed data generation, over 100 different spoofing attack algorithms were used.</p>
<p><bold>ReMASC</bold> (Gong et al., <xref ref-type="bibr" rid="B29">2019a</xref>,<xref ref-type="bibr" rid="B30">b</xref>): This dataset contains 9,240 genuine and 45,472 replayed recordings of voice commands in English performed by English, Chinese, and Indian native speakers. Although the dataset mainly focuses on replay attacks rather than synthesised speech, it also contains fake audio samples generated using two speech synthesis methods. The dataset also includes a quick evaluation set, which is a small but representative dataset with around 2,000 samples, that can be used for the quick evaluation of developed audio deepfake detection models.</p>
<p><bold>Fake song detection dataset</bold> (Xie et al., <xref ref-type="bibr" rid="B111">2023a</xref>,<xref ref-type="bibr" rid="B112">b</xref>): This dataset contains 200 real and 450 fake songs in Chinese. To generate fake songs, initially, fake singing voices were generated by five state-of-the-art singing voice synthesis (DiffSinger) and singing voice conversion methods (RVC and three variations of SO-VITS). Then, instrumental tracks were extracted from real songs and combined with fake singing voices.</p>
<p><bold>DEepfake CROss-lingual dataset</bold> (Ba et al., <xref ref-type="bibr" rid="B6">2023a</xref>,<xref ref-type="bibr" rid="B7">b</xref>): This dataset consists of English and Chinese speech samples. The English part of the dataset contains 12,484 real and 42,800 fake utterances while the Chinese part consists of 21,218 real and 41,880 fake utterances. For the real speech samples, six Chinese recording datasets and the ASVspoof2019 LA (Wang et al., <xref ref-type="bibr" rid="B104">2020</xref>) dataset were used. The fake utterances, however, were generated by leveraging 10 different spoofing attack algorithms, including HiFiGAN, Multiband-MelGAN, PWG, Tacotron, FastSpeech2, StarGANv2, VITS, NVCNet, Baidu, and Xunfei.</p>
<p><bold>WaveFake</bold> (Frank and Sch&#x000F6;nherr, <xref ref-type="bibr" rid="B27">2021a</xref>,<xref ref-type="bibr" rid="B28">b</xref>): This dataset contains 16,283 real audio clips and 117,985 fake audio clips in English and Japanese, which corresponds to 175 h of audio data. The reference datasets for real audio clips include LJSPEECH for English and JSUT for Japanese. For the generation of fake audio clips, six different GAN-based architectures (i.e., MelGAN, PWG, Multi-band MelGAN, Full-band MelGAN, HiFi-GAN, and WaveGlow) were leveraged.</p>
<p><bold>Half-truth audio detection dataset</bold> (Yi et al., <xref ref-type="bibr" rid="B114">2021</xref>): This dataset focuses on the attack scenarios in which the attacker hides small fake audio clips in real speech audio. The dataset comprises 53,612 real audio clips and 107,224 fake audio clips in Chinese. The fake audio clips in the dataset cover both fully fake and partially fake samples. While the fully fake samples were generated utilising a text-to-speech method, namely GST Tacotron,<xref ref-type="fn" rid="fn0046"><sup>46</sup></xref> the partially fake samples were based on the manipulation of the audio segments corresponding to selected keywords in the original utterances.</p>
<p><bold>ADD 2022 dataset</bold> (Yi et al., <xref ref-type="bibr" rid="B115">2022a</xref>): This is the original dataset of the first audio deep synthesis detection challenge, held in 2022. The dataset covers data for the three tracks of the challenge&#x02014;low-quality fake audio detection (LF), partially fake audio detection (PF), and audio fake game (FG). In total, the dataset comprises 16,257 real audio samples and 138,692 fake audio samples in Chinese. The dataset is based on common Mandarin speech datasets, including AISHELL-1, AISHELL-3, and AISHELL-4, and the fake audios were generated using mainstream speech synthesis and voice conversion methods.</p>
<p><bold>Copy-Move forged dataset</bold> (Su et al., <xref ref-type="bibr" rid="B92">2023a</xref>,<xref ref-type="bibr" rid="B93">b</xref>): This dataset contains 3,600 real audio samples and 2,000 fake audio samples in two languages &#x02013; English and Chinese. The dataset was based on Librispeech and Chinspeech speech datasets, and the fake samples were generated by applying copy-move forgery to the samples of these two datasets.</p>
<p><bold>In-the-Wild</bold> (M&#x000FC;ller et al., <xref ref-type="bibr" rid="B68">2022</xref>): This dataset focuses on detecting audio deepfakes for politicians and celebrities. It comprises 19,963 real speeches and 11,816 fake audio samples of 58 public figures, derived from publicly available sources. In total, the dataset corresponds to 20.8 h of benign and 17.2 h of spoofed audio data.</p>
<p><bold>SceneFake</bold> (Yi et al., <xref ref-type="bibr" rid="B117">2022b</xref>, <xref ref-type="bibr" rid="B118">2024</xref>): This dataset focuses on detecting the manipulation of the acoustic scenes (e.g., airport, street, or shopping) of audio samples. It consists of 19,838 real and 64,642 fake audio samples in English. The dataset was constructed based on the ASVspoof 2019 LA (Wang et al., <xref ref-type="bibr" rid="B104">2020</xref>) dataset and the acoustic scene dataset from the DCASE2022<xref ref-type="fn" rid="fn0047"><sup>47</sup></xref> challenge. The fake samples were generated by leveraging different speech enhancement models, including SSub, MMSE, Wiener, FullSubNet, WaveU-Net, and GCRN.</p>
<p><bold>Chinese fake audio detection dataset</bold> (Ma et al., <xref ref-type="bibr" rid="B63">2023a</xref>,<xref ref-type="bibr" rid="B64">b</xref>): This dataset consists of 115,800 real and 115,800 fake audio samples in Chinese. The audio samples in the dataset were generated by using the samples taken from 1,302 speakers. The real audio samples were sourced from six different datasets, and twelve types of fake audio samples were generated from the real samples.</p>
<p><bold>EmoFake</bold> (Zhao et al., <xref ref-type="bibr" rid="B130">2023a</xref>,<xref ref-type="bibr" rid="B131">b</xref>): This dataset was designed for the detection of the changed emotion state of original audios. It comprises 35,000 real and 53,200 fake audio samples in English and supports five emotion states &#x02013; <italic>neutral, happy, sad, angry</italic>, and <italic>surprise</italic>. The fake audio samples were generated by leveraging seven open-source emotion voice conversion models&#x02014;VAW-GAN-CWT, DeepEST, Seq2Seq-EVC, CycleGAN-EVC, CycleTransGAN, EmoCycle-GAN, and StarGAN-EVC.</p>
<p><bold>PartialSpoof</bold> (Zhang et al., <xref ref-type="bibr" rid="B124">2021a</xref>,<xref ref-type="bibr" rid="B125">b</xref>): Similar to HAD, this dataset helps the detection of partially fake audio samples. It covers 12,483 real and 108,978 fake samples in English. The real samples in the dataset were collected from the ASVspoof 2019 LA dataset. Then, fake audios were generated by spoofing speech segments of the bona fide audio through voice conversion, text-to-speech, and audio splicing.</p>
<p><bold>ADD 2023 Dataset</bold> (Yi et al., <xref ref-type="bibr" rid="B116">2023</xref>): This is the dataset for the second Audio Deepfake Detection Challenge. The challenge had three main tracks &#x02013; audio fake game (FG), manipulation region location (RL), and deepfake algorithm recognition (AR). In total, the dataset involves 243,194 real utterances and 273,874 fake utterances. The dataset contains utterances from the ADD 2022 Dataset as well as more fake audios generated by manipulating the original utterances with real or synthesised audios.</p>
<p><bold>Multi-language audio anti-spoofing dataset</bold> (M&#x000FC;ller et al., <xref ref-type="bibr" rid="B69">2024</xref>): This dataset is based on MAILABS Speech Dataset and covers 163.9 h of synthetic voice. While the real audio clips in the dataset involve human speeches in eight languages (i.e., English, French, German, Italian, Polish, Russian, Spanish, and Ukrainian), the dataset was enriched with computer-generated audio clips in 23 languages. For the generation of fake clips, 54 text-to-speech models comprising 21 different architectures were utilised.</p>
</sec>
<sec>
<title>4.4 Deepfake text datasets</title>
<p><xref ref-type="table" rid="T5">Table 5</xref> shows basic information about the text datasets covered.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Deepfake-related text datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Size</bold></th>
<th valign="top" align="left"><bold>Year</bold></th>
<th valign="top" align="left"><bold>Language</bold></th>
<th valign="top" align="left"><bold>Generation method</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">gpt-2-output-dataset</td>
<td valign="top" align="left">2,340,000 texts</td>
<td valign="top" align="left">2019</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">GPT-2</td>
</tr> <tr>
<td valign="top" align="left">TweepFake</td>
<td valign="top" align="left">25,572 tweets</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">Fake account collection</td>
</tr> <tr>
<td valign="top" align="left">TuringBench</td>
<td valign="top" align="left">168,612 news articles</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">19 LLMs</td>
</tr> <tr>
<td valign="top" align="left">GeneratedTextDetection</td>
<td valign="top" align="left">200 scientific papers &#x0002B; 200 abstracts</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">GPT-2</td>
</tr> <tr>
<td valign="top" align="left">ToxiGen</td>
<td valign="top" align="left">274,000 sentences</td>
<td valign="top" align="left">2022</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">GPT-3</td>
</tr> <tr>
<td valign="top" align="left">MAGE</td>
<td valign="top" align="left">447,674 texts</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">27 LLMs</td>
</tr> <tr>
<td valign="top" align="left">AI-Writer</td>
<td valign="top" align="left">2,000 news articles</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Text-generation service</td>
</tr> <tr>
<td valign="top" align="left">ArticleForge</td>
<td valign="top" align="left">2,000 news articles</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Text-generation service</td>
</tr> <tr>
<td valign="top" align="left">Kafkai</td>
<td valign="top" align="left">2,000 articles</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Text-generation service</td>
</tr> <tr>
<td valign="top" align="left">RedditBot</td>
<td valign="top" align="left">1,774 comments</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">GPT-3 powered bot</td>
</tr> <tr>
<td valign="top" align="left">IDMGSP</td>
<td valign="top" align="left">29,000 scientific papers</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">Context-free grammar, LLMs</td>
</tr> <tr>
<td valign="top" align="left">AIRABIC</td>
<td valign="top" align="left">1,000 texts</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Arabic</td>
<td valign="top" align="left">ChatGPT</td>
</tr> <tr>
<td valign="top" align="left">HC3</td>
<td valign="top" align="left">125,230 answers to 37,175 questions</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">English, Chinese</td>
<td valign="top" align="left">ChatGPT</td>
</tr> <tr>
<td valign="top" align="left">Deepfake-BG</td>
<td valign="top" align="left">9,824 social media posts</td>
<td valign="top" align="left">2023</td>
<td valign="top" align="left">Bulgarian</td>
<td valign="top" align="left">GPT-2, ChatGPT</td>
</tr> <tr>
<td valign="top" align="left">HC3 Plus</td>
<td valign="top" align="left">214,498 texts</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">English, Chinese</td>
<td valign="top" align="left">ChatGPT</td>
</tr> <tr>
<td valign="top" align="left">CHEAT</td>
<td valign="top" align="left">50,699 abstracts</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">English</td>
<td valign="top" align="left">ChatGPT</td>
</tr></tbody>
</table>
</table-wrap>
<p><bold>gpt-2-output-dataset</bold> (OpenAI, <xref ref-type="bibr" rid="B77">2019</xref>): This dataset contains outputs generated by GPT-2 trained on OpenAI&#x00027;s WebText training set. It comprises 250K training, 5K test, and 5K validation samples for each of the nine different GPT-2 models trained, for a total of 2,340,000 text samples.</p>
<p><bold>TweepFake</bold> (Fagni et al., <xref ref-type="bibr" rid="B24">2021a</xref>,<xref ref-type="bibr" rid="B25">b</xref>): This dataset consists of 25,572 tweets collected from 23 bots, imitating 17 human accounts. The bots leveraged different methods for generating fake tweets&#x02014;most were based on GPT-2 and RNN although other methods were also used, including Markov Chains and LSTM.</p>
<p><bold>TuringBench</bold> (Uchendu et al., <xref ref-type="bibr" rid="B101">2021</xref>): Focusing on the tasks of the Turing Test and Authorship Attribution for language models, this dataset is based on 10,000 news articles (mostly about politics). The titles of these articles were used to prompt 19 different LLMs (e.g., GPTs, GROVER, CTRL, XLM, PPLM etc.) to generate synthetic news articles similar to the original ones. After preprocessing, the size of the final dataset was reduced to 168,612.</p>
<p><bold>GeneratedTextDetection</bold> (Liyanage et al., <xref ref-type="bibr" rid="B59">2022a</xref>,<xref ref-type="bibr" rid="B60">b</xref>): This dataset contains two different corpora &#x02013; 100 research papers collected from arXiv.org with 100 fully synthetic papers generated using GPT-2 and 100 original abstracts from arXiv.org preprints with 100 abstracts semi-automatically manipulated (i.e., keeping some sentences from the original content) using a GPT-2-based model, Arxiv-NLP.</p>
<p><bold>ToxiGen</bold> (Hartvigsen et al., <xref ref-type="bibr" rid="B33">2022</xref>): Being constructed for hate speech detection, this English dataset contains 137,000 benign and 137,000 toxic sentences, both generated using GPT-3. The sentences in the dataset mention 13 different minority groups without using explicit language.</p>
<p><bold>MAGE (formerly, DeepfakeTextDetect)</bold> (Li et al., <xref ref-type="bibr" rid="B55">2024</xref>): This dataset used different types of real texts collected from existing datasets, including opinion statements, news articles, answers to questions, stories, sentence sets for reasoning, Wikipedia paragraphs, and abstracts of scientific articles. Then, synthetic texts were generated using 27 LLMs with three different prompts. In total, it contains 447,674 human-written and machine-generated texts.</p>
<p><bold>AI-Writer</bold> (Pu et al., <xref ref-type="bibr" rid="B81">2023</xref>): This dataset utilised AI-Writer,<xref ref-type="fn" rid="fn0048"><sup>48</sup></xref> i.e., an online commercial service for generating fake news articles, to generate 1,000 synthetic articles from 1,000 real news titles sampled from the RealNews dataset.</p>
<p><bold>ArticleForge</bold> (Pu et al., <xref ref-type="bibr" rid="B81">2023</xref>): Similar to AI-Writer, this dataset was constructed using a commercial text generation service, ArticleForge,<xref ref-type="fn" rid="fn0049"><sup>49</sup></xref> which requires a set of keywords to generate fake news articles. The dataset contains 1,000 real and 1,000 synthetic news articles.</p>
<p><bold>Kafkai</bold> (Pu et al., <xref ref-type="bibr" rid="B81">2023</xref>): This dataset leveraged the Kafkai commercial text generation service, which generates synthetic articles from a chosen category and an initial priming text. For the construction of the dataset, 100 real articles on 10 of the 25 available categories, such as cyber security, SEO, and marketing, were used to generate 1,000 fake articles.</p>
<p><bold>RedditBot</bold> (Pu et al., <xref ref-type="bibr" rid="B81">2023</xref>): This dataset is based on a GPT-3 powered bot posting comments on a popular subreddit on Reddit.com. In total, 887 comments posted by the bot were collected as synthetic comments, while the same number of real comments were sampled randomly from the forum threads with a bot comment.</p>
<p><bold>Identifying machine-generated scientific papers</bold> (Mosca et al., <xref ref-type="bibr" rid="B67">2023</xref>): This dataset contains 16,000 real scientific papers retrieved from arXiv.org and 13,000 fake papers generated by utilising different methods. These methods involve LLMs, i.e., GPT-2, GPT-3, ChatGPT, and Galactica, as well as SCIgen which is based on context-free grammars. In total, the dataset consists of 25 million tokens.</p>
<p><bold>AIRABIC</bold> (Alshammari and El-Sayed, <xref ref-type="bibr" rid="B5">2023a</xref>,<xref ref-type="bibr" rid="B4">b</xref>): This Arabic dataset includes 500 human-written texts and 500 ChatGPT-generated texts. The human-written texts were sourced from passages from books and news articles, covering both classical and modern standard Arabic. Moreover, the dataset contains different text variations, including single and multi-paragraph compositions, bullet points, and passages with in-text citations.</p>
<p><bold>Human ChatGPT comparison corpus</bold> (Guo et al., <xref ref-type="bibr" rid="B31">2023a</xref>,<xref ref-type="bibr" rid="B32">b</xref>): This bilingual (i.e., English and Chinese) dataset involves questions as well as answers to them provided by humans and ChatGPT to enable a comparison between human and ChatGPT answers to the same questions. In total, it covers 37,175 questions, 80,805 human answers, and 44,425 ChatGPT answers. The questions and human answers were collected from four English and six Chinese question-answering datasets. In addition, more question-answer pairs were constructed by crawling concept-explanation pairs from online encyclopaedias, Wikipedia<xref ref-type="fn" rid="fn0050"><sup>50</sup></xref> and BaiduBaike.<xref ref-type="fn" rid="fn0051"><sup>51</sup></xref></p>
<p><bold>Deepfake-BG</bold> (Temnikova et al., <xref ref-type="bibr" rid="B96">2023</xref>): This dataset consists of 4,912 human-written and 4,912 synthetic social media posts in Bulgarian. The human-written posts were randomly selected from multiple large datasets. The other posts were generated by leveraging two different LLMs&#x02014;a new Bulgarian GPT-2 model, called GPT-WEB-BG, and ChatGPT for Bulgarian.</p>
<p><bold>HC3 Plus</bold> (Su et al., <xref ref-type="bibr" rid="B94">2024</xref>): This is an extended version of the HC3 dataset, covering translation, summarisation, and paraphrasing tasks. It contains 144,528 English texts and 69,970 Chinese texts, all generated by using ChatGPT. In terms of types of texts, the dataset involves news articles, social media posts, and questions.</p>
<p><bold>CHatGPT-writtEn AbsTract</bold> (Yu et al., <xref ref-type="bibr" rid="B121">2024a</xref>,<xref ref-type="bibr" rid="B122">b</xref>): This dataset contains 15,395 human-written abstracts and 35,304 ChatGPT-written abstracts. While human-written abstracts were collected from IEEE Xplore, ChatGPT was utilised to generate three types of synthetic abstracts&#x02014;fully synthetic based on title and keywords, polished version of human-written abstracts, and a mixture of human-written abstracts and their polished versions.</p>
</sec>
<sec>
<title>4.5 Hybrid deepfake datasets</title>
<p><xref ref-type="table" rid="T6">Table 6</xref> shows basic information about the hybrid datasets covered.</p>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Hybrid deepfake-related datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="left"><bold>Size</bold></th>
<th valign="top" align="left"><bold>Year</bold></th>
<th valign="top" align="left"><bold>Language/Ethnicity</bold></th>
<th valign="top" align="left"><bold>Generation method</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">NIST open media forensics challenge datasets</td>
<td valign="top" align="left">Over 1,000 images and over 100 videos</td>
<td valign="top" align="left">2020</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">GAN</td>
</tr> <tr>
<td valign="top" align="left">ForgeryNet dataset</td>
<td valign="top" align="left">2,896,062 images and 221,247 videos</td>
<td valign="top" align="left">2021</td>
<td valign="top" align="left">Not specified</td>
<td valign="top" align="left">Face reenactment, Face editing, Face transfer, Face swapping, Face stacked manipulation</td>
</tr> <tr>
<td valign="top" align="left">Homologous deepfake dataset</td>
<td valign="top" align="left">6,802 images and 500 videos</td>
<td valign="top" align="left">2024</td>
<td valign="top" align="left">Chinese</td>
<td valign="top" align="left">Face swapping, Face reenactment, Attribute editing, GAN</td>
</tr></tbody>
</table>
</table-wrap>
<p><bold>NIST OpenMFC (Open Media Forensics Challenge) Datasets</bold><xref ref-type="fn" rid="fn0052"><sup>52</sup></xref>: These datasets were created by the DARPA Media Forensics (MediFor) Program<xref ref-type="fn" rid="fn0053"><sup>53</sup></xref> for the 2020 OpenMFC.<xref ref-type="fn" rid="fn0054"><sup>54</sup></xref> There are two GAN-generated deepfake datasets, one with more than 1,000 deepfake images and the other with over 100 deepfake videos. The datasets were made available to registered participants of the competition only.</p>
<p><bold>ForgeryNet</bold> (He et al., <xref ref-type="bibr" rid="B34">2021</xref>): This dataset is named as &#x0201C;a versatile benchmark for comprehensive forgery analysis.&#x0201D; It contains 2,896,062 images and 221,247 videos, including 1,457,861 fake images and 121,617 fake videos. The videos and images cover seven image-level and eight video-level manipulation approaches, 36 different types of perturbations and more mixed perturbations, and a large number of annotation labels (6.3 million classification labels, 2.9 million manipulated area annotations and 221,247 temporal forgery segment labels). The manipulation approaches belong to five main generation methods&#x02014;face reenactment, face editing, face transfer, face swapping, and face stacked manipulation. The dataset is being used for supporting the Face Forgery Analysis Challenge 2021<xref ref-type="fn" rid="fn0055"><sup>55</sup></xref> at the SenseHuman 2021 (3rd Workshop on Sensing, Understanding and Synthesizing Humans),<xref ref-type="fn" rid="fn0056"><sup>56</sup></xref> co-located at the ICCV 2021 conference.<xref ref-type="fn" rid="fn0057"><sup>57</sup></xref></p>
<p><bold>Homologous deepfake dataset</bold> (Xidian University, Modern Image Processing Lab, <xref ref-type="bibr" rid="B110">2024</xref>): This dataset contains 6,802 fake images, 400 fake videos, and 100 real videos. According to the developers of the dataset, it is the first facial dataset for Chinese people. The dataset covers four types of deepfake multimedia, including full-face generation, attribute editing, face reenactment, and face swap.</p>
</sec>
<sec>
<title>4.6 Deepfake dataset generators</title>
<p>Despite not being datasets per se, dataset generators are systems for producing large datasets more automatically, including generating deepfake datasets. One may argue the automatically generated datasets are fake since they are not produced from real-world scenes. In this respect, we cover here some state-of-the-art deepfake dataset generators that can be used to obtain new deepfake datasets.</p>
<p><bold>DatasetGAN</bold> (Zhang Y. et al., <xref ref-type="bibr" rid="B127">2021</xref>): This generator focuses on generating image-annotation pairs. It initially requires synthesising a small number of images with a GAN architecture and their annotation by humans. Then, an ensemble of multilayer perceptron (MLP) classifiers is trained on the pixel-wise feature vector of the used GAN architecture. This trained classifier is used as the label synthesis branch of the GAN architecture.</p>
<p><bold>BigDatasetGAN</bold> (Li D. et al., <xref ref-type="bibr" rid="B49">2022</xref>): This is an extended version of DatasetGAN, enhancing the ImageNet dataset, covering 1 million images, with pixel-wise labels. It leverages two ImageNet-pretrained models, BigGAN and VQGAN, to generate 5 samples for each of the 1,000 classes in ImageNet. Similar to DatasetGAN, these samples are manually labelled and used to train a classifier for pixel-level label synthesis.</p>
<p><bold>DatasetDM</bold> (Wu et al., <xref ref-type="bibr" rid="B109">2023</xref>): This aims to enhance the previous generators in terms of the produced image quality and generalisability. It makes use of text-to-image diffusion models (e.g., Stable Diffusion) to generate more realistic synthetic images with several perception annotations, including depth, segmentation, and human pose estimation. It also requires far fewer samples to be manually labelled, compared to the previous generators.</p>
</sec>
<sec>
<title>4.7 Subjective quality of deepfakes in different databases</title>
<p>As mentioned in Section 3.12, subjective quality evaluation is necessary to evaluate the realness, realisticness, and naturalness of deepfake media. While there has been very limited work on this topic, in 2020, Jiang et al. (<xref ref-type="bibr" rid="B40">2020</xref>) conducted a user study on the realness of deepfake videos. They recruited 100 professional participants (most of whom are computer vision researchers), who were asked to evaluate the realness of 30 randomly selected videos from 7 deepfake video datasets (DeeperForensics-1.0, UADFV, DeepFake-TIMIT, Celeb-DF, FaceForensics&#x0002B;&#x0002B;, Deep Fake Detection, and DFDC). Participants were asked to respond to the statement &#x0201C;The video clip looks real.&#x0201D; and gave scores following a five-point Likert scale (1 &#x02013; clearly disagree, 2 &#x02013; weakly disagree, 3 &#x02013; borderline, 4 &#x02013; weakly agree, 5 &#x02013; clearly agree). <xref ref-type="table" rid="T7">Table 7</xref> shows the results. Interestingly, we can see a huge difference between the realness levels of different datasets. What is probably quite surprising is that FaceForensics&#x0002B;&#x0002B;, one of the most widely used deepfake datasets, has a very low MOS score and less than 9% of participants considered the 30 selected videos as real.</p>
<table-wrap position="float" id="T7">
<label>Table 7</label>
<caption><p>Human-judged subjective quality (realness) of deepfake videos in 7 datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>MOS</bold></th>
<th valign="top" align="center"><bold>4&#x0002B; ratings (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">DeeperForensics-1.0</td>
<td valign="top" align="center">3.806</td>
<td valign="top" align="center">64.1%</td>
</tr> <tr>
<td valign="top" align="left">Celeb-DF</td>
<td valign="top" align="center">3.723</td>
<td valign="top" align="center">61.0%</td>
</tr> <tr>
<td valign="top" align="left">DFDC</td>
<td valign="top" align="center">2.539</td>
<td valign="top" align="center">23%</td>
</tr> <tr>
<td valign="top" align="left">Deep Fake Detection</td>
<td valign="top" align="center">2.518</td>
<td valign="top" align="center">21.9%</td>
</tr> <tr>
<td valign="top" align="left">UADFV</td>
<td valign="top" align="center">2.249</td>
<td valign="top" align="center">14.1%</td>
</tr> <tr>
<td valign="top" align="left">DeepFake-TIMIT</td>
<td valign="top" align="center">2.205</td>
<td valign="top" align="center">12.3%</td>
</tr> <tr>
<td valign="top" align="left">FaceForensics&#x0002B;&#x0002B;</td>
<td valign="top" align="center">1.874</td>
<td valign="top" align="center">8.4%</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The MOS scores were not reported by Jiang et al. (<xref ref-type="bibr" rid="B40">2020</xref>), but calculated by us based on the raw data shown in Table 3 of Jiang et al. (<xref ref-type="bibr" rid="B40">2020</xref>).</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>4.8 Discussion: datasets</title>
<p>Among all deepfake image and video datasets, a significant majority are about face images and videos. This is not surprising since face swapping, face attribute manipulation, and fully synthesised face images are among the hottest topics within deepfake research and real-world applications. We hope more non-face deepfake image and video datasets can be produced to support a broader range of research activities on deepfake.</p>
<p>Several datasets mentioned in this survey overlooked reporting the language(s) and/or ethnicity(ies) they cover, which could be quite useful information in many applications. For those reporting the covered language(s), the majority focused on English, followed by Chinese. This indicates the need for new deepfake datasets covering underrepresented, and especially low-resource, languages and ethnicities.</p>
<p>The subjective quality results shown in <xref ref-type="table" rid="T7">Table 7</xref> indicate that it is important to check the realness of deepfake media to support any performance evaluation or comparison. To ensure that the quality evaluation of datasets is fair, transparent and reliable, standard procedures need defining and a common pool of qualified human experts should be used.</p>
<p>Many authors of deepfake-related datasets attempted to classify such datasets into different generations. Chronologically speaking, we could broadly split such datasets into two generations: before 2019 and since 2019. Typically, datasets created before 2019 are relatively less advanced and smaller, while those created after 2019 tend to be larger, more diverse (i.e., covering more attributes), and of higher quality (i.e., produced by more advanced generative models). This can also be seen from the data in <xref ref-type="table" rid="T7">Table 7</xref>, in which the top two datasets (DeeperForensics-1.0 and Celeb-DF) fall within the new generation (2020), while others belong to the old generation. In addition to the two generations, a newer generation has also emerged in 2021: a number of very recent datasets started focusing on more realistic deepfakes (i.e., in the wild) or more specific areas of deepfakes (e.g., FFIW<sub>10<italic>K</italic></sub> focusing on multiple faces in the same video, and KoDF focusing on Korean faces). This trend shows that the deepfake research community has grown significantly in the past few years so narrower topics have also started gaining attention and interest from some researchers.</p>
<p>The introduction of conversational AI systems, especially ChatGPT, appeared as a game-changer for deepfake generation due to their high usability and accessibility. They have increasingly been used by researchers to generate deepfake datasets although their current usage is mostly limited to generating deepfake texts. However, we believe that new image, video, audio, and hybrid deepfake datasets can be constructed with such systems, considering the multimodal capabilities of the state-of-the-art generative AI models, e.g., GPT-4o.<xref ref-type="fn" rid="fn0058"><sup>58</sup></xref></p></sec>
</sec>
<sec id="s5">
<title>5 A meta-review of deepfake-related surveys</title>
<p>This section presents a meta-review of 15 selected deepfake-related survey papers published in English (Lyu, <xref ref-type="bibr" rid="B62">2020</xref>; Tolosana et al., <xref ref-type="bibr" rid="B99">2020</xref>; Tong et al., <xref ref-type="bibr" rid="B100">2020</xref>; Verdoliva, <xref ref-type="bibr" rid="B103">2020</xref>; Younus and Hasan, <xref ref-type="bibr" rid="B120">2020</xref>; Zhang T. et al., <xref ref-type="bibr" rid="B126">2020</xref>; Deshmukh and Wankhade, <xref ref-type="bibr" rid="B19">2021</xref>; Mirsky and Lee, <xref ref-type="bibr" rid="B66">2021</xref>; Nguyen et al., <xref ref-type="bibr" rid="B74">2022</xref>; Rana et al., <xref ref-type="bibr" rid="B83">2022</xref>; Seow et al., <xref ref-type="bibr" rid="B89">2022</xref>; Heidari et al., <xref ref-type="bibr" rid="B35">2023</xref>; Khanjani et al., <xref ref-type="bibr" rid="B43">2023</xref>; Masood et al., <xref ref-type="bibr" rid="B65">2023</xref>; Sandotra and Arora, <xref ref-type="bibr" rid="B87">2024</xref>). It covers the following aspects in a systematic manner: definitions and scope, performance metrics, datasets, performance comparison, key challenges and recommendations.</p>
<p>The meta-review aims to draw some high-level insights for monitoring the future development of deepfake-related technologies and their applications.</p>
<sec>
<title>5.1 Definitions and scope</title>
<p>As we discussed in Section 1.1, among researchers, practitioners and lawmakers there is no universally accepted definition of &#x0201C;deepfake&#x0201D; as a term. This is also reflected in how the authors of the 15 survey papers considered this aspect. Most authors talked about the history of deepfakes and pointed out that the term reflects the combination of &#x0201C;deep learning&#x0201D; and &#x0201C;fake,&#x0201D; but some used a broader definition, e.g., Lyu (<xref ref-type="bibr" rid="B62">2020</xref>) defined deepfake as &#x0201C;<italic>high quality fake videos and audios generated by AI algorithms</italic>.&#x0201D; Some authors also referred to deepfake-related legislations, but none of them pointed out that the definitions in some such legislations are completely different from the more technical definitions involving the use of deep learning. No authors discussed the blurred boundary between deepfakes and non-deepfakes.</p>
<p>In terms of the scope, while some authors (correctly) considered all types of media that can be produced by deepfake-related techniques (Lyu, <xref ref-type="bibr" rid="B62">2020</xref>; Tong et al., <xref ref-type="bibr" rid="B100">2020</xref>; Rana et al., <xref ref-type="bibr" rid="B83">2022</xref>; Heidari et al., <xref ref-type="bibr" rid="B35">2023</xref>; Masood et al., <xref ref-type="bibr" rid="B65">2023</xref>; Sandotra and Arora, <xref ref-type="bibr" rid="B87">2024</xref>), some considered only a narrow scope, e.g., authors of Tolosana et al. (<xref ref-type="bibr" rid="B99">2020</xref>), Younus and Hasan (<xref ref-type="bibr" rid="B120">2020</xref>), and Zhang T. et al. (<xref ref-type="bibr" rid="B126">2020</xref>) considered only videos, authors of Verdoliva (<xref ref-type="bibr" rid="B103">2020</xref>), Deshmukh and Wankhade (<xref ref-type="bibr" rid="B19">2021</xref>), Nguyen et al. (<xref ref-type="bibr" rid="B74">2022</xref>), and Seow et al. (<xref ref-type="bibr" rid="B89">2022</xref>) have only considered images and videos, and Khanjani et al. (<xref ref-type="bibr" rid="B43">2023</xref>) only considered audio deepfakes. Another phenomenon we observed is that many authors focused more on face images and videos, and authors of three surveys (Tolosana et al., <xref ref-type="bibr" rid="B99">2020</xref>; Younus and Hasan, <xref ref-type="bibr" rid="B120">2020</xref>; Deshmukh and Wankhade, <xref ref-type="bibr" rid="B19">2021</xref>) even limited the definition of &#x0201C;deepfake&#x0201D; to such a narrow scope:</p>
<list list-type="bullet">
<list-item><p>Deshmukh and Wankhade (<xref ref-type="bibr" rid="B19">2021</xref>) defined it as &#x0201C;<italic>a technology which creates fake images or videos of targeted humans by swapping their faces [by] another character saying or doing things that are not absolutely done by them and humans start believing in such fake as it is not always recognisable with the everyday human eye</italic>;&#x0201D;</p></list-item>
<list-item><p>Younus and Hasan (<xref ref-type="bibr" rid="B120">2020</xref>) considered deepfake as a technique allowing &#x0201C;<italic>any computer user to exchange the face of one person with another digitally in any video</italic>;&#x0201D; and</p></list-item>
<list-item><p>Tolosana et al. (<xref ref-type="bibr" rid="B99">2020</xref>) defined it as &#x0201C;<italic>a deep learning based technique able to create fake videos by swapping the face of a person by the face of another person</italic>.&#x0201D;</p></list-item>
</list>
<p>Such unnecessarily narrow definitions and scopes can lead to confusion and do not help exchanges between researchers and practitioners working on different types of deepfakes.</p>
<p>We call on more researchers to accept a broader definition of &#x0201C;deepfake&#x0201D; so that highly realistic/natural media of any kind generated by a sophisticated automated method (often AI-based) is considered deepfake. Here, we provide two examples of such a broader definition: the image2image (or pixel2pixel) technique (Zhu et al., <xref ref-type="bibr" rid="B135">2017</xref>) that allows the production of deepfake images and videos of any objects, and the so-called &#x0201C;deepfake geography&#x0201D; (Zhao et al., <xref ref-type="bibr" rid="B129">2021</xref>), where AI-based techniques are used to generate realistic-looking satellite images.</p>
<p>Another important fact missed or not sufficiently discussed by authors of all the surveys, except Sandotra and Arora (<xref ref-type="bibr" rid="B87">2024</xref>), is that deepfake techniques can be used for positive applications, e.g., creative arts, entertainment and protecting online users&#x00027; privacy. We call for more researchers and practitioners to follow the proposal in the 2020 Tencent AI White Paper (Tencent, <xref ref-type="bibr" rid="B97">2020</xref>) to start using the more neutral-sounding term &#x0201C;deep synthesis.&#x0201D; Accordingly, we can use different words for different types of data generated using &#x0201C;deep synthesis&#x0201D; techniques, e.g., &#x0201C;deep art,&#x0201D; &#x0201C;deep animation,&#x0201D; &#x0201C;deep music,&#x0201D; and &#x0201C;deepfake.&#x0201D; While authors of most survey papers did not recognise the positive applications of &#x0201C;deepfake&#x0201D; technologies, Seow et al. (<xref ref-type="bibr" rid="B89">2022</xref>) and Sandotra and Arora (<xref ref-type="bibr" rid="B87">2024</xref>) covered positive applications, including entertainment, business, education, art, and medicine. Other than that, some other researchers also considered such applications, e.g., organisers of the Voice Conversion Challenge 2020 (see text footnote <xref ref-type="fn" rid="fn0040"><sup>40</sup></xref>) who said the VC technology (for speech deepfake) &#x0201C;<italic>is useful in many applications, such as customizing audio book and avatar voices, dubbing, movie industry, teleconferencing, singing voice modification, voice restoration after surgery, and cloning of voices of historical persons</italic>.&#x0201D;</p>
</sec>
<sec>
<title>5.2 Performance metrics</title>
<p>Surprisingly, only two of the 15 surveys (Rana et al., <xref ref-type="bibr" rid="B83">2022</xref>; Heidari et al., <xref ref-type="bibr" rid="B35">2023</xref>) have covered performance metrics explicitly. Some directly used performance metrics to explain and compare the performance of covered deepfake generation and detection methods. The most used performance metrics include accuracy, EER, and AUC. This may be explained by the page constraints of such survey papers, which did not allow the authors to extend their coverage significantly to cover performance metrics systematically. From this perspective, our Section 3 aims to fill this gap by providing a comprehensive coverage of relevant metrics and standards, including those specific to deepfakes. The subjective quality of deepfakes is an area least covered by the surveys, which seems related to an unbalanced coverage of deepfake generation and deepfake detection in terms of performance evaluation and comparison (the former much less than the latter).</p>
</sec>
<sec>
<title>5.3 Datasets</title>
<p>Many of the 15 survey papers list a number of deepfake-related datasets, but none of them has coverage as complete as ours shown in Section 4. Firstly, none of the surveys has covered text datasets, and only three of them (Heidari et al., <xref ref-type="bibr" rid="B35">2023</xref>; Khanjani et al., <xref ref-type="bibr" rid="B43">2023</xref>; Masood et al., <xref ref-type="bibr" rid="B65">2023</xref>) mentioned audio datasets. When it comes to the coverage of image, video, and audio datasets, most surveys only listed more popular ones, instead of a more complete coverage of the available datasets. For instance, none of the surveys have covered the Voice Conversion Challenge 2016/2018/2020 datasets. In addition, more recent deepfake datasets especially those released since 2021 are also not covered by any of the surveys. We believe that our Section 4 is the most comprehensive review of deepfake-related datasets so far.</p>
<p>Some survey papers include datasets that are likely not deepfakes, e.g., Verdoliva (<xref ref-type="bibr" rid="B103">2020</xref>) covered many general fake image datasets where the manipulated images were not generated by deep learning or even AI-based methods, and some surveys mentioned ASVspoof 2015 datasets but we did not see the use of deep learning for generating data used in the dataset.</p>
</sec>
<sec>
<title>5.4 Performance comparison</title>
<p>Most surveys have a good coverage of related methods for deepfake generation and detection, but only some explicitly covered performance comparison between different methods (Tolosana et al., <xref ref-type="bibr" rid="B99">2020</xref>; Mirsky and Lee, <xref ref-type="bibr" rid="B66">2021</xref>; Seow et al., <xref ref-type="bibr" rid="B89">2022</xref>; Masood et al., <xref ref-type="bibr" rid="B65">2023</xref>; Sandotra and Arora, <xref ref-type="bibr" rid="B87">2024</xref>).</p>
<p>Due to quality issues of many deepfake-related datasets (discussed in Section 4.7), we need to treat any performance metrics and comparison of different detection methods with caution. Without testing all methods on a sufficiently large, diverse and high-quality deepfake dataset, the performance comparison results can be misleading. This highlights the importance of having more challenges, competitions and benchmarks to encourage performance comparison on standard datasets and using consistent performance metrics.</p>
</sec>
<sec>
<title>5.5 Challenges and recommendations</title>
<p>The authors of some surveys identified some key challenges and future research directions for the deepfake community.</p>
<p>Not surprisingly, how to develop more robust, scalable, generalisable and explainable deepfake detection methods is one of the most discussed key challenges and also a major future research direction (Lyu, <xref ref-type="bibr" rid="B62">2020</xref>; Tong et al., <xref ref-type="bibr" rid="B100">2020</xref>; Verdoliva, <xref ref-type="bibr" rid="B103">2020</xref>; Younus and Hasan, <xref ref-type="bibr" rid="B120">2020</xref>; Deshmukh and Wankhade, <xref ref-type="bibr" rid="B19">2021</xref>; Rana et al., <xref ref-type="bibr" rid="B83">2022</xref>; Heidari et al., <xref ref-type="bibr" rid="B35">2023</xref>; Masood et al., <xref ref-type="bibr" rid="B65">2023</xref>). Considering the arms race between deepfake generation and detection, this research direction will likely remain the hottest topic in deepfake research.</p>
<p>Some surveys (Verdoliva, <xref ref-type="bibr" rid="B103">2020</xref>; Rana et al., <xref ref-type="bibr" rid="B83">2022</xref>) mentioned fusion as a key future research direction, where &#x0201C;fusion&#x0201D; refers to combining different methods (e.g., combining multiple detectors of different types) and data sources (e.g., jointly considering audio-visual analysis) to achieve better performance for deepfake detection. Lyu (<xref ref-type="bibr" rid="B62">2020</xref>) suggested that, for the detection of deepfake videos, we need to consider video-level detection more, which can be considered fusion of detection results of all video frames.</p>
<p>The authors of many surveys (e.g., Lyu, <xref ref-type="bibr" rid="B62">2020</xref>; Younus and Hasan, <xref ref-type="bibr" rid="B120">2020</xref>; Deshmukh and Wankhade, <xref ref-type="bibr" rid="B19">2021</xref>; Masood et al., <xref ref-type="bibr" rid="B65">2023</xref>; Sandotra and Arora, <xref ref-type="bibr" rid="B87">2024</xref>) argued that better (higher-quality, more up-to-date, and more standard) deepfake datasets are needed to develop more effective deepfake detection methods. Lyu (<xref ref-type="bibr" rid="B62">2020</xref>) and Masood et al. (<xref ref-type="bibr" rid="B65">2023</xref>) also suggested that we need to consider <italic>social media laundering</italic> effects in training data and improve the evaluation of datasets. We agree with them on these points. Finally, Rana et al. (<xref ref-type="bibr" rid="B83">2022</xref>) emphasised the differences in experimental settings of existing deepfake research and suggested a unique framework to be developed for the fair evaluation of deepfake detection methods.</p>
<p>There are also other <italic>ad-hoc</italic> recommendations given by the authors of some surveys. For example, Lyu (<xref ref-type="bibr" rid="B62">2020</xref>) argued that deepfake detection should be considered a (more complicated) multi-class, multi-label and local detection problem. Tolosana et al. (<xref ref-type="bibr" rid="B99">2020</xref>) discussed specific research directions for different deepfake generation methods (face synthesis, identity swap, attribute manipulation, and expression swap). Similarly, Heidari et al. (<xref ref-type="bibr" rid="B35">2023</xref>) and Masood et al. (<xref ref-type="bibr" rid="B65">2023</xref>) provided comprehensive discussions on future trends regarding the understanding, generation, detection, and prevention of deepfakes. Regarding preventing deepfakes, Heidari et al. (<xref ref-type="bibr" rid="B35">2023</xref>) and Khanjani et al. (<xref ref-type="bibr" rid="B43">2023</xref>) mentioned that blockchains and distributed ledger technologies can be leveraged for enhanced digital content traceability and identity sovereignty. Finally, Heidari et al. (<xref ref-type="bibr" rid="B35">2023</xref>) and Nguyen et al. (<xref ref-type="bibr" rid="B74">2022</xref>) underlined the importance of considering the human aspects of deepfake detection as well as the societal impacts of deepfakes, indicating the need for more interdisciplinary research on the subject.</p></sec>
</sec>
<sec sec-type="conclusions" id="s6">
<title>6 Conclusion</title>
<p>The rapid growth in the capability to manipulate media or create synthetic media which look realistic and natural paved the way for deepfakes. At first, this paper adopted a critical approach to look at different definitions of the term &#x0201C;deepfake.&#x0201D; In that regard, we point out the different contradicting definitions and call for the wider community to consider how to define a new term that has a more consistent scope and meaning. For instance, replacing &#x0201C;deepfake&#x0201D; with &#x0201C;deep synthesis&#x0201D; can be more inclusive by embracing positive applications of deepfake techniques, e.g., in entertainment and for simulation purposes.</p>
<p>This paper provided a comprehensive overview of multiple aspects of the deepfake ecosystem drawing from the research literature and other online sources. It covers commonly used performance metrics, standards, and related datasets. It also presents a meta-review of 15 selected deepfake-related survey papers published since 2020, covering not only the above-mentioned aspects but also highlighting key challenges and recommendations.</p></sec>
</body>
<back>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>EA: Data curation, Investigation, Methodology, Validation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. VF: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Supervision, Validation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. SL: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Supervision, Validation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research, authorship, and/or publication of this article.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn0001"><p><sup>1</sup><ext-link ext-link-type="uri" xlink:href="https://www.congress.gov/bill/115th-congress/senate-bill/3805">https://www.congress.gov/bill/115th-congress/senate-bill/3805</ext-link></p></fn>
<fn id="fn0002"><p><sup>2</sup><ext-link ext-link-type="uri" xlink:href="https://www.congress.gov/bill/116th-congress/house-bill/3230">https://www.congress.gov/bill/116th-congress/house-bill/3230</ext-link></p></fn>
<fn id="fn0003"><p><sup>3</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/">https://www.iso.org/</ext-link></p></fn>
<fn id="fn0004"><p><sup>4</sup><ext-link ext-link-type="uri" xlink:href="https://www.iec.ch/">https://www.iec.ch/</ext-link></p></fn>
<fn id="fn0005"><p><sup>5</sup><ext-link ext-link-type="uri" xlink:href="https://www.itu.int/">https://www.itu.int/</ext-link></p></fn>
<fn id="fn0006"><p><sup>6</sup><ext-link ext-link-type="uri" xlink:href="https://www.itu.int/rec/T-REC-P.800.1-201607-I/en">https://www.itu.int/rec/T-REC-P.800.1-201607-I/en</ext-link></p></fn>
<fn id="fn0007"><p><sup>7</sup><ext-link ext-link-type="uri" xlink:href="https://www.itu.int/rec/T-REC-P.910-200804-I/en">https://www.itu.int/rec/T-REC-P.910-200804-I/en</ext-link></p></fn>
<fn id="fn0008"><p><sup>8</sup><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/dall-e/">https://openai.com/index/dall-e/</ext-link></p></fn>
<fn id="fn0009"><p><sup>9</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/committee/6794475.html">https://www.iso.org/committee/6794475.html</ext-link></p></fn>
<fn id="fn0010"><p><sup>10</sup><ext-link ext-link-type="uri" xlink:href="http://www.iso.org/iso/jtc1_home.html">http://www.iso.org/iso/jtc1_home.html</ext-link></p></fn>
<fn id="fn0011"><p><sup>11</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/standard/77609.html">https://www.iso.org/standard/77609.html</ext-link></p></fn>
<fn id="fn0012"><p><sup>12</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/standard/79799.html">https://www.iso.org/standard/79799.html</ext-link></p></fn>
<fn id="fn0013"><p><sup>13</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/standard/82570.html">https://www.iso.org/standard/82570.html</ext-link></p></fn>
<fn id="fn0014"><p><sup>14</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/committee/313770.html">https://www.iso.org/committee/313770.html</ext-link></p></fn>
<fn id="fn0015"><p><sup>15</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/standard/73515.html">https://www.iso.org/standard/73515.html</ext-link></p></fn>
<fn id="fn0016"><p><sup>16</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/standard/53227.html">https://www.iso.org/standard/53227.html</ext-link></p></fn>
<fn id="fn0017"><p><sup>17</sup><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/standard/67381.html">https://www.iso.org/standard/67381.html</ext-link></p></fn>
<fn id="fn0018"><p><sup>18</sup><ext-link ext-link-type="uri" xlink:href="https://sites.google.com/qq.com/face-anti-spoofing/">https://sites.google.com/qq.com/face-anti-spoofing/</ext-link></p></fn>
<fn id="fn0019"><p><sup>19</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/592McAvoy/fake-face-detection&#x00023;user-content-i-dataset">https://github.com/592McAvoy/fake-face-detection&#x00023;user-content-i-dataset</ext-link></p></fn>
<fn id="fn0020"><p><sup>20</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/santaboi/Diffusion-Deepfake-Detection-Datasets_2023">https://github.com/santaboi/Diffusion-Deepfake-Detection-Datasets_2023</ext-link></p></fn>
<fn id="fn0021"><p><sup>21</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/media-sec-lab/Audio-Deepfake-Detection?tab=readme-ov-file&#x00023;Datasets">https://github.com/media-sec-lab/Audio-Deepfake-Detection?tab=readme-ov-file&#x00023;Datasets</ext-link></p></fn>
<fn id="fn0022"><p><sup>22</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/MarekKowalski/FaceSwap/">https://github.com/MarekKowalski/FaceSwap/</ext-link></p></fn>
<fn id="fn0023"><p><sup>23</sup><ext-link ext-link-type="uri" xlink:href="https://generated.photos/datasets">https://generated.photos/datasets</ext-link></p></fn>
<fn id="fn0024"><p><sup>24</sup><ext-link ext-link-type="uri" xlink:href="https://generated.photos/face-generator/new">https://generated.photos/face-generator/new</ext-link></p></fn>
<fn id="fn0025"><p><sup>25</sup><ext-link ext-link-type="uri" xlink:href="https://generated.photos/api">https://generated.photos/api</ext-link></p></fn>
<fn id="fn0026"><p><sup>26</sup>According to a number of research papers we read, generated.photos released an earlier 100K-Faces dataset for academic research in 2018, and it was used by many researchers. This dataset is no longer available.</p></fn>
<fn id="fn0027"><p><sup>27</sup><xref ref-type="table" rid="T2">Table 2</xref> of the paper shows the dataset size is 19,509, but the dataset downloaded from pCloud contains just 19,457 images.</p></fn>
<fn id="fn0028"><p><sup>28</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/NVlabs/ffhq-dataset">https://github.com/NVlabs/ffhq-dataset</ext-link></p></fn>
<fn id="fn0029"><p><sup>29</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/fyu/lsun">https://github.com/fyu/lsun</ext-link></p></fn>
<fn id="fn0030"><p><sup>30</sup><ext-link ext-link-type="uri" xlink:href="https://drive.google.com/open?id=0B4qLcYyJmiz0TXY1NG02bzZVRGs">https://drive.google.com/open?id=0B4qLcYyJmiz0TXY1NG02bzZVRGs</ext-link></p></fn>
<fn id="fn0031"><p><sup>31</sup><ext-link ext-link-type="uri" xlink:href="https://generated.photos/">https://generated.photos/</ext-link></p></fn>
<fn id="fn0032"><p><sup>32</sup><ext-link ext-link-type="uri" xlink:href="http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html">http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html</ext-link></p></fn>
<fn id="fn0033"><p><sup>33</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/ondyari/FaceForensics">https://github.com/ondyari/FaceForensics</ext-link></p></fn>
<fn id="fn0034"><p><sup>34</sup><ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/Nicolas_Cage">https://en.wikipedia.org/wiki/Nicolas_Cage</ext-link></p></fn>
<fn id="fn0035"><p><sup>35</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/yuezunli/celeb-deepfakeforensics/tree/master/Celeb-DF-v1">https://github.com/yuezunli/celeb-deepfakeforensics/tree/master/Celeb-DF-v1</ext-link></p></fn>
<fn id="fn0036"><p><sup>36</sup><ext-link ext-link-type="uri" xlink:href="https://ai.googleblog.com/">https://ai.googleblog.com/</ext-link></p></fn>
<fn id="fn0037"><p><sup>37</sup><ext-link ext-link-type="uri" xlink:href="https://jigsaw.google.com/">https://jigsaw.google.com/</ext-link></p></fn>
<fn id="fn0038"><p><sup>38</sup>A Chinese video portal: <ext-link ext-link-type="uri" xlink:href="https://www.bilibili.com/">https://www.bilibili.com/</ext-link>.</p></fn>
<fn id="fn0039"><p><sup>39</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/deepfakes/faceswap">https://github.com/deepfakes/faceswap</ext-link></p></fn>
<fn id="fn0040"><p><sup>40</sup><ext-link ext-link-type="uri" xlink:href="http://www.vc-challenge.org/">http://www.vc-challenge.org/</ext-link></p></fn>
<fn id="fn0041"><p><sup>41</sup><ext-link ext-link-type="uri" xlink:href="http://www.vc-challenge.org/vcc2016/papers/SSW9_VCC2016_Results.pdf">http://www.vc-challenge.org/vcc2016/papers/SSW9_VCC2016_Results.pdf</ext-link></p></fn>
<fn id="fn0042"><p><sup>42</sup><ext-link ext-link-type="uri" xlink:href="https://www.emime.org/participate/emime-bilingual-database.html">https://www.emime.org/participate/emime-bilingual-database.html</ext-link></p></fn>
<fn id="fn0043"><p><sup>43</sup><ext-link ext-link-type="uri" xlink:href="https://www.asvspoof.org/">https://www.asvspoof.org/</ext-link></p></fn>
<fn id="fn0044"><p><sup>44</sup><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.7488/ds/1994">https://doi.org/10.7488/ds/1994</ext-link></p></fn>
<fn id="fn0045"><p><sup>45</sup><ext-link ext-link-type="uri" xlink:href="https://www.cs.toronto.edu/&#x0007E;graves/phd.pdf">https://www.cs.toronto.edu/&#x0007E;graves/phd.pdf</ext-link></p></fn>
<fn id="fn0046"><p><sup>46</sup><ext-link ext-link-type="uri" xlink:href="https://github.com/syang1993/gst-tacotron">https://github.com/syang1993/gst-tacotron</ext-link></p></fn>
<fn id="fn0047"><p><sup>47</sup><ext-link ext-link-type="uri" xlink:href="https://dcase.community/challenge2022/index">https://dcase.community/challenge2022/index</ext-link></p></fn>
<fn id="fn0048"><p><sup>48</sup><ext-link ext-link-type="uri" xlink:href="https://ai-writer.com/">https://ai-writer.com/</ext-link></p></fn>
<fn id="fn0049"><p><sup>49</sup><ext-link ext-link-type="uri" xlink:href="https://www.articleforge.com/">https://www.articleforge.com/</ext-link></p></fn>
<fn id="fn0050"><p><sup>50</sup><ext-link ext-link-type="uri" xlink:href="https://www.wikipedia.org/">https://www.wikipedia.org/</ext-link></p></fn>
<fn id="fn0051"><p><sup>51</sup><ext-link ext-link-type="uri" xlink:href="https://baike.baidu.com/">https://baike.baidu.com/</ext-link></p></fn>
<fn id="fn0052"><p><sup>52</sup><ext-link ext-link-type="uri" xlink:href="https://mfc.nist.gov/&#x00023;pills-data">https://mfc.nist.gov/&#x00023;pills-data</ext-link></p></fn>
<fn id="fn0053"><p><sup>53</sup><ext-link ext-link-type="uri" xlink:href="https://www.darpa.mil/program/media-forensics">https://www.darpa.mil/program/media-forensics</ext-link></p></fn>
<fn id="fn0054"><p><sup>54</sup><ext-link ext-link-type="uri" xlink:href="https://mfc.nist.gov/">https://mfc.nist.gov/</ext-link></p></fn>
<fn id="fn0055"><p><sup>55</sup><ext-link ext-link-type="uri" xlink:href="https://competitions.codalab.org/competitions/33386">https://competitions.codalab.org/competitions/33386</ext-link></p></fn>
<fn id="fn0056"><p><sup>56</sup><ext-link ext-link-type="uri" xlink:href="https://sense-human.github.io/">https://sense-human.github.io/</ext-link></p></fn>
<fn id="fn0057"><p><sup>57</sup><ext-link ext-link-type="uri" xlink:href="http://iccv2021.thecvf.com/">http://iccv2021.thecvf.com/</ext-link></p></fn>
<fn id="fn0058"><p><sup>58</sup><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o/">https://openai.com/index/hello-gpt-4o/</ext-link></p></fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Afchar</surname> <given-names>D.</given-names></name> <name><surname>Nozick</surname> <given-names>V.</given-names></name> <name><surname>Yamagishi</surname> <given-names>J.</given-names></name> <name><surname>Echizen</surname> <given-names>I.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;MesoNet: a compact facial video forgery detection network,&#x0201D;</article-title> in <source>Proceedings of the 2018 IEEE International Workshop on Information Forensics and Security</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1109/WIFS.2018.8630761</pub-id></citation>
</ref>
<ref id="B2">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Ajder</surname> <given-names>H.</given-names></name> <name><surname>Patrini</surname> <given-names>G.</given-names></name> <name><surname>Cavalli</surname> <given-names>F.</given-names></name> <name><surname>Cullen</surname> <given-names>L.</given-names></name></person-group> (<year>2019</year>). <source>The state of deepfakes: Landscape, threats, and impact</source>. Technical report, Deeptrace. Available at: <ext-link ext-link-type="uri" xlink:href="https://sensity.ai/reports/">https://sensity.ai/reports/</ext-link> (accessed March, 2024)</citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Akhtar</surname> <given-names>Z.</given-names></name> <name><surname>Falk</surname> <given-names>T. H.</given-names></name></person-group> (<year>2017</year>). <article-title>Audio-visual multimedia quality assessment: a comprehensive survey</article-title>. <source>IEEE Access</source> <volume>5</volume>, <fpage>21090</fpage>&#x02013;<lpage>21117</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2017.2750918</pub-id></citation>
</ref>
<ref id="B4">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Alshammari</surname> <given-names>H.</given-names></name> <name><surname>El-Sayed</surname> <given-names>A.</given-names></name></person-group> (<year>2023b</year>). <source>AIRABIC: Arabic dataset for performance evaluation of AI detectors</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/Hamed1Hamed/AIRABIC">https://github.com/Hamed1Hamed/AIRABIC</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B5">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Alshammari</surname> <given-names>H.</given-names></name> <name><surname>Sayed</surname> <given-names>A.</given-names></name></person-group> (<year>2023a</year>). &#x0201C;AIRABIC: Arabic dataset for performance evaluation of AI detectors,&#x0201D; in <italic>Proceedings of the 2023 International Conference on Machine Learning and Applications (ICMLA)</italic> (IEEE), <fpage>864</fpage>&#x02013;<lpage>870</lpage>. <pub-id pub-id-type="doi">10.1109/ICMLA58977.2023.00127</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Ba</surname> <given-names>Z.</given-names></name> <name><surname>Wen</surname> <given-names>Q.</given-names></name> <name><surname>Cheng</surname> <given-names>P.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Lin</surname> <given-names>F.</given-names></name> <name><surname>Lu</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2023a</year>). <source>DEepfake CROss-lingual (DECRO) evaluation dataset</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/petrichorwq/DECRO-dataset">https://github.com/petrichorwq/DECRO-dataset</ext-link> (accessed March, 2024)</citation>
</ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ba</surname> <given-names>Z.</given-names></name> <name><surname>Wen</surname> <given-names>Q.</given-names></name> <name><surname>Cheng</surname> <given-names>P.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Lin</surname> <given-names>F.</given-names></name> <name><surname>Lu</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2023b</year>). &#x0201C;Transferring audio deepfake detection capability across languages,&#x0201D; in <italic>Proceedings of the ACM Web Conference 2023</italic> (ACM), <fpage>2033</fpage>&#x02013;<lpage>2044</lpage>. <pub-id pub-id-type="doi">10.1145/3543507.3583222</pub-id></citation>
</ref>
<ref id="B8">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bandi</surname> <given-names>A.</given-names></name> <name><surname>Adapa</surname> <given-names>P. V. S. R.</given-names></name> <name><surname>Kuchi</surname> <given-names>Y. E. V. P. K.</given-names></name></person-group> (<year>2023</year>). <article-title>The power of generative AI: A review of requirements, models, input-output formats, evaluation metrics, and challenges</article-title>. <source>Fut. Internet</source> <volume>15</volume>:<fpage>260</fpage>. <pub-id pub-id-type="doi">10.3390/fi15080260</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Brady</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <source>Deepfakes: a new disinformation threat?</source> Technical report, Democracy Reporting International. Available at: <ext-link ext-link-type="uri" xlink:href="https://democracy-reporting.org/dri_publications/deepfakes-a-new-disinformation-threat/">https://democracy-reporting.org/dri_publications/deepfakes-a-new-disinformation-threat/</ext-link> (accessed March, 2024).</citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cai</surname> <given-names>Z.</given-names></name> <name><surname>Ghosh</surname> <given-names>S.</given-names></name> <name><surname>Adatia</surname> <given-names>A. P.</given-names></name> <name><surname>Hayat</surname> <given-names>M.</given-names></name> <name><surname>Dhall</surname> <given-names>A.</given-names></name> <name><surname>Stefanov</surname> <given-names>K.</given-names></name></person-group> (<year>2023a</year>). <article-title>AV-Deepfake1M: A large-scale LLM-driven audio-visual deepfake dataset</article-title>. <source>arXiv:2311.15308</source>.</citation>
</ref>
<ref id="B11">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Cai</surname> <given-names>Z.</given-names></name> <name><surname>Ghosh</surname> <given-names>S.</given-names></name> <name><surname>Adatia</surname> <given-names>A. P.</given-names></name> <name><surname>Hayat</surname> <given-names>M.</given-names></name> <name><surname>Dhall</surname> <given-names>A.</given-names></name> <name><surname>Stefanov</surname> <given-names>K.</given-names></name></person-group> (<year>2023b</year>). <source>AV-Deepfake1M: a large-scale LLM-driven audio-visual deepfake dataset</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/ControlNet/AV-Deepfake1M">https://github.com/ControlNet/AV-Deepfake1M</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B12">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chefer</surname> <given-names>H.</given-names></name> <name><surname>Alaluf</surname> <given-names>Y.</given-names></name> <name><surname>Vinker</surname> <given-names>Y.</given-names></name> <name><surname>Wolf</surname> <given-names>L.</given-names></name> <name><surname>Cohen-Or</surname> <given-names>D.</given-names></name></person-group> (<year>2023</year>). <article-title>Attend-and-Excite: attention-based semantic guidance for text-to-image diffusion models</article-title>. <source>ACM Trans. Graph</source>. <volume>42</volume>, <fpage>1</fpage>&#x02013;<lpage>10</lpage>. <pub-id pub-id-type="doi">10.1145/3592116</pub-id></citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>L.</given-names></name> <name><surname>Ding</surname> <given-names>C.</given-names></name></person-group> (<year>2023</year>). <article-title>X-IQE: eXplainable image quality evaluation for text-to-image generation with visual large language models</article-title>. <source>arXiv:2305.10843</source>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ciftci</surname> <given-names>U. A.</given-names></name> <name><surname>Demir</surname> <given-names>I.</given-names></name> <name><surname>Yin</surname> <given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;FakeCatcher: Detection of synthetic portrait videos using biological signals,&#x0201D;</article-title> in <source>IEEE Transactions on Pattern Analysis and Machine Intelligence</source>.<pub-id pub-id-type="pmid">32750816</pub-id></citation></ref>
<ref id="B15">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Dang</surname> <given-names>H.</given-names></name> <name><surname>Liu</surname> <given-names>F.</given-names></name> <name><surname>Stehouwer</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Jain</surname> <given-names>A. K.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;On the detection of digital face manipulation,&#x0201D;</article-title> in <source>Proceedings of the 2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>5781</fpage>&#x02013;<lpage>5790</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00582</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delgado</surname> <given-names>H.</given-names></name> <name><surname>Evans</surname> <given-names>N.</given-names></name> <name><surname>Kinnunen</surname> <given-names>T.</given-names></name> <name><surname>Lee</surname> <given-names>K. A.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Nautsch</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021a</year>). <source>ASVspoof 2021 challenge</source> - <italic>logical access database</italic>. Zenodo dataset.<pub-id pub-id-type="pmid">39215070</pub-id></citation></ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Delgado</surname> <given-names>H.</given-names></name> <name><surname>Evans</surname> <given-names>N.</given-names></name> <name><surname>Kinnunen</surname> <given-names>T.</given-names></name> <name><surname>Lee</surname> <given-names>K. A.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Nautsch</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021b</year>). <source>ASVspoof 2021 challenge</source> - <italic>speech deepfake database</italic>. Zenodo dataset.<pub-id pub-id-type="pmid">39215070</pub-id></citation></ref>
<ref id="B18">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>J.</given-names></name> <name><surname>Dong</surname> <given-names>W.</given-names></name> <name><surname>Socher</surname> <given-names>R.</given-names></name> <name><surname>Li</surname> <given-names>L.-J.</given-names></name> <name><surname>Li</surname> <given-names>K.</given-names></name> <name><surname>Fei-Fei</surname> <given-names>L.</given-names></name></person-group> (<year>2009</year>). <article-title>&#x0201C;ImageNet: a large-scale hierarchical image database,&#x0201D;</article-title> in <source>Proceedings of the 2009 IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>248</fpage>&#x02013;<lpage>255</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2009.5206848</pub-id><pub-id pub-id-type="pmid">26886976</pub-id></citation></ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Deshmukh</surname> <given-names>A.</given-names></name> <name><surname>Wankhade</surname> <given-names>S. B.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Deepfake detection approaches using deep learning: A systematic review,&#x0201D;</article-title> in <source>Intelligent Computing and Networking: Proceedings of IC-ICN 2020, volume 146 of Lecture Notes in Networks and Systems</source> (<publisher-loc>Springer</publisher-loc>), <fpage>293</fpage>&#x02013;<lpage>302</lpage>. <pub-id pub-id-type="doi">10.1007/978-981-15-7421-4_27</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>X.</given-names></name> <name><surname>Raziei</surname> <given-names>Z.</given-names></name> <name><surname>Larson</surname> <given-names>E. C.</given-names></name> <name><surname>Olinick</surname> <given-names>E. V.</given-names></name> <name><surname>Krueger</surname> <given-names>P.</given-names></name> <name><surname>Hahsler</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>Swapped face detection using deep learning and subjective assessment</article-title>. <source>EURASIP J. Inf. Secur</source>. <volume>2020</volume>, <fpage>1</fpage>&#x02013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1186/s13635-020-00109-8</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dolhansky</surname> <given-names>B.</given-names></name> <name><surname>Bitton</surname> <given-names>J.</given-names></name> <name><surname>Pflaum</surname> <given-names>B.</given-names></name> <name><surname>Lu</surname> <given-names>J.</given-names></name> <name><surname>Howes</surname> <given-names>R.</given-names></name> <name><surname>Wang</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>The DeepFake detection challenge dataset</article-title>. <source>arXiv preprint arXiv:2006.07397</source>.</citation>
</ref>
<ref id="B22">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Dufour</surname> <given-names>N.</given-names></name> <name><surname>Gully</surname> <given-names>A.</given-names></name></person-group> (<year>2019</year>). <source>Contributing data to deepfake detection research</source>. Technical report, Google AI. Available at: <ext-link ext-link-type="uri" xlink:href="https://ai.googleblog.com/2019/09/contributing-data-to-deepfake-detection.html">https://ai.googleblog.com/2019/09/contributing-data-to-deepfake-detection.html</ext-link> (accessed March, 2024)<pub-id pub-id-type="pmid">36700137</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Durall</surname> <given-names>R.</given-names></name> <name><surname>Keuper</surname> <given-names>M.</given-names></name> <name><surname>Pfreundt</surname> <given-names>F.-J.</given-names></name> <name><surname>Keuper</surname> <given-names>J.</given-names></name></person-group> (<year>2019</year>). <article-title>Unmasking deepfakes with simple features</article-title>. <source>arXiv:1911.00686</source>.</citation>
</ref>
<ref id="B24">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Fagni</surname> <given-names>T.</given-names></name> <name><surname>Falchi</surname> <given-names>F.</given-names></name> <name><surname>Gambini</surname> <given-names>M.</given-names></name> <name><surname>Martella</surname> <given-names>A.</given-names></name> <name><surname>Tesconi</surname> <given-names>M.</given-names></name></person-group> (<year>2021a</year>). <article-title>TweepFake: about detecting deepfake tweets</article-title>. <source>PLoS ONE</source> <volume>16</volume>:<fpage>e0251415</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0251415</pub-id><pub-id pub-id-type="pmid">33984021</pub-id></citation></ref>
<ref id="B25">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Fagni</surname> <given-names>T.</given-names></name> <name><surname>Falchi</surname> <given-names>F.</given-names></name> <name><surname>Gambini</surname> <given-names>M.</given-names></name> <name><surname>Martella</surname> <given-names>A.</given-names></name> <name><surname>Tesconi</surname> <given-names>M.</given-names></name></person-group> (<year>2021b</year>). <source>TweepFake: about detecting deepfake tweets</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/tizfa/tweepfake_deepfake_text_detection">https://github.com/tizfa/tweepfake_deepfake_text_detection</ext-link> (accessed July, 2024).<pub-id pub-id-type="pmid">33984021</pub-id></citation></ref>
<ref id="B26">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Fox</surname> <given-names>G.</given-names></name> <name><surname>Liu</surname> <given-names>W.</given-names></name> <name><surname>Kim</surname> <given-names>H.</given-names></name> <name><surname>Seidel</surname> <given-names>H.-P.</given-names></name> <name><surname>Elgharib</surname> <given-names>M.</given-names></name> <name><surname>Theobalt</surname> <given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Videoforensicshq: detecting high-quality manipulated face videos,&#x0201D;</article-title> in <source>Proceedings of the 2021 IEEE International Conference on Multimedia and Expo</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/ICME51207.2021.9428101</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Frank</surname> <given-names>J.</given-names></name> <name><surname>Sch&#x000F6;nherr</surname> <given-names>L.</given-names></name></person-group> (<year>2021a</year>). <article-title>&#x0201C;WaveFake: a data set to facilitate audio deepfake detection,&#x0201D;</article-title> in <source>Proceedings of the 35th Conference on Neural Information Processing Systems (NeurIPS 2021) Track on Datasets and Benchmarks</source>, <fpage>1</fpage>&#x02013;<lpage>17</lpage>.</citation>
</ref>
<ref id="B28">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Frank</surname> <given-names>J.</given-names></name> <name><surname>Sch&#x000F6;nherr</surname> <given-names>L.</given-names></name></person-group> (<year>2021b</year>). <source>WaveFake: a data set to facilitate audio deepfake detection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/RUB-SysSec/WaveFake">https://github.com/RUB-SysSec/WaveFake</ext-link></citation>
</ref>
<ref id="B29">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gong</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Huber</surname> <given-names>J.</given-names></name> <name><surname>MacKnight</surname> <given-names>M.</given-names></name> <name><surname>Poellabauer</surname> <given-names>C.</given-names></name></person-group> (<year>2019a</year>). &#x0201C;ReMASC: realistic replay attack corpus for voice controlled systems,&#x0201D; in <source>Proceedings of Interspeech 2019</source>, <fpage>2355</fpage>&#x02013;<lpage>2359</lpage>. <pub-id pub-id-type="doi">10.21437/Interspeech.2019-1541</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Gong</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Huber</surname> <given-names>J.</given-names></name> <name><surname>MacKnight</surname> <given-names>M.</given-names></name> <name><surname>Poellabauer</surname> <given-names>C.</given-names></name></person-group> (<year>2019b</year>). <source>ReMASC: realistic replay attack corpus for voice controlled systems</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/YuanGongND/ReMASC">https://github.com/YuanGongND/ReMASC</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B31">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Jiang</surname> <given-names>M.</given-names></name> <name><surname>Nie</surname> <given-names>J.</given-names></name> <name><surname>Ding</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023a</year>). <article-title>How close is ChatGPT to human experts? comparison corpus, evaluation, and detection</article-title>. <source>arXiv:2301.07597</source>.</citation>
</ref>
<ref id="B32">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Guo</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Jiang</surname> <given-names>M.</given-names></name> <name><surname>Nie</surname> <given-names>J.</given-names></name> <name><surname>Ding</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023b</year>). <source>Human ChatGPT Comparison Corpus (HC3)</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/Hello-SimpleAI/chatgpt-comparison-detection">https://github.com/Hello-SimpleAI/chatgpt-comparison-detection</ext-link> (accessed July, 2024)</citation>
</ref>
<ref id="B33">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hartvigsen</surname> <given-names>T.</given-names></name> <name><surname>Gabriel</surname> <given-names>S.</given-names></name> <name><surname>Palangi</surname> <given-names>H.</given-names></name> <name><surname>Sap</surname> <given-names>M.</given-names></name> <name><surname>Ray</surname> <given-names>D.</given-names></name> <name><surname>Kamar</surname> <given-names>E.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;ToxiGen: a large-scale machine-generated dataset for adversarial and implicit hate speech detection,&#x0201D;</article-title> in <source>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics</source> (<publisher-loc>ACL</publisher-loc>), <fpage>3309</fpage>&#x02013;<lpage>3326</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.234</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B34">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>Y.</given-names></name> <name><surname>Gan</surname> <given-names>B.</given-names></name> <name><surname>Chen</surname> <given-names>S.</given-names></name> <name><surname>Zhou</surname> <given-names>Y.</given-names></name> <name><surname>Yin</surname> <given-names>G.</given-names></name> <name><surname>Song</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;ForgeryNet: a versatile benchmark for comprehensive forgery analysis,&#x0201D;</article-title> in <source>Proceedings of the 2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>4360</fpage>&#x02013;<lpage>4369</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00434</pub-id></citation>
</ref>
<ref id="B35">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Heidari</surname> <given-names>A.</given-names></name> <name><surname>Jafari Navimipour</surname> <given-names>N.</given-names></name> <name><surname>Dag</surname> <given-names>H.</given-names></name> <name><surname>Unal</surname> <given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>Deepfake detection using deep learning methods: a systematic and comprehensive review</article-title>. <source>WIREs Data Mining Knowl. Discov</source>. <volume>45</volume>:<fpage>e1520</fpage>. <pub-id pub-id-type="doi">10.1002/widm.1520</pub-id><pub-id pub-id-type="pmid">34460519</pub-id></citation></ref>
<ref id="B36">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hessel</surname> <given-names>J.</given-names></name> <name><surname>Holtzman</surname> <given-names>A.</given-names></name> <name><surname>Forbes</surname> <given-names>M.</given-names></name> <name><surname>Le Bras</surname> <given-names>R.</given-names></name> <name><surname>Choi</surname> <given-names>Y.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;CLIPScore: a reference-free evaluation metric for image captioning,&#x0201D;</article-title> in <source>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</source> (<publisher-loc>Association for Computational Linguistics</publisher-loc>), <fpage>7514</fpage>&#x02013;<lpage>7528</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-main.595</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B37">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>K.</given-names></name> <name><surname>Sun</surname> <given-names>K.</given-names></name> <name><surname>Xie</surname> <given-names>E.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;T2I-CompBench: a comprehensive benchmark for open-world compositional text-to-image generation,&#x0201D;</article-title> in <source>Proceedings of the 37th Neural Information Processing Systems Track on Datasets and Benchmarks (NeurIPS &#x00027;23)</source> (<publisher-loc>Curran Associates, Inc.</publisher-loc>), <fpage>78723</fpage>&#x02013;<lpage>78747</lpage>.</citation>
</ref>
<ref id="B38">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Jia</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Lyu</surname> <given-names>S.</given-names></name></person-group> (<year>2022a</year>). <source>DFDM: Deepfakes from different models</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/shanface33/Deepfake_Model_Attribution">https://github.com/shanface33/Deepfake_Model_Attribution</ext-link> (accessed March, 2024).</citation>
</ref>
<ref id="B39">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jia</surname> <given-names>S.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Lyu</surname> <given-names>S.</given-names></name></person-group> (<year>2022b</year>). <article-title>Model attribution of face-swap deepfake videos</article-title>. <source>arXiv:2202.12951</source>. <pub-id pub-id-type="doi">10.1109/ICIP46576.2022.9897972</pub-id></citation>
</ref>
<ref id="B40">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Jiang</surname> <given-names>L.</given-names></name> <name><surname>Li</surname> <given-names>R.</given-names></name> <name><surname>Wu</surname> <given-names>W.</given-names></name> <name><surname>Qian</surname> <given-names>C.</given-names></name> <name><surname>Loy</surname> <given-names>C. C.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;DeeperForensics-1.0: a large-scale dataset for real-world face forgery detection,&#x0201D;</article-title> in <source>Proceedings of the 2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>2886</fpage>&#x02013;<lpage>2895</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00296</pub-id></citation>
</ref>
<ref id="B41">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kalchbrenner</surname> <given-names>N.</given-names></name> <name><surname>Elsen</surname> <given-names>E.</given-names></name> <name><surname>Simonyan</surname> <given-names>K.</given-names></name> <name><surname>Noury</surname> <given-names>S.</given-names></name> <name><surname>Casagrande</surname> <given-names>N.</given-names></name> <name><surname>Lockhart</surname> <given-names>E.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Efficient neural audio synthesis</article-title>. <source>arXiv:1802.08435</source>.</citation>
</ref>
<ref id="B42">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Karras</surname> <given-names>T.</given-names></name> <name><surname>Laine</surname> <given-names>S.</given-names></name> <name><surname>Aila</surname> <given-names>T.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;A style-based generator architecture for generative adversarial networks,&#x0201D;</article-title> in <source>Proceedings of the 2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>4401</fpage>&#x02013;<lpage>4410</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2019.00453</pub-id></citation>
</ref>
<ref id="B43">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khanjani</surname> <given-names>Z.</given-names></name> <name><surname>Watson</surname> <given-names>G.</given-names></name> <name><surname>Janeja</surname> <given-names>V. P.</given-names></name></person-group> (<year>2023</year>). <article-title>Audio deepfakes: a survey</article-title>. <source>Front. Big Data</source> <volume>5</volume>:<fpage>1001063</fpage>. <pub-id pub-id-type="doi">10.3389/fdata.2022.1001063</pub-id><pub-id pub-id-type="pmid">36700137</pub-id></citation></ref>
<ref id="B44">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Khodabakhsh</surname> <given-names>A.</given-names></name> <name><surname>Ramachandra</surname> <given-names>R.</given-names></name> <name><surname>Raja</surname> <given-names>K.</given-names></name> <name><surname>Wasnik</surname> <given-names>P.</given-names></name> <name><surname>Busch</surname> <given-names>C.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Fake face detection methods: can they be generalized?&#x0201D;</article-title> in <source>Proceedings of the 2018 International Conference of the Biometrics Special Interest Group</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.23919/BIOSIG.2018.8553251</pub-id></citation>
</ref>
<ref id="B45">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>H.</given-names></name> <name><surname>Elgharib</surname> <given-names>M.</given-names></name> <name><surname>Zollh&#x000F6;fer</surname> <given-names>M.</given-names></name> <name><surname>Seidel</surname> <given-names>H. P.</given-names></name> <name><surname>Beeler</surname> <given-names>T.</given-names></name> <name><surname>Richardt</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Neural style-preserving visual dubbing</article-title>. <source>ACM Trans. Graph</source>. <volume>38</volume>, <fpage>1</fpage>&#x02013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1145/3355089.3356500</pub-id></citation>
</ref>
<ref id="B46">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>H.</given-names></name> <name><surname>Garrido</surname> <given-names>P.</given-names></name> <name><surname>Tewari</surname> <given-names>A.</given-names></name> <name><surname>Xu</surname> <given-names>W.</given-names></name> <name><surname>Thies</surname> <given-names>J.</given-names></name> <name><surname>Niessner</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Deep video portraits</article-title>. <source>ACM Trans. Graph</source>. <volume>37</volume>, <fpage>1</fpage>&#x02013;<lpage>14</lpage>. <pub-id pub-id-type="doi">10.1145/3197517.3201283</pub-id><pub-id pub-id-type="pmid">27187945</pub-id></citation></ref>
<ref id="B47">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Korshunov</surname> <given-names>P.</given-names></name> <name><surname>Marcel</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Vulnerability assessment and detection of deepfake videos,&#x0201D;</article-title> in <source>Proceedings of the 2019 International Conference on Biometrics</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/ICB45273.2019.8987375</pub-id></citation>
</ref>
<ref id="B48">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kwon</surname> <given-names>P.</given-names></name> <name><surname>You</surname> <given-names>J.</given-names></name> <name><surname>Nam</surname> <given-names>G.</given-names></name> <name><surname>Park</surname> <given-names>S.</given-names></name> <name><surname>Chae</surname> <given-names>G.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;KoDF: A large-scale Korean DeepFake detection dataset,&#x0201D;</article-title> in <source>Proceedings of the 2021 IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>10724</fpage>&#x02013;<lpage>10733</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.01057</pub-id></citation>
</ref>
<ref id="B49">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Ling</surname> <given-names>H.</given-names></name> <name><surname>Kim</surname> <given-names>S. W.</given-names></name> <name><surname>Kreis</surname> <given-names>K.</given-names></name> <name><surname>Fidler</surname> <given-names>S.</given-names></name> <name><surname>Torralba</surname> <given-names>A.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;BigDatasetGAN: Synthesizing ImageNet with pixel-wise annotations,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>21330</fpage>&#x02013;<lpage>21340</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR52688.2022.02064</pub-id></citation>
</ref>
<ref id="B50">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Zhao</surname> <given-names>X.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Pei</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name></person-group> (<year>2022a</year>). <article-title>&#x0201C;FMFCC-V: an Asian large-scale challenging dataset for deepfake detection,&#x0201D;</article-title> in <source>Proceedings of the 2022 ACM Workshop on Information Hiding and Multimedia Security</source> (<publisher-loc>ACM</publisher-loc>), <fpage>7</fpage>&#x02013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1145/3531536.3532946</pub-id></citation>
</ref>
<ref id="B51">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Zhao</surname> <given-names>X.</given-names></name> <name><surname>Cao</surname> <given-names>Y.</given-names></name> <name><surname>Pei</surname> <given-names>P.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name></person-group> (<year>2022b</year>). <source>FMFCC-V: an Asian large-scale challenging dataset for deepfake detection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/iiecasligen/FMFCC-V">https://github.com/iiecasligen/FMFCC-V</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B52">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>D.</given-names></name> <name><surname>Xiong</surname> <given-names>C.</given-names></name> <name><surname>Hoi</surname> <given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;BLIP: bootstrapping language-image pre-training for unified vision-language understanding and generation,&#x0201D;</article-title> in <source>Proceedings of the 39th International Conference on Machine Learning</source> (<publisher-loc>PMLR</publisher-loc>), <fpage>12888</fpage>&#x02013;<lpage>12900</lpage>.<pub-id pub-id-type="pmid">39055931</pub-id></citation></ref>
<ref id="B53">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>L.</given-names></name> <name><surname>Bao</surname> <given-names>J.</given-names></name> <name><surname>Yang</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>D.</given-names></name> <name><surname>Wen</surname> <given-names>F.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Advancing high fidelity identity swapping for forgery detection,&#x0201D;</article-title> in <source>Proceedings of the 2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>5073</fpage>&#x02013;<lpage>5082</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00512</pub-id></citation>
</ref>
<ref id="B54">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Chang</surname> <given-names>M.-C.</given-names></name> <name><surname>Lyu</surname> <given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;In ICTU OCULI: exposing AI created fake videos by detecting eye blinking,&#x0201D;</article-title> in <source>Proceedings of the 2018 IEEE International Workshop on Information Forensics and Security</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1109/WIFS.2018.8630787</pub-id></citation>
</ref>
<ref id="B55">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>Q.</given-names></name> <name><surname>Cui</surname> <given-names>L.</given-names></name> <name><surname>Bi</surname> <given-names>W.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>MAGE: machine-generated text detection in the wild</article-title>. <source>arXiv:2305.13242</source>.</citation>
</ref>
<ref id="B56">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>X.</given-names></name> <name><surname>Sun</surname> <given-names>P.</given-names></name> <name><surname>Qi</surname> <given-names>H.</given-names></name> <name><surname>Lyu</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Celeb-DF: a large-scale challenging dataset for deepfake forensics,&#x0201D;</article-title> in <source>Proceedings of the 2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>3204</fpage>&#x02013;<lpage>3213</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00327</pub-id></citation>
</ref>
<ref id="B57">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Sahidullah</surname> <given-names>M.</given-names></name> <name><surname>Patino</surname> <given-names>J.</given-names></name> <name><surname>Delgado</surname> <given-names>H.</given-names></name> <name><surname>Kinnunen</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>ASVspoof 2021: towards spoofed and deepfake speech detection in the wild</article-title>. <source>IEEE/ACM Trans. Audio, Speech Lang. Proc</source>. <volume>31</volume>, <fpage>2507</fpage>&#x02013;<lpage>2522</lpage>. <pub-id pub-id-type="doi">10.1109/TASLP.2023.3285283</pub-id></citation>
</ref>
<ref id="B58">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Livingstone</surname> <given-names>S. R.</given-names></name> <name><surname>Russo</surname> <given-names>F. A.</given-names></name></person-group> (<year>2018</year>). <article-title>The Ryerson audio-visual database of emotional speech and song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in North American English</article-title>. <source>PLoS ONE</source> <volume>13</volume>:<fpage>e0196391</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0196391</pub-id><pub-id pub-id-type="pmid">29768426</pub-id></citation></ref>
<ref id="B59">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liyanage</surname> <given-names>V.</given-names></name> <name><surname>Buscaldi</surname> <given-names>D.</given-names></name> <name><surname>Nazarenko</surname> <given-names>A.</given-names></name></person-group> (<year>2022a</year>). <article-title>A benchmark corpus for the detection of automatically generated text in academic publications</article-title>. <source>arXiv:2202.02013</source>.</citation>
</ref>
<ref id="B60">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Liyanage</surname> <given-names>V.</given-names></name> <name><surname>Buscaldi</surname> <given-names>D.</given-names></name> <name><surname>Nazarenko</surname> <given-names>A.</given-names></name></person-group> (<year>2022b</year>). <source>GeneratedTextDetection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/vijini/GeneratedTextDetection">https://github.com/vijini/GeneratedTextDetection</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B61">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lorenzo-Trueba</surname> <given-names>J.</given-names></name> <name><surname>Yamagishi</surname> <given-names>J.</given-names></name> <name><surname>Toda</surname> <given-names>T.</given-names></name> <name><surname>Saito</surname> <given-names>D.</given-names></name> <name><surname>Villavicencio</surname> <given-names>F.</given-names></name> <name><surname>Kinnunen</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>&#x0201C;The voice conversion challenge 2018: Promoting development of parallel and nonparallel methods,&#x0201D;</article-title> in <source>Proceedings of the Odyssey 2018 The Speaker and Language Recognition Workshop</source> (<publisher-loc>International Speech Communication Association</publisher-loc>), <fpage>195</fpage>&#x02013;<lpage>202</lpage>. <pub-id pub-id-type="doi">10.21437/Odyssey.2018-28</pub-id></citation>
</ref>
<ref id="B62">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lyu</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Deepfake detection: Current challenges and next steps,&#x0201D;</article-title> in <source>Proceedings of the 2020 IEEE International Conference on Multimedia &amp; Expo Workshops</source> (<publisher-loc>IEEE</publisher-loc>). <pub-id pub-id-type="doi">10.1109/ICMEW46912.2020.9105991</pub-id></citation>
</ref>
<ref id="B63">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>H.</given-names></name> <name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Yan</surname> <given-names>X.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2023a</year>). <article-title>CFAD: a Chinese dataset for fake audio detection</article-title>. <source>arXiv:2207.12308</source>. <pub-id pub-id-type="doi">10.2139/ssrn.4748856</pub-id></citation>
</ref>
<ref id="B64">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>H.</given-names></name> <name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Yan</surname> <given-names>X.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2023b</year>). <source>CFAD: a Chinese dataset for fake audio detection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/ADDchallenge/CFAD">https://github.com/ADDchallenge/CFAD</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B65">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Masood</surname> <given-names>M.</given-names></name> <name><surname>Nawaz</surname> <given-names>M.</given-names></name> <name><surname>Malik</surname> <given-names>K. M.</given-names></name> <name><surname>Javed</surname> <given-names>A.</given-names></name> <name><surname>Irtaza</surname> <given-names>A.</given-names></name> <name><surname>Malik</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). <article-title>Deepfakes generation and detection: State-of-the-art, open challenges, countermeasures, and way forward</article-title>. <source>Appl. Intell</source>. <volume>53</volume>, <fpage>3974</fpage>&#x02013;<lpage>4026</lpage>. <pub-id pub-id-type="doi">10.1007/s10489-022-03766-z</pub-id></citation>
</ref>
<ref id="B66">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mirsky</surname> <given-names>Y.</given-names></name> <name><surname>Lee</surname> <given-names>W.</given-names></name></person-group> (<year>2021</year>). <article-title>The creation and detection of deepfakes: a survey</article-title>. <source>ACM Comput. Surv</source>. <volume>54</volume>, <fpage>1</fpage>&#x02013;<lpage>41</lpage>. <pub-id pub-id-type="doi">10.1145/3425780</pub-id></citation>
</ref>
<ref id="B67">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Mosca</surname> <given-names>E.</given-names></name> <name><surname>Abdalla</surname> <given-names>M. H. I.</given-names></name> <name><surname>Basso</surname> <given-names>P.</given-names></name> <name><surname>Musumeci</surname> <given-names>M.</given-names></name> <name><surname>Groh</surname> <given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Distinguishing fact from fiction: a benchmark dataset for identifying machine-generated scientific papers in the LLM era,&#x0201D;</article-title> in <source>Proceedings of the 3rd Workshop on Trustworthy Natural Language Processing (TrustNLP 2023)</source> (<publisher-loc>ACL</publisher-loc>), <fpage>190</fpage>&#x02013;<lpage>207</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2023.trustnlp-1.17</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B68">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>M&#x000FC;ller</surname> <given-names>N.</given-names></name> <name><surname>Czempin</surname> <given-names>P.</given-names></name> <name><surname>Diekmann</surname> <given-names>F.</given-names></name> <name><surname>Froghyar</surname> <given-names>A.</given-names></name> <name><surname>B&#x000F6;ttinger</surname> <given-names>K.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Does audio deepfake detection generalize?&#x0201D;</article-title> in <source>Proceedings of Interspeech 2022</source>, <fpage>2783</fpage>&#x02013;<lpage>2787</lpage>. <pub-id pub-id-type="doi">10.21437/Interspeech.2022-108</pub-id></citation>
</ref>
<ref id="B69">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>M&#x000FC;ller</surname> <given-names>N. M.</given-names></name> <name><surname>Kawa</surname> <given-names>P.</given-names></name> <name><surname>Choong</surname> <given-names>W. H.</given-names></name> <name><surname>Casanova</surname> <given-names>E.</given-names></name> <name><surname>G&#x000F6;lge</surname> <given-names>E.</given-names></name> <name><surname>M&#x000FC;ller</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>MLAAD: The multi-language audio anti-spoofing dataset</article-title>. <source>arXiv:2401.09512</source>.</citation>
</ref>
<ref id="B70">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mysore</surname> <given-names>G. J.</given-names></name></person-group> (<year>2015</year>). <article-title>Can we automatically transform speech recorded on common consumer devices in real-world environments into professional production quality speech?&#x02013;a dataset, insights, and challenges</article-title>. <source>IEEE Signal Proc. Lett</source>. <volume>22</volume>, <fpage>1006</fpage>&#x02013;<lpage>1010</lpage>. <pub-id pub-id-type="doi">10.1109/LSP.2014.2379648</pub-id></citation>
</ref>
<ref id="B71">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Narayan</surname> <given-names>K.</given-names></name> <name><surname>Agarwal</surname> <given-names>H.</given-names></name> <name><surname>Thakral</surname> <given-names>K.</given-names></name> <name><surname>Mittal</surname> <given-names>S.</given-names></name> <name><surname>Vatsa</surname> <given-names>M.</given-names></name> <name><surname>Singh</surname> <given-names>R.</given-names></name></person-group> (<year>2023a</year>). <source>Df-Platter database</source>. Public dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://iab-rubric.org/df-platter-database">https://iab-rubric.org/df-platter-database</ext-link> (accessed March, 2024).</citation>
</ref>
<ref id="B72">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Narayan</surname> <given-names>K.</given-names></name> <name><surname>Agarwal</surname> <given-names>H.</given-names></name> <name><surname>Thakral</surname> <given-names>K.</given-names></name> <name><surname>Mittal</surname> <given-names>S.</given-names></name> <name><surname>Vatsa</surname> <given-names>M.</given-names></name> <name><surname>Singh</surname> <given-names>R.</given-names></name></person-group> (<year>2023b</year>). <article-title>&#x0201C;DF-Platter: multi-face heterogeneous deepfake dataset,&#x0201D;</article-title> in <source>Proceedings of the 2023 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>9739</fpage>&#x02013;<lpage>9748</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR52729.2023.00939</pub-id></citation>
</ref>
<ref id="B73">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Neves</surname> <given-names>J. C.</given-names></name> <name><surname>Tolosana</surname> <given-names>R.</given-names></name> <name><surname>Vera-Rodriguez</surname> <given-names>R.</given-names></name> <name><surname>Lopes</surname> <given-names>V.</given-names></name> <name><surname>Proen&#x000E7;a</surname> <given-names>H.</given-names></name> <name><surname>Fierrez</surname> <given-names>J.</given-names></name></person-group> (<year>2020</year>). <article-title>GANprintR: improved fakes and evaluation of the state of the art in face manipulation detection</article-title>. <source>IEEE J. Select. Topics Signal Proc</source>. <volume>14</volume>, <fpage>1038</fpage>&#x02013;<lpage>1048</lpage>. <pub-id pub-id-type="doi">10.1109/JSTSP.2020.3007250</pub-id></citation>
</ref>
<ref id="B74">
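<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nguyen</surname> <given-names>T. T.</given-names></name> <name><surname>Nguyen</surname> <given-names>Q. V. H.</given-names></name> <name><surname>Nguyen</surname> <given-names>D. T.</given-names></name> <name><surname>Nguyen</surname> <given-names>D. T.</given-names></name> <name><surname>Huynh-The</surname> <given-names>T.</given-names></name> <name><surname>Nahavandi</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Deep learning for deepfakes creation and detection: a survey</article-title>. <source>Comput. Vis. Image Understand</source>. <volume>223</volume>:<fpage>103525</fpage>. <pub-id pub-id-type="doi">10.1016/j.cviu.2022.103525</pub-id></citation>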
</ref>
<ref id="B75">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Ni</surname> <given-names>B.</given-names></name> <name><surname>Peng</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>M.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Meng</surname> <given-names>G.</given-names></name> <name><surname>Fu</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;Expanding language-image pretrained models for general video recognition,&#x0201D;</article-title> in <source>Proceedings of the 17th European Conference on Computer Vision (ECCV &#x00027;22)</source> (<publisher-loc>Springer Nature Switzerland</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>18</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-19772-7_1</pub-id></citation>
</ref>
<ref id="B76">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Nirkin</surname> <given-names>Y.</given-names></name> <name><surname>Keller</surname> <given-names>Y.</given-names></name> <name><surname>Hassner</surname> <given-names>T.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;FSGAN: subject agnostic face swapping and reenactment,&#x0201D;</article-title> in <source>Proceedings of the 2019 IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>7183</fpage>&#x02013;<lpage>7192</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV.2019.00728</pub-id><pub-id pub-id-type="pmid">35471874</pub-id></citation></ref>
<ref id="B77">
<citation citation-type="web"><person-group person-group-type="author"><collab>OpenAI</collab></person-group> (<year>2019</year>). <source>GPT-2-output-dataset: dataset of GPT-2 outputs for research in detection, biases, and more</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/openai/gpt-2-output-dataset/">https://github.com/openai/gpt-2-output-dataset/</ext-link> (accessed March, 2024)</citation>
</ref>
<ref id="B78">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pal</surname> <given-names>D.</given-names></name> <name><surname>Triyason</surname> <given-names>T.</given-names></name></person-group> (<year>2018</year>). <article-title>A survey of standardized approaches towards the quality of experience evaluation for video services: an ITU perspective</article-title>. <source>Int. J. Dig. Multimedia Broadcast</source>. <volume>2018</volume>:<fpage>1391724</fpage>. <pub-id pub-id-type="doi">10.1155/2018/1391724</pub-id></citation>
</ref>
<ref id="B79">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Pu</surname> <given-names>J.</given-names></name> <name><surname>Mangaokar</surname> <given-names>N.</given-names></name> <name><surname>Kelly</surname> <given-names>L.</given-names></name> <name><surname>Bhattacharya</surname> <given-names>P.</given-names></name> <name><surname>Sundaram</surname> <given-names>K.</given-names></name> <name><surname>Javed</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2021a</year>). <article-title>&#x0201C;Deepfake videos in the wild: analysis and detection,&#x0201D;</article-title> in <source>Proceedings of the Web Conference 2021</source> (<publisher-loc>ACM</publisher-loc>), <fpage>981</fpage>&#x02013;<lpage>992</lpage>. <pub-id pub-id-type="doi">10.1145/3442381.3449978</pub-id></citation>
</ref>
<ref id="B80">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Pu</surname> <given-names>J.</given-names></name> <name><surname>Mangaokar</surname> <given-names>N.</given-names></name> <name><surname>Kelly</surname> <given-names>L.</given-names></name> <name><surname>Bhattacharya</surname> <given-names>P.</given-names></name> <name><surname>Sundaram</surname> <given-names>K.</given-names></name> <name><surname>Javed</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2021b</year>). <source>DF-W: a new deepfake dataset comprising of deepfake videos created and shared by the internet community</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/jmpu/webconf21-deepfakes-in-the-wild">https://github.com/jmpu/webconf21-deepfakes-in-the-wild</ext-link> (accessed March, 2024)</citation>
</ref>
<ref id="B81">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Pu</surname> <given-names>J.</given-names></name> <name><surname>Sarwar</surname> <given-names>Z.</given-names></name> <name><surname>Abdullah</surname> <given-names>S. M.</given-names></name> <name><surname>Rehman</surname> <given-names>A.</given-names></name> <name><surname>Kim</surname> <given-names>Y.</given-names></name> <name><surname>Bhattacharya</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Deepfake text detection: Limitations and opportunities,&#x0201D;</article-title> in <source>Proceedings of the 2023 IEEE Symposium on Security and Privacy (SP)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1613</fpage>&#x02013;<lpage>1630</lpage>. <pub-id pub-id-type="doi">10.1109/SP46215.2023.10179387</pub-id></citation>
</ref>
<ref id="B82">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Radford</surname> <given-names>A.</given-names></name> <name><surname>Kim</surname> <given-names>J. W.</given-names></name> <name><surname>Hallacy</surname> <given-names>C.</given-names></name> <name><surname>Ramesh</surname> <given-names>A.</given-names></name> <name><surname>Goh</surname> <given-names>G.</given-names></name> <name><surname>Agarwal</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Learning transferable visual models from natural language supervision,&#x0201D;</article-title> in <source>Proceedings of the 38th International Conference on Machine Learning</source> (<publisher-loc>PMLR</publisher-loc>), <fpage>8748</fpage>&#x02013;<lpage>8763</lpage>.</citation>
</ref>
<ref id="B83">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rana</surname> <given-names>M. S.</given-names></name> <name><surname>Nobi</surname> <given-names>M. N.</given-names></name> <name><surname>Murali</surname> <given-names>B.</given-names></name> <name><surname>Sung</surname> <given-names>A. H.</given-names></name></person-group> (<year>2022</year>). <article-title>Deepfake detection: a systematic literature review</article-title>. <source>IEEE Access</source> <volume>10</volume>, <fpage>25494</fpage>&#x02013;<lpage>25513</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2022.3154404</pub-id><pub-id pub-id-type="pmid">34460519</pub-id></citation></ref>
<ref id="B84">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>R&#x000F6;ssler</surname> <given-names>A.</given-names></name> <name><surname>Cozzolino</surname> <given-names>D.</given-names></name> <name><surname>Verdoliva</surname> <given-names>L.</given-names></name> <name><surname>Riess</surname> <given-names>C.</given-names></name> <name><surname>Thies</surname> <given-names>J.</given-names></name> <name><surname>Nie&#x000DF;ner</surname> <given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>FaceForensics: a large-scale video dataset for forgery detection in human faces</article-title>. <source>arXiv preprint arXiv:1803.09179</source>.</citation>
</ref>
<ref id="B85">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>R&#x000F6;ssler</surname> <given-names>A.</given-names></name> <name><surname>Cozzolino</surname> <given-names>D.</given-names></name> <name><surname>Verdoliva</surname> <given-names>L.</given-names></name> <name><surname>Riess</surname> <given-names>C.</given-names></name> <name><surname>Thies</surname> <given-names>J.</given-names></name> <name><surname>Nie&#x000DF;ner</surname> <given-names>M.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;FaceForensics&#x0002B;&#x0002B;: learning to detect manipulated facial images,&#x0201D;</article-title> in <source>Proceedings of the 2019 International Conference on Computer Vision</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>11</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV.2019.00009</pub-id><pub-id pub-id-type="pmid">34960275</pub-id></citation></ref>
<ref id="B86">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rothe</surname> <given-names>R.</given-names></name> <name><surname>Timofte</surname> <given-names>R.</given-names></name> <name><surname>Van Gool</surname> <given-names>L.</given-names></name></person-group> (<year>2015</year>). <article-title>&#x0201C;DEX: deep expectation of apparent age from a single image,&#x0201D;</article-title> in <source>Proceedings of the 2015 IEEE International Conference on Computer Vision Workshop</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>252</fpage>&#x02013;<lpage>257</lpage>. <pub-id pub-id-type="doi">10.1109/ICCVW.2015.41</pub-id></citation>
</ref>
<ref id="B87">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sandotra</surname> <given-names>N.</given-names></name> <name><surname>Arora</surname> <given-names>B.</given-names></name></person-group> (<year>2024</year>). <article-title>A comprehensive evaluation of feature-based AI techniques for deepfake detection</article-title>. <source>Neural Comput. Applic</source>. <volume>36</volume>, <fpage>3859</fpage>&#x02013;<lpage>3887</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-023-09288-0</pub-id></citation>
</ref>
<ref id="B88">
<citation citation-type="web"><person-group person-group-type="author"><collab>Sensity</collab></person-group> (<year>2024</year>). <source>The state of deepfakes 2024</source>. Technical report, Sensity. Available at: <ext-link ext-link-type="uri" xlink:href="https://sensity.ai/reports/">https://sensity.ai/reports/</ext-link> (accessed July, 2024)</citation>
</ref>
<ref id="B89">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Seow</surname> <given-names>J. W.</given-names></name> <name><surname>Lim</surname> <given-names>M. K.</given-names></name> <name><surname>Phan</surname> <given-names>R. C.</given-names></name> <name><surname>Liu</surname> <given-names>J. K.</given-names></name></person-group> (<year>2022</year>). <article-title>A comprehensive overview of Deepfake: generation, detection, datasets, and opportunities</article-title>. <source>Neurocomputing</source> <volume>513</volume>, <fpage>351</fpage>&#x02013;<lpage>371</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2022.09.135</pub-id></citation>
</ref>
<ref id="B90">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Song</surname> <given-names>H.</given-names></name> <name><surname>Huang</surname> <given-names>S.</given-names></name> <name><surname>Dong</surname> <given-names>Y.</given-names></name> <name><surname>Tu</surname> <given-names>W.-W.</given-names></name></person-group> (<year>2023a</year>). <source>DeepFakeFace</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/OpenRL-Lab/DeepFakeFace">https://github.com/OpenRL-Lab/DeepFakeFace</ext-link> (accessed March, 2024)</citation>
</ref>
<ref id="B91">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Song</surname> <given-names>H.</given-names></name> <name><surname>Huang</surname> <given-names>S.</given-names></name> <name><surname>Dong</surname> <given-names>Y.</given-names></name> <name><surname>Tu</surname> <given-names>W.-W.</given-names></name></person-group> (<year>2023b</year>). <article-title>Robustness and generalizability of deepfake detection: a study with diffusion models</article-title>. <source>arXiv:2309.02218</source>.</citation>
</ref>
<ref id="B92">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Su</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name> <name><surname>Wu</surname> <given-names>Q.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2023a</year>). <source>CMFD</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/WuQinfang/CMFD">https://github.com/WuQinfang/CMFD</ext-link> (accessed March, 2024)</citation>
</ref>
<ref id="B93">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Su</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name> <name><surname>Wu</surname> <given-names>Q.</given-names></name> <name><surname>Li</surname> <given-names>M.</given-names></name> <name><surname>Zhang</surname> <given-names>W.</given-names></name> <etal/></person-group>. (<year>2023b</year>). <article-title>Robust audio copy-move forgery detection using constant q spectral sketches and GA-SVM</article-title>. <source>IEEE Trans. Depend. Secure Comput</source>. <volume>20</volume>, <fpage>4016</fpage>&#x02013;<lpage>4031</lpage>. <pub-id pub-id-type="doi">10.1109/TDSC.2022.3215280</pub-id></citation>
</ref>
<ref id="B94">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Su</surname> <given-names>Z.</given-names></name> <name><surname>Wu</surname> <given-names>X.</given-names></name> <name><surname>Zhou</surname> <given-names>W.</given-names></name> <name><surname>Ma</surname> <given-names>G.</given-names></name> <name><surname>Hu</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>HC3 Plus: a semantic-invariant human ChatGPT comparison corpus</article-title>. <source>arXiv:2309.02731</source>.</citation>
</ref>
<ref id="B95">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tanaka</surname> <given-names>K.</given-names></name> <name><surname>Kameoka</surname> <given-names>H.</given-names></name> <name><surname>Kaneko</surname> <given-names>T.</given-names></name> <name><surname>Hojo</surname> <given-names>N.</given-names></name></person-group> (<year>2019</year>). <article-title>WaveCycleGAN2: time-domain neural post-filter for speech waveform generation</article-title>. <source>arXiv:1904.02892</source>.</citation>
</ref>
<ref id="B96">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Temnikova</surname> <given-names>I.</given-names></name> <name><surname>Marinova</surname> <given-names>I.</given-names></name> <name><surname>Gargova</surname> <given-names>S.</given-names></name> <name><surname>Margova</surname> <given-names>R.</given-names></name> <name><surname>Koychev</surname> <given-names>I.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Looking for traces of textual deepfakes in Bulgarian on social media,&#x0201D;</article-title> in <source>Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing</source> (<publisher-loc>Shoumen, Bulgaria</publisher-loc>: <publisher-name>INCOMA Ltd.</publisher-name>), <fpage>1151</fpage>&#x02013;<lpage>1161</lpage>. <pub-id pub-id-type="doi">10.26615/978-954-452-092-2_122</pub-id></citation>
</ref>
<ref id="B97">
<citation citation-type="web"><person-group person-group-type="author"><collab>Tencent</collab></person-group> (<year>2020</year>). <source>Artificial intelligence white paper</source>. Technical report, Tencent. Available at: <ext-link ext-link-type="uri" xlink:href="https://tech.sina.com.cn/roll/2020-07-14/doc-iivhvpwx5201226.shtml">https://tech.sina.com.cn/roll/2020-07-14/doc-iivhvpwx5201226.shtml</ext-link> (accessed March, 2024)</citation>
</ref>
<ref id="B98">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Toda</surname> <given-names>T.</given-names></name> <name><surname>Chen</surname> <given-names>L.-H.</given-names></name> <name><surname>Saito</surname> <given-names>D.</given-names></name> <name><surname>Villavicencio</surname> <given-names>F.</given-names></name> <name><surname>Wester</surname> <given-names>M.</given-names></name> <name><surname>Wu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>&#x0201C;The voice conversion challenge 2016,&#x0201D;</article-title> in <source>Proceedings of Interspeech 2016</source> (<publisher-loc>International Speech Communication Association</publisher-loc>), <fpage>1632</fpage>&#x02013;<lpage>1636</lpage>. <pub-id pub-id-type="doi">10.21437/Interspeech.2016-1066</pub-id></citation>
</ref>
<ref id="B99">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tolosana</surname> <given-names>R.</given-names></name> <name><surname>Vera-Rodriguez</surname> <given-names>R.</given-names></name> <name><surname>Fierrez</surname> <given-names>J.</given-names></name> <name><surname>Morales</surname> <given-names>A.</given-names></name> <name><surname>Ortega-Garcia</surname> <given-names>J.</given-names></name></person-group> (<year>2020</year>). <article-title>Deepfakes and beyond: a survey of face manipulation and fake detection</article-title>. <source>Inf. Fusion</source> <volume>64</volume>, <fpage>131</fpage>&#x02013;<lpage>148</lpage>. <pub-id pub-id-type="doi">10.1016/j.inffus.2020.06.014</pub-id></citation>
</ref>
<ref id="B100">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Tong</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Pan</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;An overview of deepfake: the sword of Damocles in AI,&#x0201D;</article-title> in <source>Proceedings of the 2020 International Conference on Computer Vision, Image and Deep Learning</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>265</fpage>&#x02013;<lpage>273</lpage>. <pub-id pub-id-type="doi">10.1109/CVIDL51233.2020.00-88</pub-id></citation>
</ref>
<ref id="B101">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Uchendu</surname> <given-names>A.</given-names></name> <name><surname>Ma</surname> <given-names>Z.</given-names></name> <name><surname>Le</surname> <given-names>T.</given-names></name> <name><surname>Zhang</surname> <given-names>R.</given-names></name> <name><surname>Lee</surname> <given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;TURINGBENCH: a benchmark environment for Turing test in the age of neural text generation,&#x0201D;</article-title> in <source>Findings of the Association for Computational Linguistics: EMNLP 2021</source> (<publisher-loc>ACL</publisher-loc>), <fpage>2001</fpage>&#x02013;<lpage>2016</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2021.findings-emnlp.172</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B102">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Van Den Oord</surname> <given-names>A.</given-names></name> <name><surname>Dieleman</surname> <given-names>S.</given-names></name> <name><surname>Zen</surname> <given-names>H.</given-names></name> <name><surname>Simonyan</surname> <given-names>K.</given-names></name> <name><surname>Vinyals</surname> <given-names>O.</given-names></name> <name><surname>Graves</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>WaveNet: a generative model for raw audio</article-title>. <source>arXiv:1609.03499</source>.</citation>
</ref>
<ref id="B103">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Verdoliva</surname> <given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>Media forensics and deepfakes: an overview</article-title>. <source>IEEE J. Selected Topics Signal Proc</source>. <volume>14</volume>, <fpage>910</fpage>&#x02013;<lpage>932</lpage>. <pub-id pub-id-type="doi">10.1109/JSTSP.2020.3002101</pub-id><pub-id pub-id-type="pmid">38855214</pub-id></citation></ref>
<ref id="B104">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Yamagishi</surname> <given-names>J.</given-names></name> <name><surname>Todisco</surname> <given-names>M.</given-names></name> <name><surname>Delgado</surname> <given-names>H.</given-names></name> <name><surname>Nautsch</surname> <given-names>A.</given-names></name> <name><surname>Evans</surname> <given-names>N.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>ASVspoof 2019: A large-scale public database of synthesized, converted and replayed speech</article-title>. <source>Comput. Speech Lang</source>. <volume>64</volume>:<fpage>101114</fpage>. <pub-id pub-id-type="doi">10.1016/j.csl.2020.101114</pub-id></citation>
</ref>
<ref id="B105">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Bao</surname> <given-names>J.</given-names></name> <name><surname>Zhou</surname> <given-names>W.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2023a</year>). <article-title>&#x0201C;DIRE for diffusion-generated image detection,&#x0201D;</article-title> in <source>Proceedings of the 2023 IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>22388</fpage>&#x02013;<lpage>22398</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV51070.2023.02051</pub-id></citation>
</ref>
<ref id="B106">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Bao</surname> <given-names>J.</given-names></name> <name><surname>Zhou</surname> <given-names>W.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2023b</year>). <source>DIRE for diffusion-generated image detection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/ZhendongWang6/DIRE">https://github.com/ZhendongWang6/DIRE</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B107">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Schuurmans</surname> <given-names>D.</given-names></name> <name><surname>Bosma</surname> <given-names>M.</given-names></name> <name><surname>Ichter</surname> <given-names>B.</given-names></name> <name><surname>Xia</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;Chain-of-Thought prompting elicits reasoning in large language models,&#x0201D;</article-title> in <source>Proceedings of the 36th Neural Information Processing Systems (NeurIPS &#x00027;22)</source> (<publisher-loc>Curran Associates, Inc.</publisher-loc>), <fpage>24824</fpage>&#x02013;<lpage>24837</lpage>.</citation>
</ref>
<ref id="B108">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>J. Z.</given-names></name> <name><surname>Fang</surname> <given-names>G.</given-names></name> <name><surname>Wu</surname> <given-names>H.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Ge</surname> <given-names>Y.</given-names></name> <name><surname>Cun</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Towards a better metric for text-to-video generation</article-title>. <source>arXiv:2401.07781</source>.</citation>
</ref>
<ref id="B109">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>W.</given-names></name> <name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>H.</given-names></name> <name><surname>Gu</surname> <given-names>Y.</given-names></name> <name><surname>Zhao</surname> <given-names>R.</given-names></name> <name><surname>He</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;DatasetDM: synthesizing data with perception annotations using diffusion models,&#x0201D;</article-title> in <source>Proceedings of the 37th International Conference on Neural Information Processing Systems</source> (<publisher-loc>Curran Associates, Inc.</publisher-loc>), <fpage>54683</fpage>&#x02013;<lpage>54695</lpage>.</citation>
</ref>
<ref id="B110">
<citation citation-type="web"><person-group person-group-type="author"><collab>Xidian University Modern Image Processing Lab.</collab></person-group> (<year>2024</year>). <source>Homologous deepfake dataset: A self built small-scale, high-quality, and diverse deepfake dataset</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/mirro-yyf/Homologous_deepfake_dataset">https://github.com/mirro-yyf/Homologous_deepfake_dataset</ext-link> (accessed July, 2024)</citation>
</ref>
<ref id="B111">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Xie</surname> <given-names>Y.</given-names></name> <name><surname>Zhou</surname> <given-names>J.</given-names></name> <name><surname>Lu</surname> <given-names>X.</given-names></name> <name><surname>Jiang</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Cheng</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2023a</year>). <source>FSD: an initial Chinese dataset for fake song detection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/xieyuankun/FSD-Dataset">https://github.com/xieyuankun/FSD-Dataset</ext-link> (accessed July, 2024)</citation>
</ref>
<ref id="B112">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Xie</surname> <given-names>Y.</given-names></name> <name><surname>Zhou</surname> <given-names>J.</given-names></name> <name><surname>Lu</surname> <given-names>X.</given-names></name> <name><surname>Jiang</surname> <given-names>Z.</given-names></name> <name><surname>Yang</surname> <given-names>Y.</given-names></name> <name><surname>Cheng</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2023b</year>). <source>FSD: an initial Chinese dataset for fake song detection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/xieyuankun/FSD-Dataset">https://github.com/xieyuankun/FSD-Dataset</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B113">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yariv</surname> <given-names>G.</given-names></name> <name><surname>Gat</surname> <given-names>I.</given-names></name> <name><surname>Benaim</surname> <given-names>S.</given-names></name> <name><surname>Wolf</surname> <given-names>L.</given-names></name> <name><surname>Schwartz</surname> <given-names>I.</given-names></name> <name><surname>Adi</surname> <given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Diverse and aligned audio-to-video generation via text-to-video model adaptation,&#x0201D;</article-title> in <source>Proceedings of the 38th AAAI Conference on Artificial Intelligence (AAAI &#x00027;24)</source> (<publisher-loc>AAAI</publisher-loc>), <fpage>6639</fpage>&#x02013;<lpage>6647</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v38i7.28486</pub-id></citation>
</ref>
<ref id="B114">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Bai</surname> <given-names>Y.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Ma</surname> <given-names>H.</given-names></name> <name><surname>Tian</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;Half-Truth: a partially fake audio detection dataset,&#x0201D;</article-title> in <source>Proceedings of Interspeech 2021</source>, <fpage>1654</fpage>&#x02013;<lpage>1658</lpage>.</citation>
</ref>
<ref id="B115">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Fu</surname> <given-names>R.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Nie</surname> <given-names>S.</given-names></name> <name><surname>Ma</surname> <given-names>H.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2022a</year>). <article-title>&#x0201C;ADD 2022: the first audio deep synthesis detection challenge,&#x0201D;</article-title> in <source>Proceedings of the 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>9216</fpage>&#x02013;<lpage>9220</lpage>.</citation>
</ref>
<ref id="B116">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Fu</surname> <given-names>R.</given-names></name> <name><surname>Yan</surname> <given-names>X.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;ADD 2023: the second audio deepfake detection challenge,&#x0201D;</article-title> in <source>Proceedings of the Workshop on Deepfake Audio Detection and Analysis</source> (<publisher-loc>CEUR Workshop Proceedings</publisher-loc>), <fpage>125</fpage>&#x02013;<lpage>130</lpage>.</citation>
</ref>
<ref id="B117">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Tian</surname> <given-names>Z.</given-names></name> <name><surname>Fan</surname> <given-names>C.</given-names></name> <name><surname>Ma</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2022b</year>). <source>SceneFake: an initial dataset and benchmarks for scene fake audio detection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/ADDchallenge/SceneFake">https://github.com/ADDchallenge/SceneFake</ext-link> (accessed July, 2024)</citation>
</ref>
<ref id="B118">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>C. Y.</given-names></name> <name><surname>Fan</surname> <given-names>C.</given-names></name> <name><surname>Tian</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>SceneFake: an initial dataset and benchmarks for scene fake audio detection</article-title>. <source>Patt. Recogn</source>. <volume>152</volume>:<fpage>110468</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2024.110468</pub-id></citation>
</ref>
<ref id="B119">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yi</surname> <given-names>Z.</given-names></name> <name><surname>Huang</surname> <given-names>W.-C.</given-names></name> <name><surname>Tian</surname> <given-names>X.</given-names></name> <name><surname>Yamagishi</surname> <given-names>J.</given-names></name> <name><surname>Das</surname> <given-names>R. K.</given-names></name> <name><surname>Kinnunen</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Voice conversion challenge 2020-intra-lingual semi-parallel and cross-lingual voice conversion,&#x0201D;</article-title> in <source>Proceedings of the Joint Workshop for the Blizzard Challenge and Voice Conversion Challenge 2020</source> (<publisher-loc>International Speech Communication Association</publisher-loc>), <fpage>80</fpage>&#x02013;<lpage>98</lpage>. <pub-id pub-id-type="doi">10.21437/VCCBC.2020-14</pub-id></citation>
</ref>
<ref id="B120">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Younus</surname> <given-names>M. A.</given-names></name> <name><surname>Hasan</surname> <given-names>T. M.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Abbreviated view of deepfake videos detection techniques,&#x0201D;</article-title> in <source>Proceedings of the 2020 6th International Engineering Conference</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>115</fpage>&#x02013;<lpage>120</lpage>. <pub-id pub-id-type="doi">10.1109/IEC49899.2020.9122916</pub-id></citation>
</ref>
<ref id="B121">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>P.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Feng</surname> <given-names>X.</given-names></name> <name><surname>Xia</surname> <given-names>Z.</given-names></name></person-group> (<year>2024a</year>). <source>CHEAT</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/botianzhe/CHEAT">https://github.com/botianzhe/CHEAT</ext-link> (accessed July, 2024)</citation>
</ref>
<ref id="B122">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>P.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Feng</surname> <given-names>X.</given-names></name> <name><surname>Xia</surname> <given-names>Z.</given-names></name></person-group> (<year>2024b</year>). <article-title>CHEAT: A large-scale dataset for detecting ChatGPT-written abstracts</article-title>. <source>arXiv:2304.12008</source>.</citation>
</ref>
<ref id="B123">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhai</surname> <given-names>G.</given-names></name> <name><surname>Min</surname> <given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>Perceptual image quality assessment: a survey</article-title>. <source>Sci. China Inf. Sci</source>. <volume>63</volume>, <fpage>1</fpage>&#x02013;<lpage>52</lpage>. <pub-id pub-id-type="doi">10.1007/s11432-019-2757-1</pub-id></citation>
</ref>
<ref id="B124">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Cooper</surname> <given-names>E.</given-names></name> <name><surname>Yamagishi</surname> <given-names>J.</given-names></name> <name><surname>Patino</surname> <given-names>J.</given-names></name> <name><surname>Evans</surname> <given-names>N.</given-names></name></person-group> (<year>2021a</year>). <article-title>&#x0201C;An initial investigation for detecting partially spoofed audio,&#x0201D;</article-title> in <source>Proceedings of Interspeech 2021</source>, <fpage>4264</fpage>&#x02013;<lpage>4268</lpage>. <pub-id pub-id-type="doi">10.21437/Interspeech.2021-738</pub-id></citation>
</ref>
<ref id="B125">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Cooper</surname> <given-names>E.</given-names></name> <name><surname>Yamagishi</surname> <given-names>J.</given-names></name> <name><surname>Patino</surname> <given-names>J.</given-names></name> <name><surname>Evans</surname> <given-names>N.</given-names></name></person-group> (<year>2021b</year>). <source>PartialSpoof</source>. Zenodo dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://zenodo.org/records/5766198">https://zenodo.org/records/5766198</ext-link> (accessed March, 2024)</citation>
</ref>
<ref id="B126">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>T.</given-names></name> <name><surname>Deng</surname> <given-names>L.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Dang</surname> <given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Deep learning in face synthesis: a survey on deepfakes,&#x0201D;</article-title> in <source>Proceedings of the 2020 IEEE 3rd International Conference on Computer and Communication Engineering Technology</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>67</fpage>&#x02013;<lpage>70</lpage>. <pub-id pub-id-type="doi">10.1109/CCET50901.2020.9213159</pub-id><pub-id pub-id-type="pmid">36662116</pub-id></citation></ref>
<ref id="B127">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Ling</surname> <given-names>H.</given-names></name> <name><surname>Gao</surname> <given-names>J.</given-names></name> <name><surname>Yin</surname> <given-names>K.</given-names></name> <name><surname>Lafleche</surname> <given-names>J.-F.</given-names></name> <name><surname>Barriuso</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;DatasetGAN: Efficient labeled data factory with minimal human effort,&#x0201D;</article-title> in <source>Proceedings of the 2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>10140</fpage>&#x02013;<lpage>10150</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.01001</pub-id></citation>
</ref>
<ref id="B128">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Yin</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Yin</surname> <given-names>G.</given-names></name> <name><surname>Yan</surname> <given-names>J.</given-names></name> <name><surname>Shao</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;CelebA-Spoof: large-scale face anti-spoofing dataset with rich annotations,&#x0201D;</article-title> in <source>Proceedings of the 2020 European Conference on Computer Vision</source> (<publisher-loc>Springer</publisher-loc>), <fpage>70</fpage>&#x02013;<lpage>85</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-58610-2_5</pub-id></citation>
</ref>
<ref id="B129">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Xu</surname> <given-names>C.</given-names></name> <name><surname>Sun</surname> <given-names>Y.</given-names></name> <name><surname>Deng</surname> <given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>Deep fake geography? When geospatial data encounter artificial intelligence</article-title>. <source>Cartogr. Geogr. Inf. Sci</source>. <volume>48</volume>, <fpage>338</fpage>&#x02013;<lpage>352</lpage>. <pub-id pub-id-type="doi">10.1080/15230406.2021.1910075</pub-id></citation>
</ref>
<ref id="B130">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Dong</surname> <given-names>Y.</given-names></name></person-group> (<year>2023a</year>). <article-title>EmoFake: an initial dataset for emotion fake audio detection</article-title>. <source>arXiv:2211.05363</source>.</citation>
</ref>
<ref id="B131">
<citation citation-type="web"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>Y.</given-names></name> <name><surname>Yi</surname> <given-names>J.</given-names></name> <name><surname>Tao</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Dong</surname> <given-names>Y.</given-names></name></person-group> (<year>2023b</year>). <source>EmoFake: an initial dataset for emotion fake audio detection</source>. GitHub dataset. Available at: <ext-link ext-link-type="uri" xlink:href="https://github.com/zy511361103/GADE">https://github.com/zy511361103/GADE</ext-link> (accessed July, 2024).</citation>
</ref>
<ref id="B132">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>P.</given-names></name> <name><surname>Han</surname> <given-names>X.</given-names></name> <name><surname>Morariu</surname> <given-names>V. I.</given-names></name> <name><surname>Davis</surname> <given-names>L. S.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Two-stream neural networks for tampered face detection,&#x0201D;</article-title> in <source>Proceedings of the 2017 IEEE Conference on Computer Vision and Pattern Recognition Workshops</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>1831</fpage>&#x02013;<lpage>1839</lpage>. <pub-id pub-id-type="doi">10.1109/CVPRW.2017.229</pub-id></citation>
</ref>
<ref id="B133">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>T.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Liang</surname> <given-names>Z.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Face forensics in the wild,&#x0201D;</article-title> in <source>Proceedings of the 2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>5774</fpage>&#x02013;<lpage>5784</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00572</pub-id></citation>
</ref>
<ref id="B134">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>D.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Shen</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>X.</given-names></name> <name><surname>Elhoseiny</surname> <given-names>M.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;MiniGPT-4: enhancing vision-language understanding with advanced large language models,&#x0201D;</article-title> in <source>Proceedings of the 12th International Conference on Learning Representations</source>. <pub-id pub-id-type="pmid">38329788</pub-id></citation></ref>
<ref id="B135">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>J.-Y.</given-names></name> <name><surname>Park</surname> <given-names>T.</given-names></name> <name><surname>Isola</surname> <given-names>P.</given-names></name> <name><surname>Efros</surname> <given-names>A. A.</given-names></name></person-group> (<year>2017</year>). <article-title>&#x0201C;Unpaired image-to-image translation using cycle-consistent adversarial networks,&#x0201D;</article-title> in <source>Proceedings of the 2017 IEEE International Conference on Computer Vision</source> (<publisher-loc>IEEE</publisher-loc>), <fpage>2242</fpage>&#x02013;<lpage>2251</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV.2017.244</pub-id></citation>
</ref>
<ref id="B136">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zi</surname> <given-names>B.</given-names></name> <name><surname>Chang</surname> <given-names>M.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <name><surname>Ma</surname> <given-names>X.</given-names></name> <name><surname>Jiang</surname> <given-names>Y.-G.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;WildDeepfake: a challenging real-world dataset for deepfake detection,&#x0201D;</article-title> in <source>Proceedings of the 2020 28th ACM International Conference on Multimedia</source> (<publisher-loc>ACM</publisher-loc>), <fpage>2382</fpage>&#x02013;<lpage>2390</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>