<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1377285</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2024.1377285</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>iDNA-OpenPrompt: OpenPrompt learning model for identifying DNA methylation</article-title>
<alt-title alt-title-type="left-running-head">Yu et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fgene.2024.1377285">10.3389/fgene.2024.1377285</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Yu</surname>
<given-names>Xia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2204455/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ren</surname>
<given-names>Jia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Long</surname>
<given-names>Haixia</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zeng</surname>
<given-names>Rao</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1783423/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Guoqiang</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Bilal</surname>
<given-names>Anas</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Cui</surname>
<given-names>Yani</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>School of Information and Communication Engineering</institution>, <institution>Hainan University</institution>, <addr-line>Haikou</addr-line>, <addr-line>Hainan</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Information Science and Technology</institution>, <institution>Hainan Normal University</institution>, <addr-line>Haikou</addr-line>, <addr-line>Hainan</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/437385/overview">Michal Marczyk</ext-link>, Silesian University of Technology, Poland</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/797721/overview">Guosheng Han</ext-link>, Xiangtan University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1064518/overview">Leyi Wei</ext-link>, Shandong University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Yani Cui, <email>cyn0213@163.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>16</day>
<month>04</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1377285</elocation-id>
<history>
<date date-type="received">
<day>27</day>
<month>01</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>07</day>
<month>03</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Yu, Ren, Long, Zeng, Zhang, Bilal and Cui.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Yu, Ren, Long, Zeng, Zhang, Bilal and Cui</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>
<bold>Introduction:</bold> DNA methylation is a critical epigenetic modification involving the addition of a methyl group to the DNA molecule, playing a key role in regulating gene expression without changing the DNA sequence. The main difficulty in identifying DNA methylation sites lies in the subtle and complex nature of methylation patterns, which may vary across different tissues, developmental stages, and environmental conditions. Traditional methods for methylation site identification, such as bisulfite sequencing, are typically labor-intensive, costly, and require large amounts of DNA, hindering high-throughput analysis. Moreover, these methods may not always provide the resolution needed to detect methylation at specific sites, especially in genomic regions that are rich in repetitive sequences or have low levels of methylation. Furthermore, current deep learning approaches generally lack sufficient accuracy.</p>
<p>
<bold>Methods:</bold> This study introduces the iDNA-OpenPrompt model, leveraging the novel OpenPrompt learning framework. The model combines a prompt template, prompt verbalizer, and Pre-trained Language Model (PLM) to construct the prompt-learning framework for DNA methylation sequences. Moreover, a DNA vocabulary library, BERT tokenizer, and specific label words are also introduced into the model to enable accurate identification of DNA methylation sites.</p>
<p>
<bold>Results and Discussion:</bold> An extensive analysis is conducted to evaluate the predictive performance, reliability, and consistency of the iDNA-OpenPrompt model. The experimental outcomes, covering 17 benchmark datasets that include various species and three DNA methylation modifications (4mC, 5hmC, 6mA), consistently indicate that our model surpasses other outstanding approaches in both performance and robustness.</p>
</abstract>
<kwd-group>
<kwd>DNA methylation</kwd>
<kwd>OpenPrompt learning</kwd>
<kwd>prompt template</kwd>
<kwd>prompt verbalizer</kwd>
<kwd>BERT tokenizer</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational Genomics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>DNA methylation is essential for numerous biological processes and is associated with multiple diseases, particularly cancer (<xref ref-type="bibr" rid="B29">Maegawa et al., 2010</xref>; <xref ref-type="bibr" rid="B34">Yehudit and Howard, 2013</xref>). Accurately identifying DNA methylation sites is necessary for comprehending gene regulation and the mechanisms of diseases. Deep learning approaches have recently emerged as a significant tool in recognizing DNA methylation sites, demonstrating encouraging outcomes. Presently, three extensively studied DNA methylation types include N6-methyladenine (6mA), 5-hydroxymethylcytosine (5hmC), and N4-methylcytosine (4mC) (<xref ref-type="bibr" rid="B22">Manavalan et al., 2019</xref>; <xref ref-type="bibr" rid="B37">Yingying et al., 2021</xref>).</p>
<p>The field has recently witnessed notable advancements in integrating deep learning methodologies. Regarding the prediction of DNA methylation sites of 4-mC species, in 2019, introducing two remarkable algorithms, 4mCCNN (<xref ref-type="bibr" rid="B13">Khanal et al., 2019</xref>) and 4mCPred-SVM (<xref ref-type="bibr" rid="B15">Leyi et al., 2019</xref>), marked a leap in 4-mC prediction capabilities. 4mCCNN used a CNN-based framework, whereas 4mCPred-SVM was developed using support vector machine (SVM) techniques. Additionally, <xref ref-type="bibr" rid="B25">Quanzhong et al. (2020)</xref> crafted DeepTorrent, a composite model fusing CNN and BiLSTM, to identify 4-mC sites (<xref ref-type="bibr" rid="B25">Quanzhong et al., 2020</xref>). Deep4mC, another innovative algorithm, validated the effectiveness of a CNN-only approach in delivering impressive 4-mC prediction outcomes (<xref ref-type="bibr" rid="B8">Haodong et al., 2020</xref>). Hyb4mC introduced a unique approach, integrating an elastic net with a capsule network for smaller datasets while emphasizing the prowess of CNN for larger datasets (<xref ref-type="bibr" rid="B35">Ying et al., 2022</xref>). Moreover, Zeng et al. introduced a novel two-layer deep learning structure named Deep4mcPred, based on ResNet with long short-term memory (LSTM) (<xref ref-type="bibr" rid="B26">Rao and Minghong, 2020</xref>). <xref ref-type="bibr" rid="B32">Xia et al. (2023)</xref> presented the DRSN4mCPred model, a variant based on the deep residual network, and it can enhance the model&#x2019;s capability to assimilate intricate data characteristics (<xref ref-type="bibr" rid="B32">Xia et al., 2023</xref>).</p>
<p>The research focusing specifically on recognizing 5hmC sites is comparatively limited. Tran TA et al. applied a unique feature extraction approach using k-mer embeddings obtained from a pre-trained language model (<xref ref-type="bibr" rid="B4">Duong et al., 2021</xref>). The BiLSTM-5mC model leveraged one-hot encoding and nucleotide property and frequency (NPF) techniques for representing nucleotide sequences. It then integrated a bidirectional long short-term memory (BiLSTM) model with a fully connected network to forecast methylation sites (<xref ref-type="bibr" rid="B33">Xin et al., 2021</xref>).</p>
<p>The field has seen considerable research in identifying 6-mA methylation sites. For instance, the sNNRice6mA algorithm adopted a two-dimensional one-hot encoding approach for DNA sequences, using a convolutional neural network (CNN) to identify 6-mA sites (<xref ref-type="bibr" rid="B5">Haitao and Zhiming, 2019</xref>). <xref ref-type="bibr" rid="B36">Ying et al. (2021)</xref> incorporated an attention mechanism into their model, enhancing the identification of critical features for more accurate detection of epigenetic changes in DNA (<xref ref-type="bibr" rid="B36">Ying et al., 2021</xref>). <xref ref-type="bibr" rid="B23">Mehedi et al. (2020)</xref> developed Meta-i6mA, a cross-species predictive framework for 6-mA sites in plant genomes, leveraging informative features in a comprehensive machine learning methodology (<xref ref-type="bibr" rid="B23">Mehedi et al., 2020</xref>). <xref ref-type="bibr" rid="B12">Juntao et al. (2021)</xref> introduced DeepM6ASeq-EL, an advanced method combining LSTM with ensemble learning to predict human m6A sites in RNA with high accuracy (<xref ref-type="bibr" rid="B12">Juntao et al., 2021</xref>). This fusion of techniques significantly boosts the model&#x2019;s prediction accuracy, offering a powerful tool for m6A site identification in the human genome. <xref ref-type="bibr" rid="B30">Sho et al. (2022)</xref> used word to vector (word2vec) and Bidirectional Encoder Representations from Transformers (BERT) for developing BERT6mA, a deep learning framework that showed exceptional performance in predicting 6-mA modifications (<xref ref-type="bibr" rid="B30">Sho et al., 2022</xref>). <xref ref-type="bibr" rid="B27">Ur et al. (2022)</xref> proposed a CapsuleNet-based DNA m6A site recognition framework, proving its precision in methylation site prediction (<xref ref-type="bibr" rid="B27">Ur et al., 2022</xref>). <xref ref-type="bibr" rid="B30">Sho et al. 
(2022)</xref> demonstrated that BERT-based models could significantly enhance the accuracy of predicting 6-mA sites in DNA, effectively handling interspecies variations and serving as a valuable asset for plant genome studies and epigenetic research (<xref ref-type="bibr" rid="B30">Sho et al., 2022</xref>).</p>
<p>Although the methods mentioned earlier have achieved varying degrees of progress, they are all specifically designed to identify one type of DNA methylation. Conversely, there are only a few techniques that address all three previously mentioned methylation categories (<xref ref-type="bibr" rid="B21">Lv et al., 2020</xref>; <xref ref-type="bibr" rid="B37">Yingying et al., 2021</xref>; <xref ref-type="bibr" rid="B11">Junru et al., 2022</xref>), with notable examples being iDNA-ABT (<xref ref-type="bibr" rid="B37">Yingying et al., 2021</xref>), iDNA-ABF (<xref ref-type="bibr" rid="B11">Junru et al., 2022</xref>), and iDNA-MS (<xref ref-type="bibr" rid="B21">Lv et al., 2020</xref>). Typically, DNA methylation datasets appropriate for deep learning contain shorter sequences per sample, with sequences of 41 base pairs (bp) being predominantly prevalent.</p>
<p>Many studies indicate a growing interest in using deep learning to predict DNA methylation, achieving significant progress in enhancing prediction accuracy (<xref ref-type="bibr" rid="B31">Wang et al., 2023</xref>). However, current deep learning-based models have not completely exploited the capabilities of learning features. Acknowledging this gap, the genomic sequences can be viewed as biological texts, and the sequences&#x2019; bases can be considered biological words (<xref ref-type="bibr" rid="B41">Zou et al., 2019</xref>; <xref ref-type="bibr" rid="B1">Dai et al., 2022</xref>). Considering this, we propose the iDNA-OpenPrompt model, an OpenPrompt learning approach (<xref ref-type="bibr" rid="B3">Ding et al., 2021</xref>) for DNA methylation sequences. The model combines a prompt template, prompt verbalizer, and pre-trained language model (PLM) to construct a prompt learning framework.</p>
<p>Moreover, a DNA vocabulary library, BERT tokenizer, and specific label words are also introduced into the model to enable accurate identification of DNA methylation sites. An extensive analysis is conducted to evaluate the predictive performance, reliability, and consistency of the iDNA-OpenPrompt model. The results, which include 17 benchmark datasets covering a variety of species and three types of DNA methylation modifications (4&#xa0;mC, 5&#xa0;hmC, and 6&#xa0;mA), consistently reveal that our model surpasses other outstanding methods in both performance metrics and overall robustness.</p>
<p>The primary contribution of this article is that the iDNA-OpenPrompt model can learn biological contextual semantics. In contrast to the existing approaches, iDNA-OpenPrompt brings the following contributions:<list list-type="simple">
<list-item>
<p>(1) Our model creates a DNA vocabulary library and integrates it with the BERT tokenizer for DNA methylation sequences to develop the prompt template.</p>
</list-item>
<list-item>
<p>(2) Our model constructs label words specific to DNA methylation sequences and integrates them with the BERT tokenizer to establish a prompt verbalizer.</p>
</list-item>
<list-item>
<p>(3) Our model constructs an OpenPrompt learning model that can be used for identifying DNA methylation sites.</p>
</list-item>
</list>
</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 Dataset</title>
<p>For the iDNA-OpenPrompt model&#x2019;s evaluation, the datasets are selected from the iDNA-MS web server (<xref ref-type="bibr" rid="B10">iDNA-MS, 2020</xref>), including training and independent testing subsets, as detailed in <xref ref-type="table" rid="T1">Table 1</xref>. There are 4mC, 5hmC, 6mA methylation sequences, totaling 17 datasets, encompassing 501,200 DNA sequences. The length of each sample in the datasets is 41 base pairs. It is worth mentioning that in the 6mA samples, the methylated adenine (A) is always found in the central position, and similarly, methylated cytosine (C) is prominent in the 5hmC and 4mC samples. Indeed, such central position characteristics are also present in the negative samples.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Overview of datasets.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">ID</th>
<th rowspan="2" align="left">Dataset</th>
<th colspan="2" align="left">Training</th>
<th colspan="2" align="left">Independent testing</th>
</tr>
<tr>
<th align="left">Positive</th>
<th align="left">Negative</th>
<th align="left">Positive</th>
<th align="left">Negative</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">1</td>
<td align="left">4mC_C.equisetifolia</td>
<td align="left">183</td>
<td align="left">183</td>
<td align="left">183</td>
<td align="left">183</td>
</tr>
<tr>
<td align="left">2</td>
<td align="left">4mC_F.vesca</td>
<td align="left">7,899</td>
<td align="left">7,899</td>
<td align="left">7,898</td>
<td align="left">7,898</td>
</tr>
<tr>
<td align="left">3</td>
<td align="left">4mC_S.cerevisiae</td>
<td align="left">990</td>
<td align="left">990</td>
<td align="left">989</td>
<td align="left">989</td>
</tr>
<tr>
<td align="left">4</td>
<td align="left">4mC_Tolypocladium</td>
<td align="left">7,664</td>
<td align="left">7,664</td>
<td align="left">7,663</td>
<td align="left">7,663</td>
</tr>
<tr>
<td align="left">5</td>
<td align="left">5hmC_H.sapiens</td>
<td align="left">1,172</td>
<td align="left">1,172</td>
<td align="left">1,172</td>
<td align="left">1,172</td>
</tr>
<tr>
<td align="left">6</td>
<td align="left">5hmC_M.musculus</td>
<td align="left">1,840</td>
<td align="left">1,840</td>
<td align="left">1,839</td>
<td align="left">1,839</td>
</tr>
<tr>
<td align="left">7</td>
<td align="left">6mA_A.thaliana</td>
<td align="left">15,937</td>
<td align="left">15,937</td>
<td align="left">15,936</td>
<td align="left">15,936</td>
</tr>
<tr>
<td align="left">8</td>
<td align="left">6mA_C.elegans</td>
<td align="left">3,981</td>
<td align="left">3,981</td>
<td align="left">3,980</td>
<td align="left">3,980</td>
</tr>
<tr>
<td align="left">9</td>
<td align="left">6mA_C.equisetifolia</td>
<td align="left">3,033</td>
<td align="left">3,033</td>
<td align="left">3,033</td>
<td align="left">3,033</td>
</tr>
<tr>
<td align="left">10</td>
<td align="left">6mA_D.melanogaster</td>
<td align="left">5,596</td>
<td align="left">5,596</td>
<td align="left">5,595</td>
<td align="left">5,595</td>
</tr>
<tr>
<td align="left">11</td>
<td align="left">6mA_F.vesca</td>
<td align="left">1,551</td>
<td align="left">1,551</td>
<td align="left">1,551</td>
<td align="left">1,551</td>
</tr>
<tr>
<td align="left">12</td>
<td align="left">6mA_H.sapiens</td>
<td align="left">9,168</td>
<td align="left">9,168</td>
<td align="left">9,167</td>
<td align="left">9,167</td>
</tr>
<tr>
<td align="left">13</td>
<td align="left">6mA_R.chinensis</td>
<td align="left">300</td>
<td align="left">300</td>
<td align="left">300</td>
<td align="left">300</td>
</tr>
<tr>
<td align="left">14</td>
<td align="left">6mA_S.cerevisiae</td>
<td align="left">1,893</td>
<td align="left">1,893</td>
<td align="left">1,893</td>
<td align="left">1,893</td>
</tr>
<tr>
<td align="left">15</td>
<td align="left">6mA_T.thermophile</td>
<td align="left">53,800</td>
<td align="left">53,800</td>
<td align="left">53,800</td>
<td align="left">53,800</td>
</tr>
<tr>
<td align="left">16</td>
<td align="left">6mA_Tolypocladium</td>
<td align="left">1,690</td>
<td align="left">1,690</td>
<td align="left">1,689</td>
<td align="left">1,689</td>
</tr>
<tr>
<td align="left">17</td>
<td align="left">6mA_Xoc BLS256</td>
<td align="left">8,608</td>
<td align="left">8,608</td>
<td align="left">8,607</td>
<td align="left">8,607</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>
<xref ref-type="table" rid="T1">Table 1</xref> includes a &#x201c;dataset&#x201d; column, which lists the names of the various datasets. Within these names, the part before the &#x201c;_&#x201d; separator signifies the methylation modification type, and the segment following the separator denotes the species type. The &#x201c;training&#x201d; and &#x201c;testing&#x201d; columns provide detailed information about the quantity of positive and negative samples within each dataset.</p>
</sec>
<sec id="s2-2">
<title>2.2 Overview of iDNA-OpenPrompt</title>
<p>
<xref ref-type="fig" rid="F1">Figure 1</xref> displays the overall structure of the iDNA-OpenPrompt model. The core module of the iDNA-OpenPrompt model (prompt model) mainly consists of three parts: the prompt template, prompt verbalizer, and PLM. The prompt template part involves building a DNA vocabulary library and training it in the transformer&#x2019;s BERT tokenizer to form the prompt template. In the prompt verbalizer part, label words for DNA methylation sequences are created, and the constructed label words, along with the transformer&#x2019;s BERT tokenizer, are used to build a prompt verbalizer in the manual verbalizer method of OpenPrompt learning. The BERT model, which can capture bidirectional contextual information in the text, is used for the PLM part. Below, the key technologies of the iDNA-OpenPrompt model will be introduced.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Overall architecture of the iDNA-OpenPrompt Model.</p>
</caption>
<graphic xlink:href="fgene-15-1377285-g001.tif"/>
</fig>
</sec>
<sec id="s2-3">
<title>2.3 Prompt learning</title>
<p>In a standard prompt learning setting, like in natural language processing (NLP) tasks, input sentences are structured through a natural language template. This process frames text classification tasks as cloze-style tasks (<xref ref-type="bibr" rid="B40">Zhu et al., 2023</xref>). For example, in a task of classification, the goal is to categorize the sentence x into various topics, such as &#x201c;I must reduce the budget&#x201d; into the label. <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> or <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the template could be expressed as Eq. (<xref ref-type="disp-formula" rid="e1">1</xref>):<disp-formula id="e1">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>a</mml:mi>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>K</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>q</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
<p>Given an input <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mi mathvariant="normal">n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, categorized into a label y from the set of labels Y, the corresponding label word set is represented as <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. Here, <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a subset of the vocabulary V and associated with the y category. In PLMs, denoted as P, the probability of each word <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:mi mathvariant="normal">v</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> in <inline-formula id="inf7">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>y</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> being used to fill in the [MASK] is represented by <inline-formula id="inf8">
<mml:math id="m9">
<mml:mrow>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mtext>MASK</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">V</mml:mi>
<mml:mi mathvariant="normal">y</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mi mathvariant="normal">p</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. As a result, the text classification task is reformulated by calculating the probabilities of label words. This computation is formulated as Eq. (<xref ref-type="disp-formula" rid="e2">2</xref>):<disp-formula id="e2">
<mml:math id="m10">
<mml:mrow>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi mathvariant="normal">y</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="normal">Y</mml:mi>
<mml:mo>&#x7c;</mml:mo>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mtext>MASK</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">V</mml:mi>
<mml:mi mathvariant="normal">y</mml:mi>
</mml:msub>
<mml:mo>&#x7c;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mi mathvariant="normal">p</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>In this example, if the determined probability for <inline-formula id="inf9">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, corresponding to <inline-formula id="inf10">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>U</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>E</mml:mi>
<mml:mi>S</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, exceeds that of <inline-formula id="inf11">
<mml:math id="m13">
<mml:mrow>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for <inline-formula id="inf12">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>T</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, it suggests that the sentence x belongs to the BUSINESS category.</p>
</sec>
<sec id="s2-4">
<title>2.4 OpenPrompt</title>
<p>OpenPrompt (<xref ref-type="bibr" rid="B3">Ding et al., 2021</xref>) is an open-source toolkit designed for prompt learning, offering both ease of use and extensibility. It effectively modularizes the entire prompt learning framework and considers the interactions between various modules. OpenPrompt enables the versatile integration of different task formats, PLMs, and prompting modules. An instance of this flexibility is the straightforward adaptation of prefix-tuning (<xref ref-type="bibr" rid="B18">Li and Liang, 2021</xref>) for text classification tasks within OpenPrompt. This capability allows users to evaluate the broad applicability of their prompt learning models across different tasks rather than just focusing on performance in specific tasks.</p>
<p>In OpenPrompt, the template class is specifically used to create or define textual or soft-encoding templates encapsulating the original input. The templates are pivotal in constructing and formatting input data for effective interaction with PLMs (<xref ref-type="bibr" rid="B6">Han et al., 2021</xref>). They can wrap original text data into a format that aligns with the structure of PLMs. Templates can add extra contextual information to aid the model in more effectively comprehending and handling the input data. The verbalizer bridges PLMs and specific task requirements, offering a flexible and effective way to customize model outputs.</p>
</sec>
<sec id="s2-5">
<title>2.5 Prompt template</title>
<p>The prompt template is to construct a prompt framework, which involves formatting the original input data (such as sentences or paragraphs) into a specific structure, making it more suitable for understanding and processing by PLMs. One or more mask tokens are often inserted (for example, the [MASK] token used in BERT).</p>
<p>Various studies have explored different types of templates. For instance, there are manually written templates (<xref ref-type="bibr" rid="B28">Schick and Sch&#xfc;tze, 2020</xref>) and purely soft templates (<xref ref-type="bibr" rid="B14">Lester et al., 2021</xref>). <xref ref-type="bibr" rid="B19">Liu et al. (2023)</xref> demonstrated effective results by keeping manual tokens unchanged while fine-tuning a smaller portion (<xref ref-type="bibr" rid="B19">Liu et al., 2023</xref>). <xref ref-type="bibr" rid="B7">Han et al. (2022)</xref> used contextualized templates, necessitating the addition of specific entities to create complete templates. Additionally, their approach to loss calculation involved using outputs from various positions (<xref ref-type="bibr" rid="B7">Han et al., 2022</xref>). <xref ref-type="bibr" rid="B20">Logan IV et al. (2021)</xref> introduced an empty template, a straightforward combination of the input data, and a subsequent [MASK] token (<xref ref-type="bibr" rid="B20">Logan IV et al., 2021</xref>).</p>
<p>Within the iDNA-OpenPrompt model, the manual template, which is trainable using task-specific datasets, is used. This manual template enables the precise construction of templates based on one&#x2019;s understanding of the task and specific requirements, and it can simplify the model training process and reduce the demand for computational resources. The template mainly consists of two modules: creating a DNA vocabulary library and the BERT tokenizer.</p>
<sec id="s2-5-1">
<title>2.5.1 Creation of the DNA vocabulary</title>
<p>When creating a vocabulary library for DNA methylation sequences, unlike in traditional NLP tasks, the presence of one, two, or even three nucleobases in a sequence does not necessarily indicate a DNA methylation site. Considering the categories of DNA methylation (4&#xa0;mC, 5&#xa0;hmC, and 6&#xa0;mA) and the nucleobase composition for each, we propose using DNA vocabulary for DNA methylation sequences in the prompt template. Here, the length of nucleobase sequences (A, T, G, and C) is defined as kmer &#x3d; 1, 2, 3, 4, 5, and 6, to form the DNA methylation sequence vocabulary. For example, at kmer &#x3d; 1, the template includes four nucleobase words: A, T, G, and C. At kmer &#x3d; 2, there are 16 nucleobase words, such as AA, AT, AG, <inline-formula id="inf13">
<mml:math id="m15">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, and CC. Similarly, for kmer &#x3d; 3, there are 64 nucleobase words; for kmer &#x3d; 4, there are 256 nucleobase words; for kmer &#x3d; 5, there are 1,024 nucleobase words; and for kmer &#x3d; 6, there are 4,096 nucleobase words. The maximum k-mer value in this prompt template is set to 6 because, in DNA methylation sequences, 6&#xa0;mA methylation involves attaching a methyl group to the sixth nitrogen atom of the adenine nucleobase. Therefore, the DNA vocabulary library contains a total of 5,460 nucleobase words. After creating the vocabulary library, the BERT tokenizer is used to generate the tokenizer of the iDNA-OpenPrompt model.</p>
</sec>
<sec id="s2-5-2">
<title>2.5.2 BERT tokenizer</title>
<p>BERT tokenizer is designed explicitly for the BERT model and is pivotal in NLP tasks. The DNA vocabulary processed by the BERT tokenizer enables the raw text to be transformed into a format effectively handled by OpenPrompt learning. It breaks down basic text strings into smaller units, tokens, words, subwords, or symbols. To accommodate the needs of the BERT model, the BERT tokenizer automatically adds unique tokens such as the start of the sequence token [CLS], separator token [SEP], and padding token [PAD]. It creates an attention mask to indicate which tokens are meaningful and which are for padding. The BERT tokenizer provides essential text processing capabilities for the use of the iDNA-OpenPrompt model.</p>
</sec>
</sec>
<sec id="s2-6">
<title>2.6 Prompt verbalizer</title>
<p>In OpenPrompt, the verbalizer plays an important role, especially when applying PLMs to downstream tasks. The primary function of the verbalizer is to map labels to the vocabulary; the verbalizer maps task-specific labels (such as category labels in classification tasks) to words within the pre-trained model&#x2019;s vocabulary. This mapping allows the model to associate its outputs with specific labels.</p>
<p>Like prompt templates, prompt verbalizer classes derive from a shared base class featuring necessary attributes and essential abstract methods. Beyond the manually defined verbalizer, OpenPrompt includes automated options like the automatic verbalizer and knowledgeable verbalizer (<xref ref-type="bibr" rid="B9">Hu et al., 2021</xref>). Critical processes such as calibrations (<xref ref-type="bibr" rid="B39">Zhao et al., 2021</xref>) are also incorporated in OpenPrompt. In the iDNA-OpenPrompt model, a manual verbalizer is chosen for the prompt verbalizer; the manual verbalizer mainly consists of two modules: label words and BERT tokenizer.</p>
<sec id="s2-6-1">
<title>2.6.1 Label words</title>
<p>Labeling words is a crucial attribute in the manual verbalizer component within the OpenPrompt framework. These words or phrases are used as label words to interpret and transform the model&#x2019;s output.</p>
<p>In this study, the method for constructing label words is as follows: for DNA methylation sequences and non-methylation sequences, centering around the 21st nucleobase of the sequences, kmer &#x3d; 6 encoding is performed on the nucleobase sequences on both sides of the central nucleobase and the encoded words as label words. In all 4-mC sequences (including positive and negative samples), the 21st nucleobase is always C; in all 5-hmC sequences, it is C, and in all 6-mA sequences, it is A.</p>
<p>The words encoded from the positive samples in the DNA methylation sequence dataset are used as positive-sample label words. In contrast, those encoded from the negative samples are used as negative-sample label words.</p>
<p>For example, consider a positive sample from the 4-mC category of the 4&#xa0;mC_F.vesca species, &#x201c;GAA&#x200b;GCA&#x200b;AAA&#x200b;ATC&#x200b;GGA&#x200b;AAA&#x200b;CCC&#x200b;A <inline-formula id="inf14">
<mml:math id="m16">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> CTTTTGGTT&#x201d;: the possible positive sample label words that can be constructed are as follows: &#x201c;GAAGCA, AAGCAA, AGCAAA, GCAAAA, <inline-formula id="inf15">
<mml:math id="m17">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, AAAACC, AGAAAA, GAAAAT, AAAATT, <inline-formula id="inf16">
<mml:math id="m18">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, TTGGTT&#x201d;. Similarly, a negative sample was taken from the 4-mC category of the 4&#xa0;mC_F.vesca species, &#x201c;TGC&#x200b;ATA&#x200b;CTT&#x200b;TCA&#x200b;GTA&#x200b;GTT&#x200b;TTC&#x200b;AAT <inline-formula id="inf17">
<mml:math id="m19">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> ATGGCAGT&#x201d;: the negative sample label words that can be constructed are as follows: &#x201c;TGCATA, GCATAC, CATACT, ATACTT, <inline-formula id="inf18">
<mml:math id="m20">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, AGTTTT, AATGCA, ATGCAT, TGCATT, <inline-formula id="inf19">
<mml:math id="m21">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, GGCAGT&#x201d;. To understand the process of constructing label_words for DNA methylation sequences, <xref ref-type="fig" rid="F2">Figure 2</xref> illustrates its schematic diagram.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Schematic diagram of label_words for DNA methylation sequences.</p>
</caption>
<graphic xlink:href="fgene-15-1377285-g002.tif"/>
</fig>
</sec>
</sec>
<sec id="s2-7">
<title>2.7 PLM</title>
<p>The PLM of iDNA-OpenPrompt is the BERT model. The application of the BERT model in OpenPrompt follows the fundamental principles and structure of the BERT model (<xref ref-type="bibr" rid="B2">Devlin et al., 2018</xref>) while adapting and extending it within the framework of prompt learning. The core of the BERT model is the encoder part of the transformer, which comprises multiple encoder layers, each containing self-attention mechanisms and feed-forward neural networks. One of the primary attributes of BERT is its ability to generate bidirectional contextualized word embeddings, signifying that it considers the context of the entire sentence when processing each word. To learn deep language representations, the BERT model undergoes pre-training on an extensive corpus, including tasks like the masked language model (MLM) and next sentence prediction (NSP).</p>
<sec id="s2-7-1">
<title>2.7.1 Attention calculation</title>
<p>The dot product between the query vector (Q) and key vector (K) is computed, and the result is scaled down by a scaling factor (commonly the inverse square root of the key vectors&#x2019; dimension) to prevent overly large attention scores. The attention scores are then subjected to a softmax operation for normalization into attention weights. A weighted sum over the value vectors (V) is then performed using these weights, resulting in the final attention representation. The formulaic representation of self-attention is expressed as Eq. (<xref ref-type="disp-formula" rid="e3">3</xref>) and (<xref ref-type="disp-formula" rid="e4">4</xref>):<disp-formula id="e3">
<mml:math id="m22">
<mml:mrow>
<mml:mfenced open="{" close="" separators="|">
<mml:mrow>
<mml:mtable columnalign="center">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>Q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>Q</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>K</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>X</mml:mi>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>V</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m23">
<mml:mrow>
<mml:mtext>Self</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>attention</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="normal">Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">V</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>softmax</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">Q</mml:mi>
<mml:msup>
<mml:mi mathvariant="normal">K</mml:mi>
<mml:mi mathvariant="normal">T</mml:mi>
</mml:msup>
</mml:mrow>
<mml:msqrt>
<mml:msub>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi mathvariant="normal">k</mml:mi>
</mml:msub>
</mml:msqrt>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi mathvariant="normal">V</mml:mi>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>In this context, <inline-formula id="inf20">
<mml:math id="m24">
<mml:mrow>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mo>&#x2a;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi mathvariant="normal">m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> symbolizes the embedding output obtained from the embedding module, where <inline-formula id="inf21">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> indicates the embedding dimension and L represents the input sequence&#x2019;s length. Q, K, and <inline-formula id="inf22">
<mml:math id="m26">
<mml:mrow>
<mml:mi mathvariant="normal">V</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi mathvariant="normal">k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> correspond to the matrices of the query, key, and value, respectively. These matrices are derived from X through a linear transformation using <inline-formula id="inf23">
<mml:math id="m27">
<mml:mrow>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>Q</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf24">
<mml:math id="m28">
<mml:mrow>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>K</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf25">
<mml:math id="m29">
<mml:mrow>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>V</mml:mi>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, each existing in the real space <inline-formula id="inf26">
<mml:math id="m30">
<mml:mrow>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>&#x2a;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Here, <inline-formula id="inf27">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the size of the query, key, and value vectors. <inline-formula id="inf28">
<mml:math id="m32">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf29">
<mml:math id="m33">
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are both regarded as hyperparameters.</p>
</sec>
<sec id="s2-7-2">
<title>2.7.2 Multi-head attention</title>
<p>The computation of the attention head specified by index &#x201c;i&#x201d; is as shown in Eq. (<xref ref-type="disp-formula" rid="e5">5</xref>), (<xref ref-type="disp-formula" rid="e6">6</xref>) and (<xref ref-type="disp-formula" rid="e7">7</xref>):<disp-formula id="e5">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">Q</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:msubsup>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">Q</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">K</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:msubsup>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">K</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">V</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="normal">X</mml:mi>
<mml:msubsup>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">V</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">h</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<disp-formula id="e6">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>f</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>Q</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>K</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
<disp-formula id="e7">
<mml:math id="m36">
<mml:mrow>
<mml:mtext>MultiHead</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>Attention</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="normal">Q</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">K</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="normal">V</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>Concat</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mtext>Head</mml:mtext>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>Head</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mtext>Head</mml:mtext>
<mml:mi mathvariant="normal">h</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msup>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mi mathvariant="normal">O</mml:mi>
</mml:msup>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
<inline-formula id="inf30">
<mml:math id="m37">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">Q</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf31">
<mml:math id="m38">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">K</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, and <inline-formula id="inf32">
<mml:math id="m39">
<mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="normal">W</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mi mathvariant="normal">V</mml:mi>
</mml:msubsup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi mathvariant="normal">m</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi mathvariant="normal">k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> are the query, key, and value matrices for the i-th head, respectively. The parameter &#x2018;h&#x2019; denotes the count of heads. The multi-head attention is used for Q, K, and V by concatenating &#x2018;h&#x2019; individual heads, with each performing self-attention relevant to the input sequence. Furthermore, <inline-formula id="inf33">
<mml:math id="m40">
<mml:mrow>
<mml:msup>
<mml:mi>W</mml:mi>
<mml:mi>O</mml:mi>
</mml:msup>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> acts as a linear transformation matrix, adjusting the dimensions of the multi-head attention&#x2019;s output to align with the input dimensions of the encoder block. This enables a skip connection, where the input for the encoder block is linked to the output from the multi-head attention mechanism.</p>
<p>In OpenPrompt, the BERT model is commonly used with templates and verbalizers. Prompt templates are designed to construct input formats suitable for processing by BERT. In contrast, prompt verbalizers are used to map the output of models to specific task labels by leveraging the advanced language understanding capabilities of the BERT model, which can strengthen the function of OpenPrompt models within a variety of NLP tasks.</p>
</sec>
</sec>
</sec>
<sec id="s3">
<title>3 Performance metrics</title>
<p>The performance of the iDNA-OpenPrompt model, along with other DNA methylation recognition models (<xref ref-type="bibr" rid="B38">Zeng and Liao, 2021</xref>; <xref ref-type="bibr" rid="B16">Li F. et al., 2023</xref>; <xref ref-type="bibr" rid="B17">Li Q. et al., 2023</xref>), is evaluated using the following five commonly used metrics: accuracy (ACC), sensitivity (SN), specificity (SP), Matthews&#x2019; correlation coefficient (MCC), and area under curve (AUC). The equations for these measurements are expressed in Eq. <xref ref-type="disp-formula" rid="e8">8</xref> to Eq. <xref ref-type="disp-formula" rid="e12">12</xref> below:<disp-formula id="e8">
<mml:math id="m41">
<mml:mrow>
<mml:mtext>ACC</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>TN</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>TN</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
<disp-formula id="e9">
<mml:math id="m42">
<mml:mrow>
<mml:mtext>SN</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mtext>TP</mml:mtext>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
<disp-formula id="e10">
<mml:math id="m43">
<mml:mrow>
<mml:mtext>SP</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mtext>TN</mml:mtext>
<mml:mrow>
<mml:mtext>TN</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
<disp-formula id="e11">
<mml:math id="m44">
<mml:mrow>
<mml:mtext>MCC</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>TN</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>FP</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtext>TP</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtext>TN</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FP</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mtext>TN</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>FN</mml:mtext>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
<disp-formula id="e12">
<mml:math id="m45">
<mml:mrow>
<mml:mtext>AUC</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mtext>pos</mml:mtext>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mtext>rank</mml:mtext>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mtext>num</mml:mtext>
<mml:mtext>pos</mml:mtext>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mtext>num</mml:mtext>
<mml:mtext>pos</mml:mtext>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mtext>num</mml:mtext>
<mml:mtext>pos</mml:mtext>
</mml:msub>
<mml:msub>
<mml:mtext>num</mml:mtext>
<mml:mtext>neg</mml:mtext>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>Here, TP, FN, TN, and FP denote the counts of true positive, false negative, true negative, and false positive instances, respectively. ACC and MCC are both used for gauging the model&#x2019;s comprehensive performance. SN pertains to the ratio of accurately predicted samples correctly identified as methylated with the predictor, while SP quantifies the proportion of accurately predicted non-methylated samples with the predictor. The AUC is determined as the region enclosed between the receiver operating characteristic (ROC) curve and the coordinate plane, where the false positive rate (FPR) is plotted on the <italic>x</italic>-axis, and the true positive rate (TPR) is plotted on the <italic>y</italic>-axis. In total, an increase in these metrics signifies an improved model performance.</p>
</sec>
<sec sec-type="results" id="s4">
<title>4 Results</title>
<sec id="s4-1">
<title>4.1 The visualization of UMAP for samples of iDNA-OpenPrompt</title>
<p>To visually demonstrate the iDNA-OpenPrompt&#x2019;s performance, Uniform Manifold Approximation and Projection (UMAP) (<xref ref-type="bibr" rid="B11">Junru et al., 2022</xref>) displays the distribution of samples with and without methylation sites. UMAP is a sophisticated non-linear method for reducing dimensionality that effectively maps high-dimensional data into a more manageable two-dimensional space, preserving local and global data point structures.</p>
<p>As seen in <xref ref-type="fig" rid="F3">Figure 3</xref>, blue corresponds to non-DNA methylation (negatives), while red corresponds to DNA methylation (positives). The figures of (a-1) and (b-1) display the visualization of DNA methylation and non-methylation sequence samples without model processing; positive and negative samples appear mixed. The figures of (a-2) and (b-2) exhibit the visualization of DNA methylation and non-methylation sequence samples after iDNA-OpenPrompt model processing; and the positive and negative samples distinctly separate into well-defined groups. This separation visually confirms the model&#x2019;s capacity to differentiate between DNA methylation and non-DNA methylation samples effectively.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Representing samples before and after using the iDNA-OpenPrompt model with UMAP. <bold>(A)</bold> UMAP visualization of samples before and after processing with the iDNA-OpenPrompt model for the species 5hmC_M.musculus and 5hmC_H.sapiens. <bold>(B)</bold> UMAP visualization of samples before and after processing with the iDNA-OpenPrompt model for the species 4mC_cerevisiae and 4mC_C.equisetifolia. <bold>(C)</bold> UMAP visualization of samples before and after processing with the iDNA-OpenPrompt model for the species 6mA_F.vesca and 6mA_Tolypocladium. In Panels (A&#x2013;C) (a-1) and (b-1) show the samples before processing with the model, while (a-2) and (b-2) show the samples after processing with the model.</p>
</caption>
<graphic xlink:href="fgene-15-1377285-g003.tif"/>
</fig>
</sec>
<sec id="s4-2">
<title>4.2 Comparison of iDNA-OpenPrompt&#x2019;s performance with other outstanding methods</title>
<p>To evaluate the performance of iDNA-OpenPrompt, the comparative study is conducted against four outstanding predictors, including iDNA-ABT (<xref ref-type="bibr" rid="B37">Yingying et al., 2021</xref>), iDNA-ABF (<xref ref-type="bibr" rid="B11">Junru et al., 2022</xref>), iDNA-MS (<xref ref-type="bibr" rid="B21">Lv et al., 2020</xref>), and MM-6mAPred (<xref ref-type="bibr" rid="B24">Pian et al., 2020</xref>). iDNA-ABT, iDNA-ABF, and iDNA-MS are designed for various methylation prediction tasks, whereas MM-6mAPred was initially tailored for 6-mA site prediction. This comparison highlights iDNA-OpenPrompt&#x2019;s adaptability and its capability, not just limited to 6&#xa0;mA but also extending to 5hmC and 4&#xa0;mC. Each of these predictors is independently trained on 17 distinct training datasets encompassing three methylation types, and then, its corresponding test dataset is evaluated (details are provided in <xref ref-type="table" rid="T1">Table 1</xref>). The outcomes, encompassing metrics such as ACC, SN, SP, AUC, and MCC, are depicted in <xref ref-type="fig" rid="F4">Figure 4A&#x2013;E</xref>. The data clearly show that the proposed model consistently surpasses the performance of four other exceptional predictors across all 17 datasets. The effectiveness of the proposed model can be attributed to its utilization of the OpenPrompt learning framework, which has proven to be highly effective in enhancing its performance, along with the outstanding performance of the prompt template and prompt verbalizer specifically designed for DNA methylation sequences.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Comparing Performance of iDNA-OpenPrompt with other outstanding methods. <bold>(A)</bold> the ACC of iDNA-OpenPrompt with other outstanding methods, <bold>(B)</bold> the SN of iDNA-OpenPrompt with other outstanding methods, <bold>(C)</bold> the SP of iDNA-OpenPrompt with other outstanding methods, <bold>(D)</bold> the AUC of iDNA-OpenPrompt with other outstanding methods, <bold>(E)</bold> the MCC of iDNA-OpenPrompt with other outstanding methods. The evaluation metrics displayed above (ACC, SN, SP, AUC, MCC) are the results of testing the iDNA-OpenPrompt, iDNA-ABT, iDNA-ABF, iDNA-MS, and MM-6mAPred models on datasets of 17 species.</p>
</caption>
<graphic xlink:href="fgene-15-1377285-g004.tif"/>
</fig>
</sec>
<sec id="s4-3">
<title>4.3 Successful cross-species validation results</title>
<p>To assess the proposed model&#x2019;s adaptability across different species, it is imperative to gauge a model&#x2019;s ability to be trained on data from one species and then used to detect modification sites in others. With this goal in mind, we have developed distinct models, each customized for a specific species; the effectiveness of these models is ascertained by applying them to other species for 4mC, 5hmC, and 6mA modifications. The outcomes of this validation procedure across different species are visually represented in <xref ref-type="fig" rid="F5">Figure 5</xref>.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>The heat map of cross-validation. <bold>(A)</bold> The cross-validation accuracy results for DNA methylation 5hmC in two species. <bold>(B)</bold> The cross-validation accuracy results for DNA methylation 4mC in four species. <bold>(C)</bold> The cross-validation accuracy results for DNA methylation 6mA in eleven species. In the figures, the species datasets indicated on the horizontal axis are used for training, and the species datasets indicated on the vertical axis are used for testing.</p>
</caption>
<graphic xlink:href="fgene-15-1377285-g005.tif"/>
</fig>
<p>Considering the significant discrepancy in the quantity of training and testing samples for various species, with some species having only a few hundred samples and others reaching over a hundred thousand, we aim for fairness in cross-validation. Therefore, from the datasets of all species, we randomly selected 365 samples for the model&#x2019;s cross-validation. This selection comprised 183 positive samples and 182 negative samples. The cross-validation outcomes are depicted in <xref ref-type="fig" rid="F5">Figure 5</xref>.</p>
<p>
<xref ref-type="fig" rid="F5">Figure 5A</xref> reveals the results of cross-species validation of 5hmC_<italic>H. sapiens</italic> and 5hmC_<italic>M. musculus</italic>. Specifically, the accuracy rate attained for 5hmC_<italic>H. sapiens</italic> and 5hmC_<italic>M. musculus</italic> is 98.09%, underscoring the success of the proposed method. <xref ref-type="fig" rid="F5">Figure 5C</xref> reveals that in the 6mA_R.chinensis model&#x2019;s cross-validation, the accuracy for 6mA_R.chinensis is less than that for 6mA_T.thermophile, which indicates suboptimal results. However, the cross-validation of other species was performed satisfactorily. We can confidently deploy the proposed model, assuring its high-quality performance in identifying DNA methylation sites across different species, indicating that the proposed model has strong cross-validation performance.</p>
</sec>
<sec id="s4-4">
<title>4.4 The impact of the DNA vocabulary and label_words on model accuracy</title>
<p>To verify the algorithm&#x2019;s effectiveness proposed in this article, the length of the DNA vocabulary library in the prompt template and the nucleotide length of the words in the label_words of the prompt verbalizer are changed to test their impact on the proposed model. In the following experiments, the nucleotide length in the DNA vocabulary refers to the length, encompassing all possible combinations of nucleotides ranging from 1, 2, <inline-formula id="inf34">
<mml:math id="m46">
<mml:mrow>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, up to that maximum length. For instance, if the nucleotide length is 6, then the DNA vocabulary includes nucleotide words that contain all combinations of nucleotides with lengths of 1, 2, 3, 4, 5, and 6.</p>
<sec id="s4-4-1">
<title>4.4.1 The impact of the number (length) of nucleotides in the DNA vocabulary library on the model</title>
<p>By changing the length of the nucleotide vocabulary in the DNA vocabulary while keeping the nucleotide length of the words in the label_words of the prompt verbalizer at 6, tests are conducted on all species across three categories (4mC, 5hmC, 6mA) with the nucleotide numbers (lengths) of individual words in the DNA vocabulary library being 2, 3, 4, 5, 6, 7, and 8. The test results show that, with the nucleotide length of the words in the label_words of the prompt verbalizer unchanged, the highest model accuracy is achieved when the number of nucleotides of individual words in the DNA vocabulary is 6. Taking the 4mC species as an example, the model&#x2019;s accuracy is illustrated in <xref ref-type="fig" rid="F6">Figure 6</xref>.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Impact of the number (length) of nucleotides in the DNA vocabulary library on the iDNA-OpenPrompt model.</p>
</caption>
<graphic xlink:href="fgene-15-1377285-g006.tif"/>
</fig>
</sec>
<sec id="s4-4-2">
<title>4.4.2 The impact of the number (length) of nucleotides in the label_words of the prompt verbalizer on the model</title>
<p>In this experiment, by changing the length of the nucleotide vocabulary in the label_words of the prompt verbalizer while keeping the nucleotide length of the words in the DNA vocabulary of prompt template at 6, tests are conducted on all species across three categories (4mC, 5hmC, 6mA) with the nucleotide numbers (lengths) of individual words in the label_words being 2, 3, 4, 5, 6, 7, and 8. The test results indicate that, with the nucleotide length of the words in the DNA vocabulary of the prompt template unchanged, the highest model accuracy is achieved when the number of nucleotides of individual words in the label_words of the prompt verbalizer is 6. Taking the 6mA_F.vesca species as an example, the model accuracy is illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Accuracy of the number (length) of nucleotides in the label_words of the prompt verbalizer on the iDNA-OpenPrompt model.</p>
</caption>
<graphic xlink:href="fgene-15-1377285-g007.tif"/>
</fig>
</sec>
<sec id="s4-4-3">
<title>4.4.3 The accuracy of simultaneously changing the DNA vocabulary library and label_words of the iDNA-OpenPrompt model</title>
<p>In this experiment, the extent of their impact on model performance is assessed by modifying the length of nucleotide vocabularies in both the DNA vocabulary of the prompt template and within the label_words of the prompt verbalizer. When the maximum length of nucleotide vocabularies in the DNA vocabulary and within the label_words is set to 2, 3, 4, 5, 6, and 7 for testing across multiple species within three methylation categories, the results reveal that the model&#x2019;s accuracy peaked when both the maximum nucleotide vocabulary length in the DNA vocabulary and the nucleotide length within the label_words are 6. The performance does not improve further when the lengths are extended to 7, and the risk of overfitting the model increases when both lengths reach 8. Taking the 6mA species as an example, the model&#x2019;s accuracy across various maximum lengths of nucleotide vocabularies in the DNA vocabulary and within the label_words of the prompt verbalizer is illustrated in <xref ref-type="fig" rid="F8">Figure 8</xref>.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Accuracy of simultaneously changing the DNA vocabulary library and label_words of the iDNA-OpenPrompt model.</p>
</caption>
<graphic xlink:href="fgene-15-1377285-g008.tif"/>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>The proposed iDNA-OpenPrompt model uses the innovative OpenPrompt learning approach and combines a prompt template, prompt verbalizer, and PLM to construct the prompt learning framework. Moreover, a DNA vocabulary library, BERT tokenizer, and specific label words are also introduced into the model to enable accurate identification of DNA methylation sites. An extensive analysis is conducted to evaluate the predictive capability, reliability, and consistency of the iDNA-OpenPrompt model. The experimental outcomes, covering 17 benchmark datasets that include various species and three distinct DNA methylation modifications, namely, 4mC, 5hmC, 6mA, consistently indicate that our model surpasses existing outstanding approaches regarding performance and robustness. A limitation of this model is that the DNA vocabulary in the prompt template is manually generated, and applying bioinformatics to other RNA sequences or other biological information sequences requires manual generation of their vocabularies anew. In future work, making vocabulary generation automatic and adaptable to other biological information sequences is a promising research direction.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. These data can be found at: <ext-link ext-link-type="uri" xlink:href="https://github.com/Yyxx-1987/iDNA-OpenPrompt/tree/master/iDNA-OpenPrompt">https://github.com/Yyxx-1987/iDNA-OpenPrompt/tree/master/iDNA-OpenPrompt</ext-link>.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>XY: methodology, software, validation, visualization, and writing&#x2013;original draft. JR: formal analysis, investigation, resources, writing&#x2013;review and editing, and conceptualization. HL: funding acquisition, methodology, validation, writing&#x2013;review and editing, and visualization. RZ: data curation, methodology, and writing&#x2013;review and editing. GZ: investigation, visualization, and writing&#x2013;review and editing. AB: writing&#x2013;review and editing. YC: data curation, investigation, methodology, and writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work is supported by the National Natural Science Foundation of China (No. 62302132, No. 62262016, No. 61961160706, No. 62262018, No. 62262019), 14th Five-Year Plan Civil Aerospace Technology Preliminary Research Project (D040405), the Hainan Provincial Natural Science Foundation of China (No. 823RC488, No. 623RC481, No. 620RC603, No. 721QN0890, No. 621MS038), the Program of Hainan Association for Science and Technology Plans to Youth R &#x26; D Innovation (QCQTXM202209), the Project supported by the Education Department of Hainan Province (Hnky2024-18).</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>scIMC: a platform for benchmarking comparison and visualization analysis of scRNA-seq data imputation methods</article-title>. <source>Nucleic Acids Res.</source> <volume>50</volume> (<issue>9</issue>), <fpage>4877</fpage>&#x2013;<lpage>4899</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkac317</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Devlin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>M. -W.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Toutanova</surname>
<given-names>K.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Bert: pre-training of deep bidirectional transformers for language understanding</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1810.04805">https://arxiv.org/abs/1810.04805</ext-link>
</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>H. -T.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Openprompt: an open-source framework for prompt-learning</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2111.01998">https://arxiv.org/abs/2111.01998</ext-link>
</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2111.01998</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Duong</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>TheAnh</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>N. Q. K.</given-names>
</name>
<name>
<surname>DinhMinh</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>YuYen</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>An extensive examination of discovering 5-Methylcytosine Sites in Genome-Wide DNA Promoters using machine learning based approaches</article-title>. <source>IEEE/ACM Trans. Comput. Biol. Bioinforma</source>. <pub-id pub-id-type="doi">10.1109/TCBB.2021.3082184</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Haitao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhiming</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>SNNRice6mA: a deep learning method for predicting DNA N6-methyladenine sites in rice genome</article-title>. <source>Front. Genet.</source> <volume>10</volume>, <fpage>1071</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2019.01071</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Huo</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Pre-trained models: past, present and future</article-title>. <source>AI Open</source> <volume>2</volume>, <fpage>225</fpage>&#x2013;<lpage>250</lpage>. <pub-id pub-id-type="doi">10.1016/j.aiopen.2021.08.002</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Ptr: prompt tuning with rules for text classification</article-title>. <source>AI Open</source> <volume>3</volume>, <fpage>182</fpage>&#x2013;<lpage>192</lpage>. <pub-id pub-id-type="doi">10.1016/j.aiopen.2022.11.003</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Haodong</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Peilin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhongming</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Deep4mC: systematic assessment and computational prediction for DNA N4-methylcytosine sites by deep learning</article-title>. <source>Briefings Bioinforma.</source> <volume>22</volume> (<issue>3</issue>). <pub-id pub-id-type="doi">10.1093/bib/bbaa099</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Knowledgeable prompt-tuning: incorporating knowledge into prompt verbalizer for text classification</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2108.02035">https://arxiv.org/abs/2108.02035</ext-link>
</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2108.02035</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<collab>iDNA-MS</collab> (<year>2020</year>). <source>iDNA-MS web server</source>.</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Junru</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yingying</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ruheng</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Chao</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Yi</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>iDNA-ABF: multi-scale deep biological language learning model for the interpretable prediction of DNA methylations</article-title>. <source>Genome Biol.</source> <volume>23</volume> (<issue>1</issue>), <fpage>219</fpage>. <pub-id pub-id-type="doi">10.1186/s13059-022-02780-1</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Juntao</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Quan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Jing</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>DeepM6ASeq-EL: prediction of human N6-methyladenosine (m6A) sites with LSTM and ensemble learning</article-title>. <source>Front. Comput. Sci.</source> <volume>16</volume> (<issue>2</issue>), <fpage>162302</fpage>. <pub-id pub-id-type="doi">10.1007/s11704-020-0180-0</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Khanal</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Nazari</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Tayara</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chong</surname>
<given-names>K. T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>4mCCNN: identification of N4-methylcytosine sites in prokaryotes using convolutional neural network</article-title>. <source>IEEE Access</source> <volume>7</volume>, <fpage>145455</fpage>&#x2013;<lpage>145461</lpage>. <pub-id pub-id-type="doi">10.1109/access.2019.2943169</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Lester</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Al-Rfou</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Constant</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>The power of scale for parameter-efficient prompt tuning</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2104.08691">https://arxiv.org/abs/2104.08691</ext-link>
</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2104.08691</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Leyi</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Shasha</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Eijy</surname>
<given-names>N. L. A.</given-names>
</name>
<name>
<surname>Ran</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Quan</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Exploring sequence-based features for the improved prediction of DNA N4-methylcytosine sites in multiple species</article-title>. <source>Bioinforma. Oxf. Engl.</source> <volume>35</volume> (<issue>8</issue>), <fpage>1326</fpage>&#x2013;<lpage>1333</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty824</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Duan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2023a</year>). <article-title>EpiTEAmDNA: sequence feature representation via transfer learning and ensemble learning for identifying multiple DNA epigenetic modification types across species</article-title>. <source>Comput. Biol. Med.</source> <volume>160</volume>, <fpage>107030</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107030</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2023b</year>). <article-title>M6A-BERT-Stacking: a tissue-specific predictor for identifying RNA N6-methyladenosine sites based on BERT and stacking strategy</article-title>. <source>Symmetry</source> <volume>15</volume> (<issue>3</issue>), <fpage>731</fpage>. <pub-id pub-id-type="doi">10.3390/sym15030731</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>X. L.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Prefix-tuning: optimizing continuous prompts for generation</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2101.00190">https://arxiv.org/abs/2101.00190</ext-link>
</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2101.00190</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Qian</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <source>GPT understands, too</source>. <publisher-name>AI Open</publisher-name>. <pub-id pub-id-type="doi">10.1016/j.aiopen.2023.08.012</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Logan IV</surname>
<given-names>R. L.</given-names>
</name>
<name>
<surname>Bala&#x17e;evi&#x107;</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Wallace</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Petroni</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Riedel</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Cutting down on prompts and parameters: simple few-shot learning with language models</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2106.13353">https://arxiv.org/abs/2106.13353</ext-link>
</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2106.13353</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lv</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Dao</surname>
<given-names>F. -Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Guan</surname>
<given-names>Z. -X.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>W.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>iDNA-MS: an integrated computational tool for detecting DNA modification sites in multiple genomes</article-title>. <source>Iscience</source> <volume>23</volume> (<issue>4</issue>), <fpage>100991</fpage>. <pub-id pub-id-type="doi">10.1016/j.isci.2020.100991</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Maegawa</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hinkal</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H. S.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2010</year>). <article-title>Widespread and tissue specific age-related DNA methylation changes in mice</article-title>. <source>Genome Res.</source> <volume>20</volume> (<issue>3</issue>), <fpage>332</fpage>&#x2013;<lpage>340</lpage>. <pub-id pub-id-type="doi">10.1101/gr.096826.109</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Manavalan</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Basith</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Shin</surname>
<given-names>T. H.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Meta-4mCpred: a sequence-based meta-predictor for accurate DNA 4mC site prediction using effective feature representation</article-title>. <source>Mol. Therapy-Nucleic Acids</source> <volume>16</volume>, <fpage>733</fpage>&#x2013;<lpage>744</lpage>. <pub-id pub-id-type="doi">10.1016/j.omtn.2019.04.019</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mehedi</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>Shaherin</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Shamima</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Gwang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Balachandran</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hiroyuki</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Meta-i6mA: an interspecies predictor for identifying DNA N6-methyladenine sites of plant genomes by exploiting informative features in an integrative machine-learning framework</article-title>. <source>Briefings Bioinforma.</source> <volume>22</volume> (<issue>3</issue>). <pub-id pub-id-type="doi">10.1093/bib/bbaa202</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pian</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>MM-6mAPred: identifying DNA N6-methyladenine sites based on Markov model</article-title>. <source>Bioinformatics</source> <volume>36</volume> (<issue>2</issue>), <fpage>388</fpage>&#x2013;<lpage>392</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btz556</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Quanzhong</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Jinxiang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yanze</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Shuqin</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Cangzhi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jiangning</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>DeepTorrent: a deep learning-based approach for predicting DNA N4-methylcytosine sites</article-title>. <source>Briefings Bioinforma.</source> <volume>22</volume> (<issue>3</issue>). <pub-id pub-id-type="doi">10.1093/bib/bbaa124</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Minghong</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Developing a multi-layer deep learning based predictive model to identify DNA N4-methylcytosine modifications</article-title>. <source>Front. Bioeng. Biotechnol.</source> <volume>8</volume>, <fpage>274</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2020.00274</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Schick</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Sch&#xfc;tze</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Exploiting cloze questions for few shot text classification and natural language inference</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2001.07676">https://arxiv.org/abs/2001.07676</ext-link>
</comment>. <pub-id pub-id-type="doi">10.48550/arXiv.2001.07676</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sho</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Mehedi</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>HongWen</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hiroyuki</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>BERT6mA: prediction of DNA N6-methyladenine site using deep learning-based approaches</article-title>. <source>Briefings Bioinforma.</source> <volume>23</volume> (<issue>2</issue>). <pub-id pub-id-type="doi">10.1093/bib/bbac053</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ur</surname>
<given-names>R. M.</given-names>
</name>
<name>
<surname>Hilal</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Quan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>To</surname>
<given-names>C. K.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>i6mA-Caps: a CapsuleNet-based framework for identifying DNA N6-methyladenine sites</article-title>. <source>Bioinformatics</source> <volume>38</volume> (<issue>16</issue>), <fpage>3885</fpage>&#x2013;<lpage>3891</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btac434</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>DeepBIO: an automated and interpretable deep-learning platform for high-throughput biological sequence prediction, functional annotation and visualization analysis</article-title>. <source>Nucleic Acids Res.</source> <volume>51</volume> (<issue>7</issue>), <fpage>3017</fpage>&#x2013;<lpage>3029</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkad055</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xia</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Yani</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Rao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Haixia</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Cuihua</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>DRSN4mCPred: accurately predicting sites of DNA N4-methylcytosine using deep residual shrinkage network for diagnosis and treatment of gastrointestinal cancer in the precision medicine era</article-title>. <source>Front. Med.</source> <volume>10</volume>, <fpage>1187430</fpage>. <pub-id pub-id-type="doi">10.3389/fmed.2023.1187430</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Jun</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Qianyue</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Taigang</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>BiLSTM-5mC: a bidirectional long short-term memory-based approach for predicting 5-methylcytosine sites in genome-wide DNA promoters</article-title>. <source>Molecules</source> <volume>26</volume> (<issue>24</issue>), <fpage>7414</fpage>. <pub-id pub-id-type="doi">10.3390/molecules26247414</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yehudit</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>DNA methylation dynamics in health and disease</article-title>. <source>Nat. Struct. Mol. Biol.</source> <volume>20</volume> (<issue>3</issue>), <fpage>274</fpage>&#x2013;<lpage>281</lpage>. <pub-id pub-id-type="doi">10.1038/nsmb.2518</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ying</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Yanan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zequn</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Niannian</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Jun</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jianjun</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Hyb4mC: a hybrid DNA2vec-based model for DNA N4-methylcytosine sites prediction</article-title>. <source>BMC Bioinforma.</source> <volume>23</volume> (<issue>1</issue>), <fpage>258</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-022-04789-6</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ying</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Jian</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xiaoyu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Xinxin</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jiangning</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Leveraging the attention mechanism to improve the identification of DNA N6-methyladenine sites</article-title>. <source>Briefings Bioinforma.</source> <volume>22</volume> (<issue>6</issue>), <fpage>bbab351</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbab351</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yingying</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wenjia</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Junru</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lizhen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Rao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Leyi</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>iDNA-ABT: advanced deep learning model for detecting DNA methylation with adaptive features and transductive information maximization</article-title>. <source>Bioinforma. Oxf. Engl.</source> <volume>37</volume> (<issue>24</issue>), <fpage>4603</fpage>&#x2013;<lpage>4610</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btab677</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zeng</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>6mAPred-MSFF: a deep learning model for predicting DNA N6-methyladenine sites across species based on a multi-scale feature fusion mechanism</article-title>. <source>Appl. Sci.</source> <volume>11</volume> (<issue>16</issue>), <fpage>7731</fpage>. <pub-id pub-id-type="doi">10.3390/app11167731</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wallace</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Klein</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Calibrate before use: improving few-shot performance of language models</article-title>,&#x201d; in <conf-name>International Conference on Machine Learning PMLR</conf-name>, <conf-loc>Virtual Event</conf-loc>, <conf-date>July 18-24, 2021</conf-date>. <pub-id pub-id-type="doi">10.48550/arXiv.2102.09690</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Qiang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Prompt-learning for short text classification</article-title>. <source>IEEE Trans. Knowl. Data Eng.</source>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/tkde.2023.3332787</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Xing</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Gene2vec: gene subsequence embedding for prediction of mammalian N-6-methyladenosine sites from mRNA</article-title>. <source>Rna</source> <volume>25</volume> (<issue>2</issue>), <fpage>205</fpage>&#x2013;<lpage>218</lpage>. <pub-id pub-id-type="doi">10.1261/rna.069112.118</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>