<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article article-type="methods-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1376486</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2024.1376486</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>ACP-DRL: an anticancer peptides recognition method based on deep representation learning</article-title>
<alt-title alt-title-type="left-running-head">Xu et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fgene.2024.1376486">10.3389/fgene.2024.1376486</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Xu</surname>
<given-names>Xiaofang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2385416/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Li</surname>
<given-names>Chaoran</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2600735/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Yuan</surname>
<given-names>Xinpu</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="author-notes" rid="fn001">
<sup>&#x2020;</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Qiangjian</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Yi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Zhu</surname>
<given-names>Yunping</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2073159/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Chen</surname>
<given-names>Tao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2598725/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>State Key Laboratory of Medical Proteomics</institution>, <institution>Beijing Proteome Research Center</institution>, <institution>National Center for Protein Sciences(Beijing)</institution>, <institution>Beijing Institute of Lifeomics</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Department of General Surgery</institution>, <institution>First Medical Center</institution>, <institution>Chinese PLA General Hospital</institution>, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Institute of Dataspace</institution>, <institution>Hefei Comprehensive National Science Center</institution>, <addr-line>Hefei</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1539905/overview">Zhuang Xiong</ext-link>, Fuzhou University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2648760/overview">Chao Liu</ext-link>, Beihang University, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/838718/overview">Chunhou Zheng</ext-link>, Anhui University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Tao Chen, <email>taochen1019@163.com</email>; Yunping Zhu, <email>zhuyunping@ncpsb.org.cn</email>
</corresp>
<fn fn-type="equal" id="fn001">
<label>
<sup>&#x2020;</sup>
</label>
<p>These authors share first authorship</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>09</day>
<month>04</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1376486</elocation-id>
<history>
<date date-type="received">
<day>25</day>
<month>01</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>25</day>
<month>03</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Xu, Li, Yuan, Zhang, Liu, Zhu and Chen.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Xu, Li, Yuan, Zhang, Liu, Zhu and Chen</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Cancer, a significant global public health issue, resulted in about 10 million deaths in 2022. Anticancer peptides (ACPs), as a category of bioactive peptides, have emerged as a focal point in clinical cancer research due to their potential to inhibit tumor cell proliferation with minimal side effects. However, the recognition of ACPs through wet-lab experiments still faces challenges of low efficiency and high cost. Our work proposes a recognition method for ACPs named ACP-DRL based on deep representation learning, to address the challenges associated with the recognition of ACPs in wet-lab experiments. ACP-DRL marks an initial exploration of integrating protein language models into ACPs recognition, employing in-domain further pre-training to enhance the development of deep representation learning. Simultaneously, it employs bidirectional long short-term memory networks to extract amino acid features from sequences. Consequently, ACP-DRL eliminates constraints on sequence length and the dependence on manual features, showcasing remarkable competitiveness in comparison with existing methods.</p>
</abstract>
<kwd-group>
<kwd>anticancer peptides</kwd>
<kwd>deep representation learning</kwd>
<kwd>BERT</kwd>
<kwd>self-supervised</kwd>
<kwd>pre-training</kwd>
<kwd>language models</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Statistical Genetics and Methodology</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Cancer is a major public health problem worldwide and one of the leading causes of death (<xref ref-type="bibr" rid="B20">Siegel et al., 2023</xref>). Typical treatment options to reduce the burden of cancer on human health involve surgery, radiotherapy, and/or systemic therapy. However, the toxicities associated with traditional treatment methods present considerable challenges for tolerability and adherence, making it difficult for patients to complete their prescribed treatment regimens (<xref ref-type="bibr" rid="B14">Mun et al., 2018</xref>). Therefore, the development of new anticancer drugs with higher efficacy, low resistance, and fewer adverse effects is necessary. Anticancer peptides (ACPs) potentially offer new perspectives for achieving this goal (<xref ref-type="bibr" rid="B7">Gabernet et al., 2016</xref>). Considering their intrinsic nature as cationic amphiphiles, ACPs exhibit unique, receptor-independent mechanisms. These peptides display an exceptional capacity to selectively target and eliminate cancer cells via folding-dependent membrane disruption (<xref ref-type="bibr" rid="B2">Aronson et al., 2018</xref>). On the one hand, ACPs therapy has been extensively researched and applied in preclinical and various stages of clinical trials against tumors (<xref ref-type="bibr" rid="B18">Pelliccia et al., 2019</xref>; <xref ref-type="bibr" rid="B12">Liu et al., 2024</xref>). On the other hand, the time-consuming and costly process of identifying ACPs through biological experiments, as well as the limited number of available ACPs, have hindered its development.</p>
<p>Fortunately, with the tremendous progress made in the field of machine learning over the past decades, the feasibility of employing computational methods to predict typical peptides has become a reality. As a result, various recognition methods for ACPs based on amino acid sequences have emerged, such as iACP (<xref ref-type="bibr" rid="B3">Chen et al., 2016</xref>), PEPred-Suite (<xref ref-type="bibr" rid="B30">Wei et al., 2019</xref>), ACPred-Fuse (<xref ref-type="bibr" rid="B19">Rao et al., 2020</xref>), iACP-DRLF (<xref ref-type="bibr" rid="B13">Lv et al., 2021</xref>), AntiCP 2.0 (<xref ref-type="bibr" rid="B1">Agrawal et al., 2021</xref>), ACP-check (<xref ref-type="bibr" rid="B33">Zhu et al., 2022</xref>) and ACP-BC (<xref ref-type="bibr" rid="B22">Sun et al., 2023</xref>). These recognition methods adopt diverse approaches to convert amino acid sequences into numerical representations and use machine learning algorithms to uncover patterns within these features. Among these methodologies, AntiCP 2.0 relies on common feature extraction techniques such as dipeptide composition and an ETree classifier model. In contrast, iACP-DRLF leverages two deep representation learning techniques alongside LGBM for refined feature selection. And ACP-check integrates a bidirectional long short-term memory (Bi-LSTM) network with a fully connected network, facilitating predictions based on both raw amino acid sequences and handcrafted features. ACP-BC is a three-channel end-to-end model, which employs data augmentation techniques, integrated in various combinations.</p>
<p>Despite the numerous informatics approaches proposed for ACPs recognition, there is still room for improvement. For instance, AntiCP 2.0 imposes a requirement on the target peptide sequence length to be between 4 and 50, while iACP-DRLF introduces a complex feature extraction strategy. More importantly, the scarcity of experimentally annotated datasets of ACPs significantly constrains the utilization and performance of machine learning. In light of these considerations, this study proposes ACP-DRL. ACP-DRL incorporates advanced language models that can efficiently utilize vast unlabelled datasets and extend sequence length through positional encoding, while Bi-LSTM operates without imposing restrictions on sequence length. In ACP-DRL, we have shifted our focus to deep representation learning, alleviating the scarcity of ACP datasets through the application of extensive unlabeled data. This allows predictions on longer sequences and reduces dependence on feature engineering based on expert knowledge. Simultaneously, in comparison with existing methods, ACP-DRL demonstrates exceptional performance.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 Datasets</title>
<p>To ensure a fair comparison, the main and alternate datasets supplied by AntiCP 2.0 (<xref ref-type="bibr" rid="B1">Agrawal et al., 2021</xref>) were employed in this research. These consolidated datasets incorporate data harvested from numerous databases including DADP (<xref ref-type="bibr" rid="B15">Novkovi&#x107; et al., 2012</xref>), CAMP (<xref ref-type="bibr" rid="B26">Waghu et al., 2014</xref>), APD (<xref ref-type="bibr" rid="B29">Wang and Wang, 2004</xref>), APD2 (<xref ref-type="bibr" rid="B28">Wang et al., 2009</xref>), CancerPPD (<xref ref-type="bibr" rid="B25">Tyagi et al., 2015</xref>), Uniprot (<xref ref-type="bibr" rid="B4">Consortium, 2015</xref>), and SwissProt (<xref ref-type="bibr" rid="B8">Gasteiger et al., 2001</xref>) databases. The positive dataset, enriched with experimentally validated ACPs, was derived from a conjoined compilation of the antimicrobial peptide (AMP) database and the CancerPPD database. In contrast, the main negative dataset consisted of AMPs lacking anticancer activity, sourced solely from the AMP database, while the alternative negative dataset encompassed random peptides extracted from proteins within the SwissProt database (<xref ref-type="bibr" rid="B8">Gasteiger et al., 2001</xref>). The main dataset includes 861 ACPs and an equal number of non-ACPs, while the alternate dataset holds a count of 970 for both ACPs and non-ACPs.</p>
<p>We additionally created an imbalanced dataset (comprising 845 ACPs and 3,800 non-ACPs) for five-fold cross-validation, which encompasses all data from both the main and alternate datasets. Additional sequence data was obtained from <xref ref-type="bibr" rid="B19">Rao et al. (2020)</xref>, and we used the CD-HIT algorithm to construct nonredundant sequences.</p>
<p>Furthermore, we collected approximately 1.5 million peptide sequences from PeptideAtlas (<xref ref-type="bibr" rid="B17">Omenn et al., 2022</xref>) as an unlabeled dataset for in-domain further pre-training of the protein language model.</p>
<p>We assessed the amino acid composition (AAC) of peptides and generated six sample sequence logos (<xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>) in the in-domain further pre-training dataset (IFPT), main, and alternate datasets. This was done to gain insights into the residue preferences at the N-terminus and C-terminus in these three datasets.</p>
<p>The result indicates that both the main and alternate datasets showed a high predominance of &#x2018;K&#x2019;, &#x2018;L&#x2019;, and &#x2018;A&#x2019; residues at the N-terminus, and &#x2018;K&#x2019; and &#x2018;L&#x2019; at the C-terminus (<xref ref-type="sec" rid="s10">Supplementary Figure S1A</xref>), consistent with previous studies (<xref ref-type="bibr" rid="B1">Agrawal et al., 2021</xref>). However, no particular amino acid type dominated at the N-terminus (<xref ref-type="sec" rid="s10">Supplementary Figure S1A</xref>) in the IFPT dataset, suggesting little to no conservation. As for the C-terminus (<xref ref-type="sec" rid="s10">Supplementary Figure S1B</xref>), it often concluded with either &#x2018;K&#x2019; or &#x2018;R&#x2019;, most likely influenced by specific enzyme cleavage sites, as the C-terminus is the end part to form during protein synthesis. The presence of amino acids such as lysine or arginine could have a significant impact on this cleavage process, with enzymes like trypsin specifically cleaving these, thereby affecting their prevalence at the C-termini. It can be discerned that the dataset used for in-domain further pre-training does not exhibit substantial similarity with the dataset utilized for anticancer peptide recognition.</p>
</sec>
<sec id="s2-2">
<title>2.2 Framework of ACP-DRL</title>
<p>As depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>, the framework of ACP-DRL consists of three main modules. Firstly, the initial section delineates the representation of peptide sequences. Secondly, the following section elucidates the further pre-training of the protein language model. Thirdly, the section explains the process of extracting peptide sequence features using a Bi-LSTM, and subsequently classifying these peptides based on the extracted features.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Framework of ACP-DRL. <bold>(A)</bold> Tokenized peptides representation. <bold>(B)</bold> Language model with in-domain further pre-training. <bold>(C)</bold> Fine-tuning layer and classifier.</p>
</caption>
<graphic xlink:href="fgene-15-1376486-g001.tif"/>
</fig>
<sec id="s2-2-1">
<title>2.2.1 Tokenized peptides representation</title>
<p>The initial section of <xref ref-type="fig" rid="F1">Figure 1</xref> illustrates a process in which peptide sequences are tokenized, which means that each amino acid is converted into its corresponding numerical ID. These IDs are subsequently used as inputs for our peptide language model. Within the vocabulary of our language model, a total of 26 tokens have been utilized. This includes five special tokens ([PAD] [UNK] [CLS] [SEP] [MASK]) and 20 tokens representing the standard amino acids in their abbreviated forms. Additionally, the token &#x201c;X&#x201d; has been specifically designated to denote non-generic or unresolved amino acids. This allocation of &#x201c;X&#x201d; facilitates the accommodation of non-standard amino acids, ultimately enhancing the model&#x2019;s adaptability and flexibility.</p>
</sec>
<sec id="s2-2-2">
<title>2.2.2 Language model with in-domain further pre-training</title>
<p>There is a perspective within the community that proteins can be represented by amino acids, and thus, they can be approximated as a unique form of natural language (<xref ref-type="bibr" rid="B16">Ofer et al., 2021</xref>). Recent academic research has further emphasized this perspective, with the release of numerous protein language models. <xref ref-type="bibr" rid="B5">Elnaggar et al. (2021)</xref> put forward the BERT-BFD model which was trained on the BFD (<xref ref-type="bibr" rid="B21">Steinegger et al., 2019</xref>) dataset composed of an impressive count of 2,122 million protein sequences. Concurrently, OntoProtein was put forth by <xref ref-type="bibr" rid="B32">Zhang et al. (2022)</xref>, employing the robust techniques of knowledge graphs and gene ontology. The aforementioned efforts have yielded excellent protein language models. Upon consideration, we selected OntoProtein as the foundational model and further conducted training based on our work.</p>
<p>Pre-training broadly involves the initial training of a model on a large dataset which enables it to acquire universal features. In-domain further pre-training signifies an added layer of refinement to the pre-trained model using task-relevant data within a specific field or operation. This additional step aims to bolster model performance within its designated tasks (<xref ref-type="bibr" rid="B9">Grambow et al., 2022</xref>).</p>
<p>In the context of our research, we collected and employed the IFPT dataset (about 1.5 million peptide sequences) to incrementally enhance OntoProtein to approximate the peptide level feature space more closely. Through this strategy, we proposed the OntoProtein within Peptides (OPP) model and could continuously obtain and train learnable deep representations during the training of downstream tasks.</p>
<p>The imperative behind this step is to facilitate OntoProtein&#x2019;s adaptability to the transition happening from protein sequences to peptide sequences. It is worth noting that, although OntoProtein jointly trains knowledge embedding (KE) and masked language modeling (MLM) tasks, during the in-domain further pre-training stage, we only trained the MLM task.</p>
<p>As shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, during the in-domain further pre-training stage, a subset of amino acids is masked, and the language model needs to predict the masked amino acids based on contextual information. This prompts the language model to learn the underlying information of peptide sequences. In our approach, each token (amino acid) has a 15% probability of being masked, and we use cross-entropy loss to estimate the predictions for these masked tokens. This process invokes the preparation of masked token inputs, conforming to the principles of masked language modeling. The distribution of these masked tokens adheres to a specific ratio: 80% are masked, 10% are replaced with random tokens, and the remaining 10% maintain their original identity.</p>
</sec>
<sec id="s2-2-3">
<title>2.2.3 Fine-tuning layer and classifier</title>
<p>BERT has demonstrated significant potential in the field of text classification, with researchers commonly acknowledging that the &#x201c;[CLS]&#x201d; token is expected to capture information from the entire sequence (<xref ref-type="bibr" rid="B23">Sun et al., 2019</xref>; <xref ref-type="bibr" rid="B27">Wang and Kuo, 2020</xref>). Consequently, in early classification tasks, researchers often relied solely on the information from the &#x201c;[CLS]&#x201d; token; however, this practice is not considered optimal (<xref ref-type="bibr" rid="B10">Jiang et al., 2021</xref>; <xref ref-type="bibr" rid="B11">Kim et al., 2021</xref>). In order to further extract sequence features from the peptides, we added an extra fine-tuning layer rather than connecting the &#x201c;[CLS]&#x201d; output directly to a fully connected layer. Bi-LSTM is particularly suitable for handling sequence data and can simultaneously capture both preceding and following contextual information.</p>
<p>The LSTM comprises four components: the forgetting gate <italic>f</italic>
<sub>
<italic>t</italic>
</sub>, the input gate <italic>i</italic>
<sub>
<italic>t</italic>
</sub>, the cell state <italic>C</italic>
<sub>
<italic>t</italic>
</sub>, and the output gate <italic>o</italic>
<sub>
<italic>t</italic>
</sub>. The forgetting gate <italic>f</italic>
<sub>
<italic>t</italic>
</sub> takes a value between 0 and 1. When an element of <italic>f</italic>
<sub>
<italic>t</italic>
</sub> is 0, it prevents the passage of the value from the previous cell state <italic>C</italic>
<sub>
<italic>t</italic>&#x2212;1</sub>, achieving selective forgetfulness. Meanwhile, the input gate <italic>i</italic>
<sub>
<italic>t</italic>
</sub> contributes information to the cell state <italic>C</italic>
<sub>
<italic>t</italic>
</sub>, thereby updating the information. This selective interplay of remembering and forgetting effectively addresses challenges such as gradient explosion, gradient disappearance, and distance-dependent issues commonly encountered in traditional RNNs. The whole process is as follows:<disp-formula id="e1">
<mml:math id="m1">
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m2">
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m3">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>tanh</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(3)</label>
</disp-formula>
<disp-formula id="e4">
<mml:math id="m4">
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x25e6;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x25e6;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mo>&#x303;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
<label>(4)</label>
</disp-formula>
<disp-formula id="e5">
<mml:math id="m5">
<mml:msub>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(5)</label>
</disp-formula>
<disp-formula id="e6">
<mml:math id="m6">
<mml:msub>
<mml:mrow>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>o</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x25e6;</mml:mo>
<mml:mi>tanh</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>In this study, we employed Bi-LSTM to extract contextual information. As illustrated in the third section of <xref ref-type="fig" rid="F1">Figure 1</xref>, our OPP model furnishes a high-dimensional encoding for each amino acid in peptide sequences. We sequentially input this into two LSTMs (forward and backward) and combined their state vectors to provide a feature vector for each peptide. After that, we utilize a fully connected layer and Softmax function for classification, with a default threshold of 0.5. If the probability of belonging to the positive class is greater than 0.5, the target peptide sequence is categorized as an ACP; otherwise, it is designated as a non-ACP.</p>
</sec>
</sec>
<sec id="s2-3">
<title>2.3 Performance evaluation</title>
<p>The evaluation in this study is conducted using four metrics, namely, accuracy (Acc), sensitivity (Sen), specificity (Spc) and Matthews correlation coefficient (MCC), which is in line with previous studies. The specific evaluation metrics are as follows:<disp-formula id="e7">
<mml:math id="m7">
<mml:mi>A</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(7)</label>
</disp-formula>
<disp-formula id="e8">
<mml:math id="m8">
<mml:mi>S</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(8)</label>
</disp-formula>
<disp-formula id="e9">
<mml:math id="m9">
<mml:mi>S</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(9)</label>
</disp-formula>
<disp-formula id="e10">
<mml:math id="m10">
<mml:mi>M</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#xd7;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
</sec>
</sec>
<sec sec-type="results|discussion" id="s3">
<title>3 Results and discussion</title>
<p>We ran ACP-DRL on a single node of the GPU cluster in the National Center for Protein Sciences (Beijing). During the training, we trained for 20 epochs with a learning rate of 2e-5 on 8&#xa0;T V100 GPUs, using adafactor as the optimizer, and adopted the cosine with restarts learning rate schedule. The batch size could be set to 32 for each training iteration. In this section, we commence with the evaluation of the language model, followed by an assessment of various fine-tuning layers. Finally, we compare results of ACP-DRL with existing methods.</p>
<sec id="s3-1">
<title>3.1 Evaluation of different language models</title>
<p>The development of Artificial Intelligence for Science has provided scholars with available protein language models. To assess the feasibility of current typical protein language models in ACPs recognition, we gathered randomly initialized BERT, BERT-BFD trained on 2,122 million protein sequences and OntoProtein which incorporates joint training with KE and GO, for training and evaluation. We designed a common vocabulary for these three models, encoding peptide sequences from the main dataset into each model, and subsequently employing a fully connected layer for classification on the encoded results. The evaluation results (<xref ref-type="fig" rid="F2">Figure 2</xref>) suggest that the three language models have similar performances in terms of sensitivity. Still, regarding specificity, the initialized BERT performs worse, which might contribute to its lower accuracy. BERT-BFD and OntoProtein, both of which have been pre-trained employing a substantial volume of protein sequences, demonstrate performances that are relatively equivalent. Overall, OntoProtein is slightly inferior to BERT-BFD in sensitivity but achieves advantages in accuracy, specificity and MCC, with the benefit in specificity being more pronounced. Furthermore, considering its slightly higher MCC than BERT-BFD, we propose that OntoProtein has more potential for the task at hand.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Evaluation of language models on main dataset.</p>
</caption>
<graphic xlink:href="fgene-15-1376486-g002.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Performance of different fine-tuning layers</title>
<p>We can obtain the encoding of each amino acid in a peptide sequence through language models. To further extract sequence features, we utilized OntoProtein as the base model and experimented with various fine-tuning layers on the main dataset. Specifically, the fully connected layer only utilized the encoding of the &#x201c;[CLS]&#x201d; token for classification, while Text-CNN, forward LSTM, and Bi-LSTM utilized the encoding information of the entire sequence. <xref ref-type="fig" rid="F3">Figure 3</xref> illustrates the experimental results under different fine-tuning layers. It can be observed that the effectiveness of using only the encoding of the &#x201c;[CLS]&#x201d; token for classification is not satisfactory, corroborating the findings of <xref ref-type="bibr" rid="B11">Kim et al. (2021)</xref> and <xref ref-type="bibr" rid="B10">Jiang et al. (2021)</xref>. The performance is somewhat improved with a simple forward LSTM, but a significant leap is observed when incorporating a backward LSTM. Text-CNN demonstrates a certain level of competitiveness in this task but falls short of Bi-LSTM, reaffirming our confidence in choosing Bi-LSTM as the fine-tuning layer.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Performance of different fine-tuning layers on main dataset.</p>
</caption>
<graphic xlink:href="fgene-15-1376486-g003.tif"/>
</fig>
</sec>
<sec id="s3-3">
<title>3.3 Evaluation of in-domain further pre-training</title>
<p>In-domain further pre-training is a primary approach for enhancing language models using in-domain additional datasets. We gathered a dataset comprising approximately 1.5 million peptide sequences to assess performance of OntoProtein in the peptide domain, ultimately obtaining the OPP model used for actual training. To better understand the distribution changes of feature information in language models, we employed t-Distributed Stochastic Neighbor Embedding (t-SNE) for the visualization of model features. We discussed three stages of the language model: a) the unpretrained BERT model, b) the OntoProtein model released by <xref ref-type="bibr" rid="B32">Zhang et al. (2022)</xref>, and c) the OPP model obtained through our additional pre-training. <xref ref-type="fig" rid="F4">Figures 4A, B</xref> present the t-SNE visualization results of the test sets from the main and alternate datasets at three stages. It can be observed that the initialized BERT exhibits a considerable overlap of points on both datasets, confirming the subpar testing results shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. This phenomenon may be attributed to the model&#x2019;s excessive parameter count compared to the small training dataset. OntoProtein and our OPP model demonstrate excellent performance in the t-SNE visualizations, displaying distinct sample clusters. On the main dataset, the ACPs of our OPP model are more clustered than those of OntoProtein, while on the alternate dataset, the OPP model has fewer mixed-in non-ACPs among its ACPs. Therefore, it is reasonable to conclude that the in-domain further pre-training strategy&#x2014;utilizing the IFPT dataset implemented in this study&#x2014;augments the model&#x2019;s performance on both the main and alternate datasets. 
This enhancement is observable notwithstanding the significant differences in amino acid composition and positional preference between the unlabeled IFPT dataset and the datasets used for downstream tasks, as depicted in <xref ref-type="sec" rid="s10">Supplementary Figure S1</xref>.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Visualization results of t-SNE for language models at different stages on <bold>(A)</bold> main dataset and <bold>(B)</bold> alternate dataset.</p>
</caption>
<graphic xlink:href="fgene-15-1376486-g004.tif"/>
</fig>
</sec>
<sec id="s3-4">
<title>3.4 Comparison with existing methods</title>
<sec id="s3-4-1">
<title>3.4.1 Evaluation on the main and alternate dataset</title>
<p>After confirming the superior performance of our OPP model, we adopted it to construct the ACP-DRL model. To evaluate the performance of the ACP-DRL model, we conducted a comparison with other machine learning or deep learning models, which include iACP, PEPred-Suite, ACPred-Fuse, AntiCP 2.0, iACP-DRLF, ACP-check, and ACP-BC. In this evaluation, we used the benchmark datasets (main and alternate datasets) proposed by AntiCP 2.0.</p>
<p>The original ACP-BC paper does not furnish performance metrics for our benchmark datasets. Hence, using their GitHub repository (<ext-link ext-link-type="uri" xlink:href="https://github.com/shunmengfan/ACP-BC">https://github.com/shunmengfan/ACP-BC</ext-link>) where their source code is available, we conducted experiments on our benchmark dataset using the best parameters stated in their paper. The optimal parameters deployed were: data augmentation factor (R) set to 1.0, LSTM hidden layer (C) with 256 nodes, number of neurons in the embedding layer (D) as 512, and a learning rate of 1e-3. Both ACP-check and iACP-DRLF adopted the metric values reported in their respective papers, and hence there are slight differences in the degree of precision. The precision of ACP-check is maintained at 1%, while iACP-DRLF maintains its precision to 0.1%. The performance of the remaining methods came from the metric values reported by AntiCP 2.0 after executing evaluations on the main and alternate datasets.</p>
<p>
<xref ref-type="table" rid="T1">Table 1</xref> and <xref ref-type="table" rid="T2">Table 2</xref> respectively display the performance on the main and alternate datasets, with the best performance for each metric highlighted in bold. As shown in <xref ref-type="table" rid="T1">Table 1</xref>, our model achieved the highest accuracy, specificity, and MCC on the main dataset, with a sensitivity close to that of ACP-check. The advantage is even more pronounced on the alternate dataset (<xref ref-type="table" rid="T2">Table 2</xref>), where our model reached an accuracy of 94.43%. Although our sensitivity was slightly lower than ACP-check, our specificity exceeded ACP-check by 3.64%. Overall, compared to existing advanced methods, the ACP-DRL model proposed in this study is highly competitive.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Comparison with existing methods on main dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center"/>
<th align="center">Acc(%)</th>
<th align="center">Sen(%)</th>
<th align="center">Spc (%)</th>
<th align="center">MCC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">ACP-DRL (Ours)</td>
<td align="center">
<bold>78.96</bold>
</td>
<td align="center">79.53</td>
<td align="center">
<bold>78.39</bold>
</td>
<td align="center">
<bold>0.56</bold>
</td>
</tr>
<tr>
<td align="center">ACP-BC</td>
<td align="center">75.16</td>
<td align="center">72.61</td>
<td align="center">77.71</td>
<td align="center">0.50</td>
</tr>
<tr>
<td align="center">ACP-check</td>
<td align="center">78</td>
<td align="center">80</td>
<td align="center">77</td>
<td align="center">0.56</td>
</tr>
<tr>
<td align="center">iACP-DRLF</td>
<td align="center">77.5</td>
<td align="center">
<bold>80.7</bold>
</td>
<td align="center">74.3</td>
<td align="center">0.55</td>
</tr>
<tr>
<td align="center">AntiCP 2.0</td>
<td align="center">75.43</td>
<td align="center">77.46</td>
<td align="center">73.41</td>
<td align="center">0.51</td>
</tr>
<tr>
<td align="center">ACPred-Fuse</td>
<td align="center">68.9</td>
<td align="center">69.19</td>
<td align="center">68.6</td>
<td align="center">0.38</td>
</tr>
<tr>
<td align="center">PEPred-Suite</td>
<td align="center">53.49</td>
<td align="center">33.14</td>
<td align="center">73.84</td>
<td align="center">0.08</td>
</tr>
<tr>
<td align="center">iACP</td>
<td align="center">55.10</td>
<td align="center">77.91</td>
<td align="center">32.16</td>
<td align="center">0.11</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The best performance for each metric is highlighted in bold.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Comparison with existing methods on alternate dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center"/>
<th align="center">Acc(%)</th>
<th align="center">Sen(%)</th>
<th align="center">Spc (%)</th>
<th align="center">MCC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">ACP-DRL (Ours)</td>
<td align="center">
<bold>94.43</bold>
</td>
<td align="center">92.22</td>
<td align="center">
<bold>96.64</bold>
</td>
<td align="center">
<bold>0.89</bold>
</td>
</tr>
<tr>
<td align="center">ACP-BC</td>
<td align="center">91.05</td>
<td align="center">92.14</td>
<td align="center">89.96</td>
<td align="center">0.82</td>
</tr>
<tr>
<td align="center">ACP-check</td>
<td align="center">93</td>
<td align="center">
<bold>93</bold>
</td>
<td align="center">93</td>
<td align="center">0.86</td>
</tr>
<tr>
<td align="center">iACP-DRLF</td>
<td align="center">93.0</td>
<td align="center">89.6</td>
<td align="center">96.4</td>
<td align="center">0.86</td>
</tr>
<tr>
<td align="center">AntiCP 2.0</td>
<td align="center">92.01</td>
<td align="center">92.27</td>
<td align="center">91.75</td>
<td align="center">0.84</td>
</tr>
<tr>
<td align="center">ACPred-Fuse</td>
<td align="center">78.87</td>
<td align="center">64.43</td>
<td align="center">93.3</td>
<td align="center">0.6</td>
</tr>
<tr>
<td align="center">PEPred-Suite</td>
<td align="center">57.47</td>
<td align="center">40.21</td>
<td align="center">74.74</td>
<td align="center">0.16</td>
</tr>
<tr>
<td align="center">iACP</td>
<td align="center">77.58</td>
<td align="center">78.35</td>
<td align="center">76.8</td>
<td align="center">0.55</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The best performance for each metric is highlighted in bold.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-4-2">
<title>3.4.2 Five-fold cross-validation on imbalanced dataset</title>
<p>To further illustrate the effectiveness of our model, we constructed an imbalanced dataset (comprising 845 ACPs and 3,800 non-ACPs) for a five-fold cross-validation. For this validation, we chose ACP-BC, the most recent model, and ACP-check, which offers competitive performance on main and alternate datasets, for comparisons.</p>
<p>We downloaded the source code for ACP-check from an open-source project (<ext-link ext-link-type="uri" xlink:href="https://github.com/ystillen/ACP-check">https://github.com/ystillen/ACP-check</ext-link>) and tailored a version for our five-fold cross-validation. For ACP-check, we chose the parameters best suited to the main dataset (lr &#x3d; 1e-3, batch size &#x3d; 50, epoch &#x3d; 30). For ACP-BC, we still referred to the previously mentioned code and optimal parameters.</p>
<p>In this cross-validation, we included two additional evaluation metrics&#x2014;Area Under the ROC Curve (AUC) and Area Under the Precision-Recall Curve (AUPR)&#x2014;to further assess the model. While AUC serves as a common indicator for classifying performance across different thresholds (with a score close to 1.0 indicating strong performance), AUPR focuses more on the performance of classifiers in circumstances with imbalanced positive and negative samples.</p>
<p>
<xref ref-type="sec" rid="s10">Supplementary Tables S1&#x2013;S3</xref> demonstrate the performance of the three models on the imbalanced dataset, while <xref ref-type="table" rid="T3">Table 3</xref> presents the average performance based on five-fold cross-validation, with the best results highlighted in bold. The results suggest that ACP-DRL has achieved top-tier performance across five evaluation metrics&#x2014;Acc, Spc, MCC, AUC, and AUPR.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Comparison of ACP-BC, ACP-check, and ACP-DRL in five-fold cross validation on an imbalanced dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center"/>
<th align="center">Acc(%)</th>
<th align="center">Sen(%)</th>
<th align="center">Spc (%)</th>
<th align="center">MCC</th>
<th align="center">AUC</th>
<th align="center">AUPR</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">ACP-BC</td>
<td align="center">88.53</td>
<td align="center">
<bold>64.58</bold>
</td>
<td align="center">93.87</td>
<td align="center">0.60</td>
<td align="center">0.89</td>
<td align="center">0.71</td>
</tr>
<tr>
<td align="center">ACP-check</td>
<td align="center">82.00</td>
<td align="center">49.22</td>
<td align="center">89.21</td>
<td align="center">0.40</td>
<td align="center">0.76</td>
<td align="center">0.44</td>
</tr>
<tr>
<td align="center">ACP-DRL (ours)</td>
<td align="center">
<bold>89.82</bold>
</td>
<td align="center">62.47</td>
<td align="center">
<bold>95.89</bold>
</td>
<td align="center">
<bold>0.64</bold>
</td>
<td align="center">
<bold>0.91</bold>
</td>
<td align="center">
<bold>0.78</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The best performance for each metric is highlighted in bold.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Although ACP-BC attained the highest score for sensitivity, ACP-DRL achieved similar results whilst surpassing ACP-BC in specificity. This may suggest that ACP-DRL adopts a more conservative approach when classifying positive instances, hence avoiding potential misidentifications of non-ACPs as ACPs (i.e., false positives). Perhaps due to the sensitivity of ACP-check to data distribution, it did not demonstrate competitive performance in this test.</p>
<p>Paired <italic>t</italic>-tests were conducted on the results of five-fold cross-validation (as shown in <xref ref-type="sec" rid="s10">Supplementary Table S4</xref>). The results indicated statistically significant differences in Spc, AUC, and AUPR metrics (<italic>p</italic> &#x3c; 0.05) and a marginal difference in Acc (<italic>p</italic> &#x3d; 0.05) when comparing our ACP-DRL model with ACP-BC. Meanwhile, when comparing with ACP-check, Acc, Sen, MCC, AUC, and AUPR all manifested significant differences (<italic>p</italic> &#x3c; 0.05).</p>
</sec>
</sec>
</sec>
<sec sec-type="conclusion" id="s4">
<title>4 Conclusion</title>
<p>In this work, we have proposed a novel ACPs recognition method called ACP-DRL. ACP-DRL enhances the existing protein language model using in-domain further pre-training technology to approximate the peptide level feature space more closely, continuously obtains and trains learnable deep representation during training of downstream tasks, and learns the features at the amino acid level through Bi-LSTM, which is combined with a fully connected layer to complete the recognition of ACPs. This design introduces the BERT-based protein large language model and further pre-training techniques into ACPs recognition for the first time, and eliminates constraints on sequence length and the dependence on manual features, showcasing remarkable competitiveness in comparison with existing methods. In recent years, methods for recognizing various functional peptides, such as MFTP (<xref ref-type="bibr" rid="B6">Fan et al., 2023</xref>), MLBP (<xref ref-type="bibr" rid="B24">Tang et al., 2022</xref>), and PrMFTP (<xref ref-type="bibr" rid="B31">Yan et al., 2022</xref>), have seen significant advancements. These methods universally use encoders to transition peptide sequences into vectors. Believing that our OPP model is notably adept at this encoding task, we plan to apply it to the research in recognizing multifunctional peptides next.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. Code and datasets can be found here: <ext-link ext-link-type="uri" xlink:href="https://github.com/shallFun4Learning/ACP-DRL">https://github.com/shallFun4Learning/ACP-DRL</ext-link>.</p>
</sec>
<sec id="s6">
<title>Author contributions</title>
<p>XX: Writing&#x2013;original draft, Writing&#x2013;review and editing. CL: Writing&#x2013;original draft, Writing&#x2013;review and editing. XY: Writing&#x2013;original draft, Writing&#x2013;review and editing. QZ: Data curation, Writing&#x2013;review and editing. YL: Data curation, Writing&#x2013;review and editing. YZ: Writing&#x2013;original draft, Writing&#x2013;review and editing. TC: Writing&#x2013;original draft, Writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s7">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported by the National Key Research and Development Program (2021YFA1301603) and the Open Fund of State Key Laboratory of Medical Proteomics (SKLP-O202207).</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2024.1376486/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2024.1376486/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.PDF" id="SM1" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Agrawal</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Bhagat</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Mahalwal</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Raghava</surname>
<given-names>G. P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>AntiCP 2.0: an updated model for predicting anticancer peptides</article-title>. <source>Briefings Bioinforma.</source> <volume>22</volume>, <fpage>bbaa153</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaa153</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Aronson</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Simonson</surname>
<given-names>A. W.</given-names>
</name>
<name>
<surname>Orchard</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Llin&#xe1;s</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Medina</surname>
<given-names>S. H.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Lipopeptisomes: anticancer peptide-assembled particles for fusolytic oncotherapy</article-title>. <source>Acta Biomater.</source> <volume>80</volume>, <fpage>269</fpage>&#x2013;<lpage>277</lpage>. <pub-id pub-id-type="doi">10.1016/j.actbio.2018.09.025</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chou</surname>
<given-names>K.-C.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>iACP: a sequence-based tool for identifying anticancer peptides</article-title>. <source>Oncotarget</source> <volume>7</volume>, <fpage>16895</fpage>&#x2013;<lpage>16909</lpage>. <pub-id pub-id-type="doi">10.18632/oncotarget.7815</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Consortium</surname>
<given-names>U.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>UniProt: a hub for protein information</article-title>. <source>Nucleic acids Res.</source> <volume>43</volume>, <fpage>D204</fpage>&#x2013;<lpage>D212</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gku989</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Elnaggar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Heinzinger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Dallago</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Rehawi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Jones</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>ProtTrans: towards cracking the language of life&#x2019;s code through self-supervised deep learning and high performance computing</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>43</volume>, <fpage>1</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1109/TPAMI.2019.2929146</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Deep learning-based multi-functional therapeutic peptides prediction with a multi-label focal dice loss function</article-title>. <source>Bioinformatics</source> <volume>39</volume>, <fpage>btad334</fpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btad334</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gabernet</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname>
<given-names>A. T.</given-names>
</name>
<name>
<surname>Hiss</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Schneider</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Membranolytic anticancer peptides</article-title>. <source>MedChemComm</source> <volume>7</volume>, <fpage>2232</fpage>&#x2013;<lpage>2245</lpage>. <pub-id pub-id-type="doi">10.1039/c6md00376a</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gasteiger</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Bairoch</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Swiss-Prot: connecting biomolecular knowledge via a protein database</article-title>. <source>Curr. issues Mol. Biol.</source> <volume>3</volume>, <fpage>47</fpage>&#x2013;<lpage>55</lpage>. <pub-id pub-id-type="doi">10.21775/cimb.003.047</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grambow</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Schaaf</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>In-domain pre-training improves clinical note generation from doctor-patient conversations</article-title>. <source>Proc. First Workshop Nat. Lang. Generation Healthc.</source>, <fpage>9</fpage>&#x2013;<lpage>22</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Xin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>How does BERT rerank passages? an attribution analysis with information bottlenecks</article-title>. <source>Proc. Fourth BlackboxNLP Workshop Anal. Interpreting Neural Netw. NLP</source>, <fpage>496</fpage>&#x2013;<lpage>509</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2021.blackboxnlp-1.39</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Yoo</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S.-g.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Self-guided contrastive learning for BERT sentence representations</article-title>,&#x201d; in <conf-name>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</conf-name>, <conf-date>August 1-6, 2021</conf-date>, <fpage>2528</fpage>&#x2013;<lpage>2540</lpage>.</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>From oncolytic peptides to oncolytic polymers: a new paradigm for oncotherapy</article-title>. <source>Bioact. Mater.</source> <volume>31</volume>, <fpage>206</fpage>&#x2013;<lpage>230</lpage>. <pub-id pub-id-type="doi">10.1016/j.bioactmat.2023.08.007</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lv</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Cui</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Anticancer peptides prediction with deep representation learning features</article-title>. <source>Briefings Bioinforma.</source> <volume>22</volume>, <fpage>bbab008</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbab008</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mun</surname>
<given-names>E. J.</given-names>
</name>
<name>
<surname>Babiker</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>Weinberg</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Kirson</surname>
<given-names>E. D.</given-names>
</name>
<name>
<surname>Von Hoff</surname>
<given-names>D. D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Tumor-treating fields: a fourth modality in cancer treatment</article-title>. <source>Clin. Cancer Res.</source> <volume>24</volume>, <fpage>266</fpage>&#x2013;<lpage>275</lpage>. <pub-id pub-id-type="doi">10.1158/1078-0432.CCR-17-1117</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Novkovi&#x107;</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Simuni&#x107;</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bojovi&#x107;</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Tossi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jureti&#x107;</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>DADP: the database of anuran defense peptides</article-title>. <source>Bioinformatics</source> <volume>28</volume>, <fpage>1406</fpage>&#x2013;<lpage>1407</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bts141</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ofer</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Brandes</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Linial</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>The language of proteins: NLP, machine learning and protein sequences</article-title>. <source>Comput. Struct. Biotechnol. J.</source> <volume>19</volume>, <fpage>1750</fpage>&#x2013;<lpage>1758</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2021.03.022</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Omenn</surname>
<given-names>G. S.</given-names>
</name>
<name>
<surname>Lane</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Overall</surname>
<given-names>C. M.</given-names>
</name>
<name>
<surname>Pineau</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Packer</surname>
<given-names>N. H.</given-names>
</name>
<name>
<surname>Cristea</surname>
<given-names>I. M.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>The 2022 report on the human proteome from the HUPO Human Proteome Project</article-title>. <source>J. Proteome Res.</source> <volume>22</volume>, <fpage>1024</fpage>&#x2013;<lpage>1042</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jproteome.2c00498</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pelliccia</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Amato</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Capasso</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Di Gaetano</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Massarotti</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Piccolo</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Bio-inspired dual-selective BCL-2/c-MYC G-quadruplex binders: design, synthesis, and anticancer activity of drug-like imidazo[2,1-i]purine derivatives</article-title>. <source>J. Med. Chem.</source> <volume>63</volume>, <fpage>2035</fpage>&#x2013;<lpage>2050</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jmedchem.9b00262</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>ACPred-Fuse: fusing multi-view information improves the prediction of anticancer peptides</article-title>. <source>Briefings Bioinforma.</source> <volume>21</volume>, <fpage>1846</fpage>&#x2013;<lpage>1855</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbz088</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Siegel</surname>
<given-names>R. L.</given-names>
</name>
<name>
<surname>Miller</surname>
<given-names>K. D.</given-names>
</name>
<name>
<surname>Wagle</surname>
<given-names>N. S.</given-names>
</name>
<name>
<surname>Jemal</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Cancer statistics, 2023</article-title>. <source>CA Cancer J. Clin.</source> <volume>73</volume>, <fpage>17</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.3322/caac.21763</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Steinegger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mirdita</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>S&#xf6;ding</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Protein-level assembly increases protein sequence recovery from metagenomic samples manyfold</article-title>. <source>Nat. Methods</source> <volume>16</volume>, <fpage>603</fpage>&#x2013;<lpage>606</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-019-0437-4</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Pang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>ACP-BC: a model for accurate identification of anticancer peptides based on fusion features of bidirectional long short-term memory and chemically derived information</article-title>. <source>Int. J. Mol. Sci.</source> <volume>24</volume>, <fpage>15447</fpage>. <pub-id pub-id-type="doi">10.3390/ijms242015447</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gan</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Patient knowledge distillation for BERT model compression</article-title>,&#x201d; in <conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing</conf-name>.</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Bin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>E.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Identifying multi-functional bioactive peptide functions using multi-label deep learning</article-title>. <source>Briefings Bioinforma.</source> <volume>23</volume>, <fpage>bbab414</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbab414</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tyagi</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tuknait</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Anand</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Gupta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sharma</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mathur</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2015</year>). <article-title>CancerPPD: a database of anticancer peptides and proteins</article-title>. <source>Nucleic Acids Res.</source> <volume>43</volume>, <fpage>D837</fpage>&#x2013;<lpage>D843</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gku892</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Waghu</surname>
<given-names>F. H.</given-names>
</name>
<name>
<surname>Gopi</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Barai</surname>
<given-names>R. S.</given-names>
</name>
<name>
<surname>Ramteke</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Nizami</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Idicula-Thomas</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>CAMP: collection of sequences and structures of antimicrobial peptides</article-title>. <source>Nucleic Acids Res.</source> <volume>42</volume>, <fpage>D1154</fpage>&#x2013;<lpage>D1158</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkt1157</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kuo</surname>
<given-names>C.-C. J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>SBERT-WK: a sentence embedding method by dissecting BERT-based word models</article-title>. <source>IEEE/ACM Trans. Audio, Speech, Lang. Process.</source> <volume>28</volume>, <fpage>2146</fpage>&#x2013;<lpage>2157</lpage>. <pub-id pub-id-type="doi">10.1109/taslp.2020.3008390</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>APD2: the updated antimicrobial peptide database and its application in peptide design</article-title>. <source>Nucleic Acids Res.</source> <volume>37</volume>, <fpage>D933</fpage>&#x2013;<lpage>D937</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkn823</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2004</year>). <article-title>APD: the antimicrobial peptide database</article-title>. <source>Nucleic Acids Res.</source> <volume>32</volume>, <fpage>D590</fpage>&#x2013;<lpage>D592</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkh025</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wei</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Su</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Q.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>PEPred-Suite: improved and robust prediction of therapeutic peptides using adaptive feature representation learning</article-title>. <source>Bioinformatics</source> <volume>35</volume>, <fpage>4272</fpage>&#x2013;<lpage>4280</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btz246</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yan</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Bin</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>PrMFTP: multi-functional therapeutic peptides prediction based on multi-head self-attention mechanism and class weight optimization</article-title>. <source>PLoS Comput. Biol.</source> <volume>18</volume>, <fpage>e1010511</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pcbi.1010511</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Bi</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). &#x201c;<article-title>OntoProtein: protein pretraining with gene ontology embedding</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations</conf-name>, <conf-date>April 25-29, 2022</conf-date>.</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>ACP-check: an anticancer peptide prediction model based on bidirectional long short-term memory and multi-features fusion strategy</article-title>. <source>Comput. Biol. Med.</source> <volume>148</volume>, <fpage>105868</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.105868</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>