<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="methods-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2026.1736391</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Methods</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Multi-feature fusion for gene prediction and functional peptide identification</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Ma</surname> <given-names>Chenjing</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Wei</surname> <given-names>Qianran</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Guohua</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Miao</surname> <given-names>Yan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<uri xlink:href="https://loop.frontiersin.org/people/1539796"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Yuan</surname> <given-names>Lei</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<uri xlink:href="https://loop.frontiersin.org/people/1187129"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Hepatobiliary Surgery, The Quzhou Affiliated Hospital of Wenzhou Medical University, Quzhou People&#x00027;s Hospital</institution>, <city>Quzhou, Zhejiang</city>, <country country="CN">China</country></aff>
<aff id="aff2"><label>2</label><institution>College of Computer and Control Engineering, Northeast Forestry University</institution>, <city>Harbin, Heilongjiang</city>, <country country="CN">China</country></aff>
<aff id="aff3"><label>3</label><institution>Faculty of Computing, Harbin Institute of Technology</institution>, <city>Harbin, Heilongjiang</city>, <country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Lei Yuan, <email xlink:href="mailto:senxiu99@163.com">senxiu99@163.com</email>; Yan Miao, <email xlink:href="mailto:miaoyan@nefu.edu.cn">miaoyan@nefu.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-06">
<day>06</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1736391</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>14</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>15</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Ma, Wei, Wang, Miao and Yuan.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Ma, Wei, Wang, Miao and Yuan</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-06">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Anticancer peptides (ACPs) have demonstrated potent antitumor activity and low toxicity, offering considerable potential in cancer therapeutics. Meanwhile, antimicrobial peptides (AMPs) serve as key components of the innate immune defense system. Owing to their broad-spectrum antimicrobial activity and low propensity for inducing resistance, AMPs have attracted considerable attention in the fields of infection control and immunotherapy. Accurate identification of ACPs and AMPs is critical for the discovery of novel therapeutic agents. However, wet-lab identification is often time-consuming, costly, and inefficient, falling short of the demands for high-throughput drug screening. Furthermore, existing computational methods exhibit limitations in feature representation and cross-task prediction capability. To address these challenges, a tool for functional peptide prediction is proposed, namely GP2FI, which consists of two sequential stages: a gene prediction model (MHA-preconv) and a functional peptide identification model (FuncPred-CB). MHA-preconv integrates CNNs with Transformer encoder layers to form a two-stage deep architecture, effectively capturing both local sequence patterns and long-range dependencies. Based on the coding regions identified by MHA-preconv, FuncPred-CB incorporates a pre-trained BERT language model to automatically extract contextual semantic features from amino acid sequences. Experimental results on multiple benchmark datasets demonstrate that MHA-preconv and GP2FI consistently outperform the state-of-the-art methods in terms of accuracy and other performance metrics. The code for the GP2FI can be found at <ext-link ext-link-type="uri" xlink:href="https://github.com/ma999-mxl/maLBX.git">https://github.com/ma999-mxl/maLBX.git</ext-link>.</p></abstract>
<kwd-group>
<kwd>ACP</kwd>
<kwd>AMPs</kwd>
<kwd>deep learning</kwd>
<kwd>functional peptide prediction</kwd>
<kwd>gene prediction</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the National Natural Science Foundation of China (NSFC) [62573111 and 62301139], the Heilongjiang Provincial Natural Science Foundation of China [JJ2025QC0185], the Quzhou municipal Science and Technology Project Foundation [2022K55], and the Zhejiang Provincial Natural Science Foundation of China [LTGY23H070004].</funding-statement>
</funding-group>
<counts>
<fig-count count="7"/>
<table-count count="5"/>
<equation-count count="1"/>
<ref-count count="44"/>
<page-count count="14"/>
<word-count count="8059"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Systems Microbiology</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Functional peptides, particularly anticancer peptides (ACPs) and antimicrobial peptides (AMPs), have emerged as prominent research topics in recent years due to their critical roles in cancer therapy and immune defense. ACPs exhibit remarkable anti-tumor potential by selectively targeting cancer cells through unique membrane-disruptive mechanisms. AMPs, on the other hand, are widely distributed in living organisms and possess broad-spectrum antimicrobial activity with low risk of resistance development. They have been extensively applied in medicine, food safety, and agriculture. However, the experimental identification of such functional peptides is time-consuming and costly, which significantly hinders their large-scale development and practical application.</p>
<p>With the rapid development of artificial intelligence technologies, sequence-based functional peptide prediction has emerged as a feasible and efficient alternative approach. This predictive process generally involves two key stages: first, deep learning methods are employed to accurately identify open reading frames (ORFs) from raw genomic sequences; second, the predicted gene sequences are translated into protein sequences, which are subsequently analyzed by machine learning or deep learning models to identify potential ACPs or AMPs. Therefore, constructing an end-to-end framework that integrates efficient gene prediction and functional peptide identification is of great importance for the discovery of novel functional peptides and the advancement of precision medicine and anti-infective therapeutics.</p>
<p>For gene prediction, a variety of algorithms have been proposed to identify ORFs with protein-coding potential in genomic sequences. These methods can generally be categorized into three main types: statistical learning-based, traditional machine learning-based, and deep learning-based approaches. Early tools such as Prodigal (<xref ref-type="bibr" rid="B16">Hyatt et al., 2010</xref>) employed Hidden Markov Models (HMMs) combined with statistical scoring schemes to rank and evaluate ORFs. While these approaches are computationally efficient, they often struggle to capture complex sequence patterns and long-range dependencies (<xref ref-type="bibr" rid="B21">Larsen and Krogh, 2003</xref>; <xref ref-type="bibr" rid="B9">Delcher et al., 2007</xref>; <xref ref-type="bibr" rid="B20">Kelley et al., 2011</xref>). Traditional machine learning-based methods, such as Orphelia (<xref ref-type="bibr" rid="B13">Hoff et al., 2009</xref>) and MGC (<xref ref-type="bibr" rid="B10">El Allali and Rose, 2013</xref>), integrated neural networks with discriminative classifiers. MetaGUN (<xref ref-type="bibr" rid="B24">Liu et al., 2013</xref>), MetaGeneAnnotator (<xref ref-type="bibr" rid="B29">Noguchi et al., 2008</xref>), and mRMR-SVM (<xref ref-type="bibr" rid="B6">Al-Ajlan and El Allali, 2018a</xref>) utilized Support Vector Machines (SVMs) for gene classification. FragGeneScan (<xref ref-type="bibr" rid="B33">Rho et al., 2010</xref>) combined sequencing error models with codon usage preferences, enhancing its robustness on low-quality data. 
Additionally, an m5C-Seq (<xref ref-type="bibr" rid="B4">Abbas et al., 2024</xref>) model&#x02014;an ensemble-learning approach for predicting RNA 5-methylcytosine modification sites&#x02014;and an ML (<xref ref-type="bibr" rid="B1">Abbas et al., 2025a</xref>) model designed for rare genetic diseases, which leverages machine learning to handle high-dimensional genomic data in a manner that deviates from the traditional single-gene prediction paradigm, have been introduced. Nevertheless, these methods still exhibit limitations in deep feature extraction and modeling of sequence-level dependencies. In response, a growing number of deep learning-based models have been established in recent years. Meta-MFDL (<xref ref-type="bibr" rid="B41">Zhang et al., 2017</xref>) employs a multi-layer stacked architecture for feature extraction and classification. CNN-MGP (<xref ref-type="bibr" rid="B7">Al-Ajlan and El Allali, 2018b</xref>) constructs a multi-branch CNN ensemble for gene prediction. CNN-RAI (<xref ref-type="bibr" rid="B19">Karag&#x000F6;z and Nalbantoglu, 2021</xref>) leverages k-mer features in a CNN framework. Although these models improve prediction accuracy, gene prediction commonly faces challenges such as short read lengths, incomplete sequences, and fragmentation, leading to loss of sequence information. Additionally, limitations in capturing global sequence dependencies further increase the difficulty of accurate gene identification.</p>
<p>For functional peptide identification, research has primarily focused on the independent prediction of two major categories: ACPs and AMPs. For ACP prediction, various models have been developed by transforming peptide sequences into numerical representations and applying classification algorithms. Representative methods include ACP-DRL (<xref ref-type="bibr" rid="B39">Xu et al., 2024</xref>), PEPred-Suite (<xref ref-type="bibr" rid="B38">Wei et al., 2019</xref>), ACPred-Fuse (<xref ref-type="bibr" rid="B32">Rao et al., 2019</xref>), iACP-DRLF (<xref ref-type="bibr" rid="B26">Lv et al., 2021</xref>), AntiCP2.0 (<xref ref-type="bibr" rid="B5">Agrawal et al., 2020</xref>), ACP-check (<xref ref-type="bibr" rid="B44">Zhu et al., 2022</xref>), and ACP-BC (<xref ref-type="bibr" rid="B34">Sun et al., 2023</xref>), which utilize dipeptide composition, deep representation learning, Bi-LSTM architectures, or multi-channel data augmentation strategies for modeling. In the field of AMP prediction, the construction of large-scale AMP databases such as CAMP (<xref ref-type="bibr" rid="B36">Waghu et al., 2015</xref>), APD3 (<xref ref-type="bibr" rid="B37">Wang, 2004</xref>), dbAMP (<xref ref-type="bibr" rid="B17">Jhong et al., 2018</xref>), and DRAMP 2.0 (<xref ref-type="bibr" rid="B18">Kang et al., 2019</xref>) has provided essential data resources for computational model development. Existing tools include CS-AMPPred (<xref ref-type="bibr" rid="B30">Porto et al., 2012</xref>) (SVM-based classification), PEP-FOLD (<xref ref-type="bibr" rid="B8">Bhadra et al., 2018</xref>) (random forest models), Ensemble-AMPPred (<xref ref-type="bibr" rid="B22">Lertampaiporn et al., 2021</xref>), and AMPpred-EL (<xref ref-type="bibr" rid="B25">Lv et al., 2022</xref>) (ensemble learning approaches). 
Recently, deep learning-based models such as AMPScanner (<xref ref-type="bibr" rid="B35">Veltri et al., 2018</xref>), BERT-AMP (<xref ref-type="bibr" rid="B27">Ma et al., 2022</xref>), sAMPpred-GAT (<xref ref-type="bibr" rid="B40">Yan et al., 2022</xref>), and AMPpred-MFA (<xref ref-type="bibr" rid="B23">Li et al., 2023</xref>) have demonstrated excellent performance in AMP identification tasks. However, most of these approaches suffer from several limitations, including data scarcity, sequence length constraints, limited semantic representation capability, and task specificity. Notably, they generally support only single-task classification and lack a unified framework for predicting multiple functional peptide types such as ACPs and AMPs. Recently, AI models (<xref ref-type="bibr" rid="B2">Abbas et al., 2025b</xref>) applicable to intelligent healthcare have also begun to emerge. These models are task-specific, designed to fulfill well-defined functions, yet they can only handle the designated task and are unable to adapt to different types of problems.</p>
<p>To overcome these limitations and further enhance the accuracy of ORF prediction and functional peptide identification, a novel deep learning-based gene prediction and functional peptide identification method, namely GP2FI, is proposed. It consists of two components: a gene prediction model, MHA-preconv, and a peptide prediction model, FuncPred-CB. MHA-preconv first extracts candidate ORFs from raw genomic sequences and encodes them with a set of features. It then adopts a two-stage deep learning architecture to integrate both local and global sequence features. FuncPred-CB is a peptide prediction model, which integrates a pre-trained BERT language model with a dual-channel CNN&#x02013;BiLSTM architecture to effectively handle longer sequences and reduce reliance on manual feature engineering. It is capable of simultaneously predicting ACPs and AMPs within a single unified framework, significantly enhancing the adaptability and accuracy of functional peptide recognition across diverse tasks.</p>
<p>The performance of the MHA-preconv model was evaluated against five widely used tools across multiple genomic datasets. MHA-preconv achieved a gene prediction accuracy of 0.98, outperforming Prodigal (<xref ref-type="bibr" rid="B16">Hyatt et al., 2010</xref>) by 0.02, Orphelia (<xref ref-type="bibr" rid="B13">Hoff et al., 2009</xref>) by 0.13, FragGeneScan (<xref ref-type="bibr" rid="B33">Rho et al., 2010</xref>) by 0.15, Tiberius (<xref ref-type="bibr" rid="B11">Gabriel et al., 2024</xref>) by 0.07, and Helixer (<xref ref-type="bibr" rid="B15">Holst et al., 2025</xref>) by 0.08. The FuncPred-CB model was applied to multiple ACP and AMP datasets and systematically compared with state-of-the-art prediction methods. FuncPred-CB achieved a maximum accuracy of 0.93 in ACP prediction, exceeding that of ACP-DRL (<xref ref-type="bibr" rid="B39">Xu et al., 2024</xref>) by 0.02, ACP-check (<xref ref-type="bibr" rid="B44">Zhu et al., 2022</xref>) by 0.15, iACP-DRLF (<xref ref-type="bibr" rid="B26">Lv et al., 2021</xref>) by 0.12, and AntiCP2.0 (<xref ref-type="bibr" rid="B5">Agrawal et al., 2020</xref>) by 0.22. FuncPred-CB also achieved a competitive accuracy of 0.96 and the highest AUC of 0.99 in AMP prediction, with its accuracy surpassing the deep stacked model AMPpred-MFA (<xref ref-type="bibr" rid="B23">Li et al., 2023</xref>) by 0.0113. These experimental results demonstrate the superior performance of GP2FI. MHA-preconv achieves higher accuracy and enhanced sequence modeling capability in gene prediction, while FuncPred-CB balances precision and task generalization in unified ACP and AMP identification, providing a powerful computational foundation for functional peptide discovery and downstream bio-activity research.</p></sec>
<sec id="s2">
<label>2</label>
<title>Methods</title>
<p>GP2FI is a two-stage deep learning framework for gene prediction and functional peptide identification. It consists of two stages: a gene prediction model (MHA-preconv) and a functional peptide identification model (FuncPred-CB). MHA-preconv integrates CNNs with Transformer encoder layers to form a two-stage deep architecture, effectively capturing both local sequence patterns and long-range dependencies within ORF sequences. Based on the coding regions identified by MHA-preconv, FuncPred-CB incorporates a pre-trained BERT language model to automatically extract contextual semantic features from amino acid sequences. It also adopts a dual-channel feature extraction mechanism combining CNN and Bi-LSTM, enabling it to simultaneously capture local structural features and global dependencies. Final classification is performed using a multi-layer perceptron. The overall workflow of GP2FI is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Architecture of the GP2FI framework. <bold>(1)</bold> MHA-preconv model: (a) Feature extraction, (b) Feature fusion, (c) CNN model, and (d) Transformer encoder model. <bold>(2)</bold> FuncPred-CB model: (a) Peptide sequence tokenization and domain-specific pretrained language model, (b) CNN model, (c) Bi-LSTM model, and (d) MLP model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-17-1736391-g0001.tif">
<alt-text content-type="machine-generated">Diagram illustrating two workflows for DNA and protein sequence analysis. The left side shows an DNA sequence analysis pipeline, including feature extraction with open reading frame identification, fusion of features, CNN processing, and a transformer encoder. The right side depicts a protein sequence analysis process, including peptide tokenization using a domain-specific pre-trained model, CNN and BI-LSTM processing, and MLP for final feature fusion. Both workflows involve layers of neural networks and specific sequence handling techniques.</alt-text>
</graphic>
</fig>
<p>MHA-preconv comprises four main stages: <italic>feature extraction, multi-feature fusion, CNN model</italic>, and <italic>Transformer encoder model</italic>. First, all candidate ORFs are extracted from the genomic sequence and encoded using one-hot encoding, with each sequence standardized to a fixed length of 700 base pairs. In parallel, six handcrafted features are computed. Then the one-hot encoded sequences and handcrafted features are jointly input into the CNN module to extract local feature patterns. The resulting feature maps are flattened and passed into a Transformer encoder to capture long-range dependencies and global contextual relationships. Finally, a fully connected layer followed by a softmax layer outputs the probability of each ORF being a protein-coding region.</p>
<p>FuncPred-CB also consists of four main stages: <italic>Peptide sequence tokenization and domain-specific pretrained language model, CNN model, Bi-LSTM model</italic>, and <italic>MLP model</italic>. First, the protein sequences translated from MHA-preconv are tokenized, where each amino acid is mapped to a corresponding token ID. The tokenized sequence is then fed into a pretrained BERT language model to obtain contextual semantic embeddings for each residue. The output of BERT is subsequently passed through two parallel channels: a CNN channel for capturing local structural patterns, and a Bi-LSTM channel for modeling long-range dependencies within the sequence. The outputs from both channels are concatenated along the feature dimension and fed into a multi-layer perceptron (MLP) for classification, enabling unified prediction of both ACPs and AMPs.</p>
<sec>
<label>2.1</label>
<title>Feature extraction</title>
<p>Each ORF is characterized by six types of effective features, as detailed in <xref ref-type="supplementary-material" rid="SM1">Supplementary File S1.1</xref>, including <italic>monocodon usage, dicodon usage, translation initiation site (TIS), ORF length, GC content</italic>, and <italic>basic nucleotide composition</italic>. These features are designed to enhance the discriminative power of the model in identifying protein-coding regions. In general, ORFs can be categorized as either complete or incomplete. A complete ORF is defined as one that contains both a start codon (<monospace>ATG</monospace>, <monospace>CTG</monospace>, <monospace>GTG</monospace>, or <monospace>TTG</monospace>) and a stop codon (<monospace>TAG</monospace>, <monospace>TGA</monospace>, or <monospace>TAA</monospace>). In contrast, incomplete ORFs lack upstream or downstream regions, or both. In cases where both ends are truncated, the ORF spans the entire sequence fragment without any identifiable start or stop codon. A complete prokaryotic gene, as illustrated in <xref ref-type="fig" rid="F2">Figure 2</xref>, typically begins at the 5&#x02032; promoter region and ends at the 3&#x02032; terminator region. Transcription occurs between the transcription start site and the transcription termination site, encompassing the 5&#x02032; untranslated region (5&#x02032; UTR), the ORF, and the 3&#x02032; untranslated region (3&#x02032; UTR), with only the ORF being translated into protein. Given that the translation initiation site can be located up to 30 base pairs upstream of the canonical start codon, the ORF start offset was set to 30 bp in the search procedure (<xref ref-type="bibr" rid="B14">Hoff et al., 2008</xref>). During both training and testing phases, only ORFs with a minimum length of 60 base pairs were considered to ensure reliability.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Structure of a prokaryotic open reading frame (ORF).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-17-1736391-g0002.tif">
<alt-text content-type="machine-generated">Diagram of a gene structure showing regions from left to right: Promoter with TATAAT boxes, 5' UTR, initiation codon, CDS (coding sequence), termination codon, 3' UTR, and Terminator. Conserved sequence: TTGACA.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>2.2</label>
<title>Multi-feature fusion</title>
<p>The fixed-length ORFs are subjected to one-hot encoding, where each nucleotide is represented by a one-hot vector, so that each ORF of length <italic>L</italic> is represented as an <italic>L</italic>&#x000D7;4 matrix. The encoded ORF and the manually extracted six features are then fused as input for further processing by the subsequent CNN and Transformer encoder layers. The entire feature set, encompassing the encoded ORF and the six features, is concatenated into a one-dimensional feature vector to represent the input sequence fragment, expressed as:</p>
<disp-formula id="E1"><mml:math id="M1"><mml:mrow><mml:mi>X</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi><mml:mi>R</mml:mi><mml:mi>F</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>M</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>D</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>I</mml:mi><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>O</mml:mi><mml:mi>R</mml:mi><mml:msub><mml:mrow><mml:mi>F</mml:mi></mml:mrow><mml:mrow><mml:mi>L</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>X</mml:mi></mml:mrow><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi><mml:mi>C</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>where <italic>X</italic><sub><italic>ORF</italic></sub>, <italic>X</italic><sub><italic>MC</italic></sub>, <italic>X</italic><sub><italic>DC</italic></sub>, <italic>X</italic><sub><italic>TIS</italic></sub>, <italic>X</italic><sub><italic>OR</italic><sub><italic>F</italic></sub><sub><italic>L</italic></sub></sub>, <italic>X</italic><sub><italic>GC</italic></sub>, and <italic>X</italic><sub><italic>baseC</italic></sub> represent the feature extraction vectors mentioned above.</p>
</sec>
<sec>
<label>2.3</label>
<title>Feature extraction for gene detection</title>
<sec>
<label>2.3.1</label>
<title>CNN model</title>
<p>A CNN model pre-trained on 10 mutually exclusive datasets, each constructed based on predefined GC content ranges, was employed. The concatenated one-dimensional array obtained from multi-feature fusion is input into the appropriate pre-trained CNN model, which is then fine-tuned using our target dataset. The final CNN architecture consists of six layers. The first layer is a convolutional layer with 64 filters and a filter window size of 3. The second layer is a max-pooling layer with a pool size of 2. The third layer is another convolutional layer with 200 filters and a filter window size of 3. The fourth layer is a second max-pooling layer with a pool size of 2. This is followed by a dropout layer to mitigate overfitting. The output from the convolutional layers is then flattened into a one-dimensional vector and is fed into the first fully connected layer, where the dimensionality is reduced from 35,000 to 4,096.</p></sec>
<sec>
<label>2.3.2</label>
<title>Transformer encoder model</title>
<p>To incorporate global contextual information, a Transformer encoder model was added after the CNN model, allowing long-distance relationships within the sequence to be considered while preserving sequential information. The output from the CNN layers is fed into the Transformer encoder with an 8-head attention mechanism to extract global contextual dependencies across the entire ORF sequence. The output is then passed through a flattening layer to convert the multi-dimensional attention output into a one-dimensional vector. This vector is subsequently fed into a fully connected layer with an input dimension of 4,096 and an output dimension of 128, followed by a dropout layer with a dropout rate of 0.2. The resulting vector is then passed into another fully connected layer with 128 neurons, producing a single scalar output. A <monospace>Sigmoid</monospace> activation function is applied to obtain the probability that an ORF encodes a protein-coding gene. As a post-processing step, a greedy algorithm (<xref ref-type="bibr" rid="B43">Zhou and Troyanskaya, 2015</xref>) is applied to ensure that only one gene is retained among overlapping predictions. The candidate ORF with the highest probability score is selected, and any other ORF overlapping more than 60 base pairs with it is discarded. The final set of predicted genes is then produced.</p>
<p>During model training, the binary cross-entropy loss (<xref ref-type="bibr" rid="B14">Hoff et al., 2008</xref>) was used to compute the error between predicted probabilities and ground-truth labels. The model was trained with a batch size of 32 using the Adam optimizer, with a learning rate of 0.001. Multiple hyperparameter configurations were explored to optimize performance.</p>
</sec>
</sec>
<sec>
<label>2.4</label>
<title>Peptide sequence tokenization and domain-specific pre-trained language model</title>
<p>The FuncPred-CB model for functional peptide identification begins by tokenizing peptide sequences, converting each amino acid into its corresponding numerical ID as input to the language model. The vocabulary comprises 26 tokens, including the single-letter codes of the 20 standard amino acids, an unknown residue represented by <monospace>X</monospace>, and special tokens such as <monospace>[CLS]</monospace> and <monospace>[SEP]</monospace>. Subsequently, a BERT-based protein language model pre-trained in the ACP-DRL framework is employed to map each amino acid to a vector representation enriched with contextual semantics. Trained on large-scale protein sequence databases using a masked language modeling (MLM) strategy, this model demonstrates strong capabilities in biological sequence modeling and semantic representation. By incorporating BERT-derived contextual embeddings, the model is better equipped to capture critical patterns and long-range dependencies within functional peptides, thereby enhancing the performance of downstream classification tasks.</p>
</sec>
<sec>
<label>2.5</label>
<title>Feature extraction for functional peptide identification</title>
<sec>
<label>2.5.1</label>
<title>CNN model</title>
<p>A dual-channel feature extraction architecture based on the pre-trained language model BERT is employed. The sequence representations output by BERT are embedded as high-dimensional, context-sensitive vectors and fed into one of the CNN channels to extract both local structural features and global dependencies. In the CNN channel, the BERT output is transposed to match the input format required by one-dimensional convolution, and then passed through three consecutive convolutional layers. Each layer uses a kernel size of 3 and contains 256, 128, and 64 filters, respectively. After ReLU activation, average pooling is applied to extract stable local pattern features.</p></sec>
<sec>
<label>2.5.2</label>
<title>Bi-LSTM model</title>
<p>The other feature extraction channel, Bi-LSTM, processes the original sequence outputs from BERT to capture long-range dependencies within the sequence through a Bi-LSTM network. The final hidden state at the last time step is taken as the global representation of the peptide sequence. The features obtained from both CNN and Bi-LSTM channels are concatenated and passed through a batch normalization layer before being fed into a three-layer MLP for classification. This architecture effectively integrates the strengths of CNNs in capturing local structural features with the contextual modeling capabilities of Bi-LSTM, thereby enhancing the overall performance of functional peptide identification. Besides, we further conducted statistical and visual analyses of amino acid composition to explore underlying data characteristics and enhance model interpretability. Specifically, we analyzed the frequency distribution of the 20 standard amino acids in the ACP and AMP datasets. Detailed analysis is provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary File S3.2</xref>.</p>
<p>During the training phase, binary cross-entropy loss was used as the objective function. The model was optimized using the Adafactor optimizer with an initial learning rate of 2 &#x000D7; 10<sup>&#x02212;5</sup>. Training was conducted for 20 epochs with a batch size of 4. Early stopping was applied to prevent overfitting, and evaluation metrics including accuracy, F1 score, and Matthews correlation coefficient (MCC) were recorded after each epoch. Comprehensive evaluation metrics are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary File S2</xref>.</p>
</sec>
</sec>
</sec>
<sec sec-type="results" id="s3">
<label>3</label>
<title>Results</title>
<sec>
<label>3.1</label>
<title>Datasets</title>
<p>MHA-preconv was trained and evaluated using four datasets. Dataset_1 contains 164 complete genomes (including bacteria and archaea) and is used for training and validation, with the data split into training and testing sets at a 7:3 ratio. Dataset_2 consists of 10 complete genomes for model tuning. Dataset_3 includes complete genomes from 9 independent species and is used for independent testing. Dataset_4 encompasses 100 newly collected genomes covering broad taxonomic diversity (including Gram-negative bacteria, Staphylococcus spp., etc.) and was divided into five equal subsets for incremental testing. Stratified sampling was applied to both training and test splits to ensure equal expected numbers of positive and negative instances in every mini-batch, preventing the majority class (NCS) from overwhelming the minority class (CDS). In addition, a weighted random-sampling strategy was employed during training to oversample the minority class and further alleviate class bias. All genomic sequences and annotations were downloaded from GenBank (<ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/genbank/">https://www.ncbi.nlm.nih.gov/genbank/</ext-link>) and NCBI RefSeq (<ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/refseq/">https://www.ncbi.nlm.nih.gov/refseq/</ext-link>). The databases used and detailed dataset-construction procedures are described in <xref ref-type="supplementary-material" rid="SM1">Supplementary File S3.1</xref>.</p>
<p>FuncPred-CB was trained and evaluated on two task-specific datasets. For the ACP task, Dataset_1 contains 970 ACPs and 970 non-ACPs, while Dataset_2 includes 861 ACPs and an equal number of non-ACPs. Positive sequences were collected from the AMP and CancerPPD databases and experimentally verified to possess anticancer activity; negatives consist of AMPs without anticancer activity and random peptides extracted from Swiss-Prot. For the AMP task, the dataset contains 10,322 non-redundant AMP sequences whose antimicrobial activity has been experimentally validated, together with 3,029,894 non-AMP sequences. To avoid potential bias, any peptides known to exhibit anticancer activity were excluded from the AMP-positive class, ensuring that positives possess only antimicrobial activity. To mitigate distribution shifts that could arise from random splitting, stratified sampling was applied and a fixed random seed (seed = 702) was set to guarantee reproducibility. Training and test sets are stored in separate physical files to prevent data leakage at the source; the test set is used exclusively for final evaluation and is never involved in model development or hyper-parameter tuning. Class imbalance is addressed by weighted random oversampling of the minority class during training. All datasets were randomly split into training and test sets at an 8:2 ratio. The databases used and detailed dataset-construction procedures are described in <xref ref-type="supplementary-material" rid="SM1">Supplementary File S3.1</xref>.</p>
</sec>
<sec>
<label>3.2</label>
<title>Performance of MHA-preconv on the gene dataset Dataset_1 and of FuncPred-CB on the ACP dataset Dataset_1 and the AMP dataset Dataset_3</title>
<p>Comprehensive training and testing were performed for gene finding, ACP and AMP tasks with three dedicated models: MHA-preconv, FuncPred-CB-ACP and FuncPred-CB-AMP. MHA-preconv was evaluated on the gene-prediction portion of Dataset_1, whereas FuncPred-CB was assessed on the functional-peptide portions&#x02014;ACP Dataset_1 and AMP Dataset_3. After 48 epochs, MHA-preconv achieved 98% test accuracy; FuncPred-CB reached 92% and 96% test accuracy on the ACP and AMP datasets, respectively, within 10 epochs. The results are displayed in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Comparative performance metrics of MHA-preconv and FuncPred-CB on test datasets.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-17-1736391-g0003.tif">
<alt-text content-type="machine-generated">Line chart comparing the performance of three models: MHA-preconv, FuncPred-CB-ACP, and FuncPred-CB-AMP across four metrics. MHA-preconv (blue) shows consistently high scores above 0.95. FuncPred-CB-ACP (orange) scores range from 0.840 to 0.920. FuncPred-CB-AMP (green) performs similarly to MHA-preconv. Metrics on the x-axis include test accuracy, sensitivity, specificity, f1 score, and MCC; y-axis shows score levels.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>3.3</label>
<title>Comparison of MHA-preconv with five benchmark methods on Dataset_3 and Dataset_4</title>
<p>The MHA-preconv model was compared with five well-established gene prediction tools: Prodigal, Orphelia, FragGeneScan, Tiberius, and Helixer, using Dataset_3 and Dataset_4. The results are shown in <xref ref-type="table" rid="T1">Tables 1</xref>, <xref ref-type="table" rid="T2">2</xref>. The highest accuracy was achieved by MHA-preconv on 8 out of 9 species in Dataset_3. The only exception was N. pharaonis (Species No. 7), where Prodigal slightly outperformed our method. For the remaining species, Acc values of 97.56%, 96.18%, 93.96%, 97.49%, 76.47%, 96.52%, 96.12%, and 96.67% were achieved by MHA-preconv, respectively. On Dataset_4, MHA-preconv outperformed all benchmark methods, achieving the highest Acc values across all five subsets: 97.33%, 96.40%, 97.78%, 95.88%, and 95.74%. These results demonstrate the strong generalization ability and classification performance of our method across a wide range of microbial genomes.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Results obtained by six methods on 9 different strains.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>Metric</bold></th>
<th valign="top" align="center"><bold>1</bold></th>
<th valign="top" align="center"><bold>2</bold></th>
<th valign="top" align="center"><bold>3</bold></th>
<th valign="top" align="center"><bold>4</bold></th>
<th valign="top" align="center"><bold>5</bold></th>
<th valign="top" align="center"><bold>6</bold></th>
<th valign="top" align="center"><bold>7</bold></th>
<th valign="top" align="center"><bold>8</bold></th>
<th valign="top" align="center"><bold>9</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="6">MHA-preconv</td>
<td valign="top" align="left">Accuracy(%)</td>
<td valign="top" align="center"><bold>97.56</bold></td>
<td valign="top" align="center"><bold>96.18</bold></td>
<td valign="top" align="center">93.96</td>
<td valign="top" align="center"><bold>97.49</bold></td>
<td valign="top" align="center"><bold>76.47</bold></td>
<td valign="top" align="center"><bold>96.52</bold></td>
<td valign="top" align="center">85.53</td>
<td valign="top" align="center"><bold>96.12</bold></td>
<td valign="top" align="center"><bold>96.67</bold></td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">94.25</td>
<td valign="top" align="center"><bold>96.36</bold></td>
<td valign="top" align="center">92.07</td>
<td valign="top" align="center">96.43</td>
<td valign="top" align="center">62.73</td>
<td valign="top" align="center"><bold>97.12</bold></td>
<td valign="top" align="center">82.46</td>
<td valign="top" align="center">96.11</td>
<td valign="top" align="center"><bold>96.43</bold></td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center"><bold>97.13</bold></td>
<td valign="top" align="center">95.15</td>
<td valign="top" align="center"><bold>93.74</bold></td>
<td valign="top" align="center"><bold>97.67</bold></td>
<td valign="top" align="center"><bold>84.63</bold></td>
<td valign="top" align="center"><bold>95.80</bold></td>
<td valign="top" align="center">83.77</td>
<td valign="top" align="center"><bold>97.38</bold></td>
<td valign="top" align="center"><bold>97.53</bold></td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center"><bold>95.78</bold></td>
<td valign="top" align="center"><bold>93.67</bold></td>
<td valign="top" align="center"><bold>92.90</bold></td>
<td valign="top" align="center"><bold>97.55</bold></td>
<td valign="top" align="center">71.97</td>
<td valign="top" align="center"><bold>96.45</bold></td>
<td valign="top" align="center">84.99</td>
<td valign="top" align="center">94.47</td>
<td valign="top" align="center"><bold>96.70</bold></td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center"><bold>98.54</bold></td>
<td valign="top" align="center">90.64</td>
<td valign="top" align="center">94.71</td>
<td valign="top" align="center"><bold>97.40</bold></td>
<td valign="top" align="center"><bold>90.48</bold></td>
<td valign="top" align="center"><bold>95.65</bold></td>
<td valign="top" align="center">85.42</td>
<td valign="top" align="center">95.20</td>
<td valign="top" align="center"><bold>95.64</bold></td>
</tr>
 <tr>
<td valign="top" align="left">F1 Score (%)</td>
<td valign="top" align="center">96.20</td>
<td valign="top" align="center"><bold>92.94</bold></td>
<td valign="top" align="center">93.37</td>
<td valign="top" align="center"><bold>97.39</bold></td>
<td valign="top" align="center"><bold>74.79</bold></td>
<td valign="top" align="center"><bold>96.34</bold></td>
<td valign="top" align="center">85.46</td>
<td valign="top" align="center">96.62</td>
<td valign="top" align="center"><bold>96.62</bold></td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Prodigal</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">95.96</td>
<td valign="top" align="center">95.40</td>
<td valign="top" align="center">93.96</td>
<td valign="top" align="center">95.81</td>
<td valign="top" align="center">65.15</td>
<td valign="top" align="center">71.85</td>
<td valign="top" align="center">91.50</td>
<td valign="top" align="center">91.17</td>
<td valign="top" align="center">94.66</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">96.13</td>
<td valign="top" align="center">96.31</td>
<td valign="top" align="center">95.77</td>
<td valign="top" align="center">96.97</td>
<td valign="top" align="center">64.99</td>
<td valign="top" align="center">92.27</td>
<td valign="top" align="center">88.99</td>
<td valign="top" align="center">97.88</td>
<td valign="top" align="center">91.96</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">95.97</td>
<td valign="top" align="center">90.26</td>
<td valign="top" align="center">93.62</td>
<td valign="top" align="center">94.81</td>
<td valign="top" align="center">65.31</td>
<td valign="top" align="center">81.52</td>
<td valign="top" align="center">94.17</td>
<td valign="top" align="center">96.47</td>
<td valign="top" align="center">97.52</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">95.93</td>
<td valign="top" align="center">92.87</td>
<td valign="top" align="center">92.82</td>
<td valign="top" align="center">94.77</td>
<td valign="top" align="center">65.25</td>
<td valign="top" align="center">86.56</td>
<td valign="top" align="center">91.61</td>
<td valign="top" align="center">97.40</td>
<td valign="top" align="center">93.68</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">96.70</td>
<td valign="top" align="center">93.38</td>
<td valign="top" align="center">96.15</td>
<td valign="top" align="center">95.07</td>
<td valign="top" align="center">65.07</td>
<td valign="top" align="center">89.35</td>
<td valign="top" align="center">89.90</td>
<td valign="top" align="center">97.53</td>
<td valign="top" align="center">93.30</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">96.32</td>
<td valign="top" align="center">92.60</td>
<td valign="top" align="center">96.03</td>
<td valign="top" align="center">96.93</td>
<td valign="top" align="center">65.03</td>
<td valign="top" align="center">90.78</td>
<td valign="top" align="center">89.06</td>
<td valign="top" align="center">97.70</td>
<td valign="top" align="center">92.62</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Orphelia</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">84.36</td>
<td valign="top" align="center">92.77</td>
<td valign="top" align="center">72.17</td>
<td valign="top" align="center">93.36</td>
<td valign="top" align="center">76.79</td>
<td valign="top" align="center">71.85</td>
<td valign="top" align="center">85.37</td>
<td valign="top" align="center">84.99</td>
<td valign="top" align="center">89.06</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">80.58</td>
<td valign="top" align="center">90.46</td>
<td valign="top" align="center">68.74</td>
<td valign="top" align="center">89.40</td>
<td valign="top" align="center">74.23</td>
<td valign="top" align="center">66.59</td>
<td valign="top" align="center">76.47</td>
<td valign="top" align="center">83.79</td>
<td valign="top" align="center">84.79</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">88.57</td>
<td valign="top" align="center">95.20</td>
<td valign="top" align="center">75.99</td>
<td valign="top" align="center">95.54</td>
<td valign="top" align="center">79.52</td>
<td valign="top" align="center">77.51</td>
<td valign="top" align="center">74.28</td>
<td valign="top" align="center">86.24</td>
<td valign="top" align="center">86.24</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">83.30</td>
<td valign="top" align="center">92.70</td>
<td valign="top" align="center">72.39</td>
<td valign="top" align="center">93.03</td>
<td valign="top" align="center">75.51</td>
<td valign="top" align="center">71.85</td>
<td valign="top" align="center">85.70</td>
<td valign="top" align="center">84.97</td>
<td valign="top" align="center">84.97</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">85.53</td>
<td valign="top" align="center">91.61</td>
<td valign="top" align="center">70.43</td>
<td valign="top" align="center">90.87</td>
<td valign="top" align="center">75.50</td>
<td valign="top" align="center">69.13</td>
<td valign="top" align="center">80.62</td>
<td valign="top" align="center">85.18</td>
<td valign="top" align="center">85.18</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">83.77</td>
<td valign="top" align="center">91.03</td>
<td valign="top" align="center">69.57</td>
<td valign="top" align="center">90.13</td>
<td valign="top" align="center">74.86</td>
<td valign="top" align="center">67.83</td>
<td valign="top" align="center">78.50</td>
<td valign="top" align="center">84.15</td>
<td valign="top" align="center">84.15</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">FragGeneScan</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">83.57</td>
<td valign="top" align="center">62.11</td>
<td valign="top" align="center">72.19</td>
<td valign="top" align="center">86.19</td>
<td valign="top" align="center">71.34</td>
<td valign="top" align="center">57.76</td>
<td valign="top" align="center">68.19</td>
<td valign="top" align="center">79.47</td>
<td valign="top" align="center">59.59</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">82.50</td>
<td valign="top" align="center">75.86</td>
<td valign="top" align="center">78.98</td>
<td valign="top" align="center">84.71</td>
<td valign="top" align="center">63.22</td>
<td valign="top" align="center">63.62</td>
<td valign="top" align="center">74.93</td>
<td valign="top" align="center">87.54</td>
<td valign="top" align="center">63.46</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">83.62</td>
<td valign="top" align="center">52.58</td>
<td valign="top" align="center">66.47</td>
<td valign="top" align="center">87.77</td>
<td valign="top" align="center">77.68</td>
<td valign="top" align="center">52.89</td>
<td valign="top" align="center">62.57</td>
<td valign="top" align="center">72.76</td>
<td valign="top" align="center">63.67</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">83.06</td>
<td valign="top" align="center">62.24</td>
<td valign="top" align="center">72.20</td>
<td valign="top" align="center">86.24</td>
<td valign="top" align="center">72.30</td>
<td valign="top" align="center">57.66</td>
<td valign="top" align="center">68.80</td>
<td valign="top" align="center">79.44</td>
<td valign="top" align="center">59.90</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">86.41</td>
<td valign="top" align="center">68.28</td>
<td valign="top" align="center">75.75</td>
<td valign="top" align="center">89.10</td>
<td valign="top" align="center">77.63</td>
<td valign="top" align="center">60.66</td>
<td valign="top" align="center">71.34</td>
<td valign="top" align="center">83.40</td>
<td valign="top" align="center">61.34</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">83.57</td>
<td valign="top" align="center">71.93</td>
<td valign="top" align="center">77.13</td>
<td valign="top" align="center">90.79</td>
<td valign="top" align="center">72.74</td>
<td valign="top" align="center">62.06</td>
<td valign="top" align="center">72.86</td>
<td valign="top" align="center">85.84</td>
<td valign="top" align="center">62.23</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Tiberius</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">90.26</td>
<td valign="top" align="center">93.73</td>
<td valign="top" align="center">74.33</td>
<td valign="top" align="center">93.00</td>
<td valign="top" align="center">75.75</td>
<td valign="top" align="center">72.34</td>
<td valign="top" align="center">87.21</td>
<td valign="top" align="center">86.94</td>
<td valign="top" align="center">89.50</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">83.82</td>
<td valign="top" align="center">90.46</td>
<td valign="top" align="center">65.27</td>
<td valign="top" align="center">89.80</td>
<td valign="top" align="center">74.76</td>
<td valign="top" align="center">68.49</td>
<td valign="top" align="center">80.33</td>
<td valign="top" align="center">83.90</td>
<td valign="top" align="center">84.67</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">89.51</td>
<td valign="top" align="center">95.20</td>
<td valign="top" align="center">73.30</td>
<td valign="top" align="center">96.72</td>
<td valign="top" align="center">78.23</td>
<td valign="top" align="center">79.54</td>
<td valign="top" align="center">73.26</td>
<td valign="top" align="center">87.15</td>
<td valign="top" align="center">88.45</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">84.03</td>
<td valign="top" align="center">91.36</td>
<td valign="top" align="center">72.92</td>
<td valign="top" align="center">93.22</td>
<td valign="top" align="center">76.12</td>
<td valign="top" align="center">71.57</td>
<td valign="top" align="center">85.37</td>
<td valign="top" align="center">83.90</td>
<td valign="top" align="center">85.39</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">85.53</td>
<td valign="top" align="center">89.11</td>
<td valign="top" align="center">72.34</td>
<td valign="top" align="center">91.88</td>
<td valign="top" align="center">74.90</td>
<td valign="top" align="center">70.40</td>
<td valign="top" align="center">81.34</td>
<td valign="top" align="center">85.89</td>
<td valign="top" align="center">86.17</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">83.77</td>
<td valign="top" align="center">90.32</td>
<td valign="top" align="center">69.99</td>
<td valign="top" align="center">90.41</td>
<td valign="top" align="center">75.45</td>
<td valign="top" align="center">68.82</td>
<td valign="top" align="center">79.49</td>
<td valign="top" align="center">85.13</td>
<td valign="top" align="center">85.87</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Helixer</td>
<td valign="top" align="left">Accuracy(%)</td>
<td valign="top" align="center">89.42</td>
<td valign="top" align="center">90.46</td>
<td valign="top" align="center">76.65</td>
<td valign="top" align="center">93.06</td>
<td valign="top" align="center">77.10</td>
<td valign="top" align="center">70.21</td>
<td valign="top" align="center">84.69</td>
<td valign="top" align="center">86.44</td>
<td valign="top" align="center">82.09</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">79.58</td>
<td valign="top" align="center">76.89</td>
<td valign="top" align="center">72.31</td>
<td valign="top" align="center">89.71</td>
<td valign="top" align="center">74.66</td>
<td valign="top" align="center">65.02</td>
<td valign="top" align="center">79.08</td>
<td valign="top" align="center">80.72</td>
<td valign="top" align="center">78.41</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">84.16</td>
<td valign="top" align="center">88.27</td>
<td valign="top" align="center">68.24</td>
<td valign="top" align="center">79.27</td>
<td valign="top" align="center">78.90</td>
<td valign="top" align="center">72.33</td>
<td valign="top" align="center">86.61</td>
<td valign="top" align="center">85.60</td>
<td valign="top" align="center">87.58</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">83.00</td>
<td valign="top" align="center">80.44</td>
<td valign="top" align="center">71.14</td>
<td valign="top" align="center">79.20</td>
<td valign="top" align="center">74.33</td>
<td valign="top" align="center">66.45</td>
<td valign="top" align="center">85.37</td>
<td valign="top" align="center">80.11</td>
<td valign="top" align="center">79.95</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">83.79</td>
<td valign="top" align="center">84.00</td>
<td valign="top" align="center">79.52</td>
<td valign="top" align="center">86.52</td>
<td valign="top" align="center">74.85</td>
<td valign="top" align="center">68.96</td>
<td valign="top" align="center">81.34</td>
<td valign="top" align="center">85.49</td>
<td valign="top" align="center">85.83</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">83.60</td>
<td valign="top" align="center">82.03</td>
<td valign="top" align="center">72.36</td>
<td valign="top" align="center">87.40</td>
<td valign="top" align="center">75.60</td>
<td valign="top" align="center">66.63</td>
<td valign="top" align="center">79.49</td>
<td valign="top" align="center">84.22</td>
<td valign="top" align="center">79.40</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best (optimal) results, which correspond to the highest performance achieved for the respective metrics.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Results obtained by the six methods on the five divided subsets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="left"><bold>Metric</bold></th>
<th valign="top" align="center"><bold>A</bold></th>
<th valign="top" align="center"><bold>B</bold></th>
<th valign="top" align="center"><bold>C</bold></th>
<th valign="top" align="center"><bold>D</bold></th>
<th valign="top" align="center"><bold>E</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="6">MHA-preconv</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center"><bold>97.33</bold></td>
<td valign="top" align="center"><bold>96.40</bold></td>
<td valign="top" align="center"><bold>97.78</bold></td>
<td valign="top" align="center"><bold>95.88</bold></td>
<td valign="top" align="center"><bold>95.74</bold></td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center"><bold>92.13</bold></td>
<td valign="top" align="center"><bold>93.90</bold></td>
<td valign="top" align="center">81.89</td>
<td valign="top" align="center"><bold>90.75</bold></td>
<td valign="top" align="center">92.13</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center"><bold>98.43</bold></td>
<td valign="top" align="center"><bold>97.65</bold></td>
<td valign="top" align="center"><bold>99.71</bold></td>
<td valign="top" align="center"><bold>97.55</bold></td>
<td valign="top" align="center"><bold>97.55</bold></td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center"><bold>95.56</bold></td>
<td valign="top" align="center"><bold>95.74</bold></td>
<td valign="top" align="center"><bold>89.92</bold></td>
<td valign="top" align="center"><bold>94.03</bold></td>
<td valign="top" align="center">94.76</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center"><bold>99.28</bold></td>
<td valign="top" align="center"><bold>95.21</bold></td>
<td valign="top" align="center"><bold>99.29</bold></td>
<td valign="top" align="center"><bold>94.86</bold></td>
<td valign="top" align="center"><bold>94.93</bold></td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">89.75</td>
<td valign="top" align="center">94.55</td>
<td valign="top" align="center"><bold>89.75</bold></td>
<td valign="top" align="center"><bold>92.76</bold></td>
<td valign="top" align="center">93.51</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Prodigal</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">95.30</td>
<td valign="top" align="center">94.60</td>
<td valign="top" align="center">84.56</td>
<td valign="top" align="center">85.37</td>
<td valign="top" align="center">94.71</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">92.10</td>
<td valign="top" align="center">90.00</td>
<td valign="top" align="center">85.04</td>
<td valign="top" align="center">72.21</td>
<td valign="top" align="center">95.66</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">90.02</td>
<td valign="top" align="center">89.00</td>
<td valign="top" align="center">78.09</td>
<td valign="top" align="center">70.23</td>
<td valign="top" align="center">90.18</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">95.45</td>
<td valign="top" align="center">93.01</td>
<td valign="top" align="center">81.17</td>
<td valign="top" align="center">74.56</td>
<td valign="top" align="center">97.01</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">88.20</td>
<td valign="top" align="center">87.03</td>
<td valign="top" align="center">80.01</td>
<td valign="top" align="center">70.73</td>
<td valign="top" align="center">93.82</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">95.49</td>
<td valign="top" align="center">93.48</td>
<td valign="top" align="center">81.52</td>
<td valign="top" align="center">74.28</td>
<td valign="top" align="center">97.52</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Orphelia</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">83.96</td>
<td valign="top" align="center">92.96</td>
<td valign="top" align="center">73.12</td>
<td valign="top" align="center">76.40</td>
<td valign="top" align="center">84.67</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">84.55</td>
<td valign="top" align="center">92.02</td>
<td valign="top" align="center">74.02</td>
<td valign="top" align="center">78.25</td>
<td valign="top" align="center">85.09</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">82.36</td>
<td valign="top" align="center">91.20</td>
<td valign="top" align="center">70.40</td>
<td valign="top" align="center">75.44</td>
<td valign="top" align="center">82.65</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">88.03</td>
<td valign="top" align="center">95.08</td>
<td valign="top" align="center">76.64</td>
<td valign="top" align="center">80.02</td>
<td valign="top" align="center">86.86</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">78.87</td>
<td valign="top" align="center">89.80</td>
<td valign="top" align="center">66.71</td>
<td valign="top" align="center">72.99</td>
<td valign="top" align="center">81.22</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">87.57</td>
<td valign="top" align="center">95.20</td>
<td valign="top" align="center">74.95</td>
<td valign="top" align="center">79.06</td>
<td valign="top" align="center">86.25</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">FragGeneScan</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">81.89</td>
<td valign="top" align="center">72.30</td>
<td valign="top" align="center">84.21</td>
<td valign="top" align="center">71.50</td>
<td valign="top" align="center">59.13</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">84.01</td>
<td valign="top" align="center">76.85</td>
<td valign="top" align="center">85.30</td>
<td valign="top" align="center">70.71</td>
<td valign="top" align="center">58.02</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">80.12</td>
<td valign="top" align="center">68.02</td>
<td valign="top" align="center">85.24</td>
<td valign="top" align="center">68.34</td>
<td valign="top" align="center">55.36</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">75.54</td>
<td valign="top" align="center">67.50</td>
<td valign="top" align="center">81.11</td>
<td valign="top" align="center">70.99</td>
<td valign="top" align="center">58.27</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">78.02</td>
<td valign="top" align="center">66.82</td>
<td valign="top" align="center">83.67</td>
<td valign="top" align="center">67.23</td>
<td valign="top" align="center">54.73</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">75.23</td>
<td valign="top" align="center">66.45</td>
<td valign="top" align="center">80.92</td>
<td valign="top" align="center">71.02</td>
<td valign="top" align="center">56.28</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Tiberius</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">82.85</td>
<td valign="top" align="center">79.64</td>
<td valign="top" align="center">84.56</td>
<td valign="top" align="center">74.60</td>
<td valign="top" align="center">78.61</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">76.21</td>
<td valign="top" align="center">77.44</td>
<td valign="top" align="center">85.63</td>
<td valign="top" align="center">76.81</td>
<td valign="top" align="center">69.25</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">86.32</td>
<td valign="top" align="center">65.65</td>
<td valign="top" align="center">80.24</td>
<td valign="top" align="center">65.30</td>
<td valign="top" align="center">75.06</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">74.55</td>
<td valign="top" align="center">70.05</td>
<td valign="top" align="center">82.44</td>
<td valign="top" align="center">72.02</td>
<td valign="top" align="center">71.23</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">83.92</td>
<td valign="top" align="center">79.86</td>
<td valign="top" align="center">85.00</td>
<td valign="top" align="center">74.01</td>
<td valign="top" align="center">74.33</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">78.20</td>
<td valign="top" align="center">77.00</td>
<td valign="top" align="center">85.92</td>
<td valign="top" align="center">74.90</td>
<td valign="top" align="center">76.29</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="6">Helixer</td>
<td valign="top" align="left">Accuracy (%)</td>
<td valign="top" align="center">82.43</td>
<td valign="top" align="center">76.90</td>
<td valign="top" align="center">82.11</td>
<td valign="top" align="center">74.99</td>
<td valign="top" align="center">79.10</td>
</tr>
 <tr>
<td valign="top" align="left">Sn (%)</td>
<td valign="top" align="center">75.41</td>
<td valign="top" align="center">67.95</td>
<td valign="top" align="center">74.05</td>
<td valign="top" align="center">69.91</td>
<td valign="top" align="center">66.88</td>
</tr>
 <tr>
<td valign="top" align="left">Sp (%)</td>
<td valign="top" align="center">88.47</td>
<td valign="top" align="center">79.63</td>
<td valign="top" align="center">79.27</td>
<td valign="top" align="center">78.54</td>
<td valign="top" align="center">80.30</td>
</tr>
 <tr>
<td valign="top" align="left">HM (%)</td>
<td valign="top" align="center">75.86</td>
<td valign="top" align="center">70.00</td>
<td valign="top" align="center">79.14</td>
<td valign="top" align="center">70.21</td>
<td valign="top" align="center">63.90</td>
</tr>
 <tr>
<td valign="top" align="left">Precision (%)</td>
<td valign="top" align="center">79.12</td>
<td valign="top" align="center">74.15</td>
<td valign="top" align="center">83.91</td>
<td valign="top" align="center">76.11</td>
<td valign="top" align="center">77.70</td>
</tr>
 <tr>
<td valign="top" align="left">F1 score (%)</td>
<td valign="top" align="center">77.03</td>
<td valign="top" align="center">70.60</td>
<td valign="top" align="center">82.10</td>
<td valign="top" align="center">71.66</td>
<td valign="top" align="center">70.21</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best (optimal) results, which correspond to the highest performance achieved for the respective metrics.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<label>3.4</label>
<title>Comparison with state-of-the-art ACP methods on Dataset_1 and Dataset_2</title>
<p>To comprehensively evaluate the proposed FuncPred-CB model for functional-peptide recognition, we conducted comparative experiments on two benchmark datasets against six state-of-the-art ACP predictors: ACP-DRL, ACP-check, iACP-DRLF, AntiCP 2.0, ACP-CLB (<xref ref-type="bibr" rid="B12">Geng et al., 2025</xref>) and ACP-GCN (<xref ref-type="bibr" rid="B31">Rao et al., 2020</xref>). As illustrated in <xref ref-type="fig" rid="F4">Figure 4</xref> and <xref ref-type="table" rid="T3">Table 3</xref>, FuncPred-CB achieves 92.49% accuracy, 91.19% sensitivity, 93.78% specificity, 86.78% MCC and 94.58% AUC on Dataset_1, and 73.19% accuracy, 71.01% sensitivity, 77.20% specificity, 46.42% MCC and 82.60% AUC on Dataset_2. Except for accuracy and MCC, where ACP-CLB achieves higher values, and specificity on Dataset_1, where ACP-DRL leads, FuncPred-CB surpasses all baseline methods on the remaining metrics, demonstrating robust adaptability to complex and heterogeneous data. Collectively, these results validate the superior overall performance, generalizability and competitiveness of FuncPred-CB as a functional-peptide predictor for anticancer-peptide identification.</p>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Comparison of ROC curves for different models across three functional peptide datasets.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-17-1736391-g0004.tif">
<alt-text content-type="machine-generated">Three ROC curve graphs comparing model performance on different datasets. The first graph shows various models with AUC scores, highlighting ACP-DRL (0.9225) and Our method (0.9858) among others. The second graph includes models like ACP-DRL (0.7625) and ACF-CNN (0.8492). The third graph compares AMPred-HLF (0.9833), Our method (0.9853), and DeepAMPpred (0.9635). Each graph plots True Positive Rate against False Positive Rate.</alt-text>
</graphic>
</fig>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Performance comparison of various models on Dataset_1 and Dataset_2.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>Sn (%)</bold></th>
<th valign="top" align="center"><bold>Sp (%)</bold></th>
<th valign="top" align="center"><bold>MCC (%)</bold></th>
<th valign="top" align="center"><bold>AUC (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="2">ACP-DRL</td>
<td valign="top" align="left">Dataset_1</td>
<td valign="top" align="center">90.67</td>
<td valign="top" align="center">86.01</td>
<td valign="top" align="center"><bold>95.33</bold></td>
<td valign="top" align="center">80.80</td>
<td valign="top" align="center">92.30</td>
</tr>
 <tr>
<td valign="top" align="left">Dataset_2</td>
<td valign="top" align="center">67.39</td>
<td valign="top" align="center">69.57</td>
<td valign="top" align="center">69.57</td>
<td valign="top" align="center">34.80</td>
<td valign="top" align="center">72.74</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">FuncPred-CB</td>
<td valign="top" align="left">Dataset_1</td>
<td valign="top" align="center">92.49</td>
<td valign="top" align="center"><bold>91.19</bold></td>
<td valign="top" align="center">93.78</td>
<td valign="top" align="center">86.78</td>
<td valign="top" align="center"><bold>94.58</bold></td>
</tr>
 <tr>
<td valign="top" align="left">Dataset_2</td>
<td valign="top" align="center">73.19</td>
<td valign="top" align="center"><bold>71.01</bold></td>
<td valign="top" align="center"><bold>77.20</bold></td>
<td valign="top" align="center">46.42</td>
<td valign="top" align="center"><bold>82.60</bold></td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">ACP-check</td>
<td valign="top" align="left">Dataset_1</td>
<td valign="top" align="center">90.24</td>
<td valign="top" align="center">90.52</td>
<td valign="top" align="center">90.52</td>
<td valign="top" align="center">86.45</td>
<td valign="top" align="center">88.70</td>
</tr>
 <tr>
<td valign="top" align="left">Dataset_2</td>
<td valign="top" align="center">65.58</td>
<td valign="top" align="center">70.29</td>
<td valign="top" align="center">72.31</td>
<td valign="top" align="center">31.30</td>
<td valign="top" align="center">74.69</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">iACP-DRLF</td>
<td valign="top" align="left">Dataset_1</td>
<td valign="top" align="center">80.65</td>
<td valign="top" align="center">80.22</td>
<td valign="top" align="center">85.51</td>
<td valign="top" align="center">61.26</td>
<td valign="top" align="center">90.10</td>
</tr>
 <tr>
<td valign="top" align="left">Dataset_2</td>
<td valign="top" align="center">62.32</td>
<td valign="top" align="center">59.42</td>
<td valign="top" align="center">63.42</td>
<td valign="top" align="center">24.68</td>
<td valign="top" align="center">71.10</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">AntiCP 2.0</td>
<td valign="top" align="left">Dataset_1</td>
<td valign="top" align="center">70.97</td>
<td valign="top" align="center">76.12</td>
<td valign="top" align="center">67.92</td>
<td valign="top" align="center">42.14</td>
<td valign="top" align="center">90.80</td>
</tr>
 <tr>
<td valign="top" align="left">Dataset_2</td>
<td valign="top" align="center">58.70</td>
<td valign="top" align="center">65.94</td>
<td valign="top" align="center">60.57</td>
<td valign="top" align="center">17.58</td>
<td valign="top" align="center">70.25</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">ACP-CLB</td>
<td valign="top" align="left">Dataset_1</td>
<td valign="top" align="center"><bold>94.33</bold></td>
<td valign="top" align="center">90.12</td>
<td valign="top" align="center">93.24</td>
<td valign="top" align="center"><bold>87.50</bold></td>
<td valign="top" align="center">91.21</td>
</tr>
 <tr>
<td valign="top" align="left">Dataset_2</td>
<td valign="top" align="center"><bold>79.10</bold></td>
<td valign="top" align="center">69.35</td>
<td valign="top" align="center">74.40</td>
<td valign="top" align="center"><bold>60.61</bold></td>
<td valign="top" align="center">77.22</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="2">ACP-GCN</td>
<td valign="top" align="left">Dataset_1</td>
<td valign="top" align="center">84.12</td>
<td valign="top" align="center">80.17</td>
<td valign="top" align="center">84.69</td>
<td valign="top" align="center">64.83</td>
<td valign="top" align="center">86.24</td>
</tr>
 <tr>
<td valign="top" align="left">Dataset_2</td>
<td valign="top" align="center">70.90</td>
<td valign="top" align="center">66.95</td>
<td valign="top" align="center">70.00</td>
<td valign="top" align="center">39.70</td>
<td valign="top" align="center">71.55</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best (optimal) results, which correspond to the highest performance achieved for the respective metrics.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<label>3.5</label>
<title>Comparison with the latest AMP methods on Dataset_3</title>
<p>To assess the performance of FuncPred-CB on AMP recognition, we benchmarked it against the recent models AMPpred-MFA and Deep-AMPpred (<xref ref-type="bibr" rid="B42">Zhao et al., 2025</xref>) on Dataset_3. The results, provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary File S3.3</xref> and <xref ref-type="table" rid="T4">Table 4</xref>, show that FuncPred-CB attained 95.85% accuracy, 97.33% sensitivity, 97.30% specificity, 91.72% MCC, and 98.67% AUC. Although its accuracy is marginally lower than that of Deep-AMPpred, FuncPred-CB delivers superior overall performance relative to both AMPpred-MFA and Deep-AMPpred. These findings underscore that FuncPred-CB not only excels in anticancer-peptide prediction but also exhibits strong discriminative power and generalizability in AMP classification tasks.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Comparison of performance metrics between FuncPred-CB, AMPpred-MFA and Deep-AMPpred on Dataset_3.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>Sn (%)</bold></th>
<th valign="top" align="center"><bold>Sp (%)</bold></th>
<th valign="top" align="center"><bold>MCC (%)</bold></th>
<th valign="top" align="center"><bold>AUC (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">FuncPred-CB</td>
<td valign="top" align="center">95.85</td>
<td valign="top" align="center"><bold>97.33</bold></td>
<td valign="top" align="center"><bold>97.30</bold></td>
<td valign="top" align="center"><bold>91.72</bold></td>
<td valign="top" align="center"><bold>98.67</bold></td>
</tr>
<tr>
<td valign="top" align="left">AMPpred-MFA</td>
<td valign="top" align="center">94.72</td>
<td valign="top" align="center">94.74</td>
<td valign="top" align="center">94.00</td>
<td valign="top" align="center">88.00</td>
<td valign="top" align="center">98.53</td>
</tr>
<tr>
<td valign="top" align="left">Deep-AMPpred</td>
<td valign="top" align="center"><bold>97.28</bold></td>
<td valign="top" align="center">90.70</td>
<td valign="top" align="center">93.22</td>
<td valign="top" align="center">80.60</td>
<td valign="top" align="center">96.55</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best (optimal) results, which correspond to the highest performance achieved for the respective metrics.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<label>3.6</label>
<title>The impact of basic nucleotides on MHA-preconv</title>
<p>Since the differences in base frequencies between coding and non-coding regions can reflect structural and functional characteristics of the genome, the data distribution of coding and non-coding regions was plotted (detailed in <xref ref-type="supplementary-material" rid="SM1">Supplementary File S3.4</xref>), and the impact of including nucleotide composition as a feature on model performance was compared. As shown in <xref ref-type="table" rid="T5">Table 5</xref>, the incorporation of nucleotide composition features improved the model&#x00027;s prediction accuracy, sensitivity, specificity, and other metrics across different genomes.</p>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Comparison of the impact of nucleotide composition features on the performance of MHA-preconv.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Genome ID</bold></th>
<th valign="top" align="center" colspan="3"><bold>base_composition</bold></th>
<th valign="top" align="center" colspan="3"><bold>nobase_composition</bold></th>
</tr>
<tr>
<th/>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>Sn (%)</bold></th>
<th valign="top" align="center"><bold>Sp (%)</bold></th>
<th valign="top" align="center"><bold>Acc (%)</bold></th>
<th valign="top" align="center"><bold>Sn (%)</bold></th>
<th valign="top" align="center"><bold>Sp (%)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">NC_002516.1</td>
<td valign="top" align="center">97.33</td>
<td valign="top" align="center">95.33</td>
<td valign="top" align="center">98.04</td>
<td valign="top" align="center">94.26</td>
<td valign="top" align="center">92.40</td>
<td valign="top" align="center">94.92</td>
</tr>
<tr>
<td valign="top" align="left">NC_000909.1</td>
<td valign="top" align="center">97.24</td>
<td valign="top" align="center">95.16</td>
<td valign="top" align="center">97.97</td>
<td valign="top" align="center">95.70</td>
<td valign="top" align="center">95.50</td>
<td valign="top" align="center">97.35</td>
</tr>
<tr>
<td valign="top" align="left">NC_007426.1</td>
<td valign="top" align="center">97.18</td>
<td valign="top" align="center">94.98</td>
<td valign="top" align="center">98.04</td>
<td valign="top" align="center">93.32</td>
<td valign="top" align="center">92.83</td>
<td valign="top" align="center">93.49</td>
</tr>
<tr>
<td valign="top" align="left">NC_002528.1</td>
<td valign="top" align="center">97.30</td>
<td valign="top" align="center">97.23</td>
<td valign="top" align="center">95.43</td>
<td valign="top" align="center">85.79</td>
<td valign="top" align="center">76.33</td>
<td valign="top" align="center">85.07</td>
</tr>
<tr>
<td valign="top" align="left">NC_007164.1</td>
<td valign="top" align="center">97.17</td>
<td valign="top" align="center">98.02</td>
<td valign="top" align="center">97.04</td>
<td valign="top" align="center">96.70</td>
<td valign="top" align="center">94.56</td>
<td valign="top" align="center">97.71</td>
</tr>
<tr>
<td valign="top" align="left">NC_002932.1</td>
<td valign="top" align="center">97.56</td>
<td valign="top" align="center">96.83</td>
<td valign="top" align="center">97.24</td>
<td valign="top" align="center">93.63</td>
<td valign="top" align="center">95.42</td>
<td valign="top" align="center">93.00</td>
</tr>
<tr>
<td valign="top" align="left">NC_000921.1</td>
<td valign="top" align="center">97.30</td>
<td valign="top" align="center">98.22</td>
<td valign="top" align="center">96.84</td>
<td valign="top" align="center">95.32</td>
<td valign="top" align="center">94.21</td>
<td valign="top" align="center">97.38</td>
</tr>
<tr>
<td valign="top" align="left">NC_007577.1</td>
<td valign="top" align="center">97.43</td>
<td valign="top" align="center">96.43</td>
<td valign="top" align="center">97.33</td>
<td valign="top" align="center">88.37</td>
<td valign="top" align="center">91.96</td>
<td valign="top" align="center">78.99</td>
</tr>
<tr>
<td valign="top" align="left">NC_006833.1</td>
<td valign="top" align="center">97.34</td>
<td valign="top" align="center">95.83</td>
<td valign="top" align="center">97.24</td>
<td valign="top" align="center">95.68</td>
<td valign="top" align="center">95.68</td>
<td valign="top" align="center">94.67</td>
</tr>
<tr>
<td valign="top" align="left">NC_006350.1</td>
<td valign="top" align="center">97.29</td>
<td valign="top" align="center">95.07</td>
<td valign="top" align="center">98.07</td>
<td valign="top" align="center">96.35</td>
<td valign="top" align="center">95.94</td>
<td valign="top" align="center">96.50</td>
</tr></tbody>
</table>
</table-wrap>
<p>Accordingly, the use of standalone CNN and Transformer modules was also compared; different configurations of CNN and encoder layers were tested; and the effect of the Bi-LSTM model on model performance was evaluated. All these modifications significantly enhanced the model&#x00027;s performance. Detailed analyses and results are provided in <xref ref-type="supplementary-material" rid="SM1">Supplementary Files S3.5</xref>&#x02013;<xref ref-type="supplementary-material" rid="SM1">S3.8</xref>.</p>
</sec>
<sec>
<label>3.7</label>
<title>Physicochemical property analysis of functional peptides</title>
<p>To evaluate the potential of the predicted functional peptides in the fields of ACPs and AMPs, we employed the unified prediction framework GP2FI. Coding genes predicted from Dataset_1 were translated into protein sequences, which were then subjected to functional peptide identification using FuncPred-CB and comprehensive multiparametric physicochemical characterization. As shown in <xref ref-type="fig" rid="F5">Figures 5</xref>&#x02013;<xref ref-type="fig" rid="F7">7</xref>, the large number of sequences limits the resolution of the heat-maps, which are therefore intended only as an overview of the global physicochemical landscape of ACPs and AMPs. Detailed sequence information and peptide classifications, together with <italic>in-silico</italic> mapping against the CancerPPD database, are provided in <xref ref-type="supplementary-material" rid="SM1">Appendix S3.10</xref>; wet-lab functional assays will be required for definitive validation.</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Heatmap of physicochemical property profiles for functional peptides predicted by the GP2FI model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-17-1736391-g0005.tif">
<alt-text content-type="machine-generated">Heatmap showing protein sequences analyzed by various properties: GRAVY, Molecular Weight, Aromaticity, Instability Index, Isoelectric Point, Charge at pH 7, Boman Index, and W Content. Color intensity ranges from blue (negative values) to red (positive values).</alt-text>
</graphic>
</fig>
<p><xref ref-type="fig" rid="F5">Figure 5</xref> illustrates the normalized distribution of eight physicochemical properties&#x02014;GRAVY (hydrophobicity), molecular weight, aromaticity, instability index, isoelectric point, net charge at pH 7, Boman index, and tryptophan content (W_Content)&#x02014;across multiple sequences predicted as functional peptides. The heatmap reveals substantial variation in these properties among different sequences, indicating their potential functional divergence. For example, the sequence in row 9 shows high values for isoelectric point, net charge, and Boman index, suggesting strong cationic affinity and protein interaction capability, making it a promising ACP candidate. Similarly, the sequence in row 19 exhibits elevated isoelectric point and positive charge, along with moderate aromaticity and tryptophan content, indicating considerable anticancer potential. Sequences in rows 13 and 41 also display favorable profiles across several ACP-associated features, highlighting their structural suitability as ACPs. In contrast, the sequence in row 35 exhibits high GRAVY and aromaticity values, reflecting strong hydrophobicity and structural stability&#x02014;characteristics well-suited for AMP candidates. The sequence in row 16 also scores high in GRAVY and instability index, indicating favorable membrane affinity and antimicrobial stability. Row 43 shows marked enrichment in aromaticity and tryptophan content, aligning with classic physicochemical traits of AMPs.</p>
<p>To more intuitively reveal the numerical differences in key properties and their association with sequence characteristics, <xref ref-type="fig" rid="F6">Figure 6</xref> presents bar charts of GRAVY, Boman index, and tryptophan content across the predicted functional peptide sequences. Compared to the heatmap, the bar plots offer a clearer depiction of the magnitude of each attribute, facilitating the identification of representative sequences. For instance, these visualizations further confirm that sequences such as that in row 35 exhibit notably high scores in GRAVY and aromaticity, indicating strong hydrophobicity and high structural stability&#x02014;traits that suggest strong potential as AMP candidates.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Bar chart of key physicochemical properties for functional peptide sequences predicted by the GP2FI model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-17-1736391-g0006.tif">
<alt-text content-type="machine-generated">Bar chart displaying vertical bars of varying heights and colors, including GRAVY, Boman  index, and tryptophan content, mostly clustered around the zero point on the y-axis labeled &#x0201C;Value,&#x0201D; with fluctuations ranging approximately from -3 to +2. The x-axis represents  sequences with densely packed labels that are not clearly discernible.</alt-text>
</graphic>
</fig>
<p>Furthermore, <xref ref-type="fig" rid="F7">Figure 7</xref> presents a scatter plot illustrating the relationship between GRAVY and net charge, with the Boman index encoded as the color gradient. This visualization reveals the multidimensional interaction among these physicochemical properties. Analysis shows that most AMP-like sequences cluster in the region characterized by high GRAVY and low charge, whereas sequences exhibiting moderate hydrophobicity, medium to high charge, and elevated Boman index tend to group in the ACP-favored region. Brighter colors (yellow) represent higher Boman index values, indicating stronger potential for protein&#x02013;protein interactions and possibly higher biological activity. For instance, sequences located in the upper-right region of the plot exhibit both high charge and high Boman index, suggesting strong structural characteristics associated with ACPs, while those in the lower-left region with low charge and high hydrophobicity are more typical of AMP-like features.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Scatter plot of GRAVY and charge properties for functional peptide sequences predicted by the GP2FI model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmicb-17-1736391-g0007.tif">
<alt-text content-type="machine-generated">Scatter plot showing the relationship between GRAVY (hydrophobicity) on the x-axis and charge at pH 7 on the y-axis. Data points are colored based on the Boman Index, with a color gradient from blue to yellow indicating increasing values. Most points are clustered around lower charges and hydrophobicity values, with a few outliers at higher charges.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<p>The main innovations of GP2FI are reflected in the following aspects: First, to address the challenges of gene prediction, the MHA-preconv model integrated CNN and Transformer architectures, enabling the effective extraction of local patterns in ORFs while simultaneously capturing long-range dependencies in nucleotide sequences. Compared with previous models that heavily rely on handcrafted features, a more streamlined architecture, higher prediction efficiency, and reduced dependency on manual feature inputs are achieved by MHA-preconv. Second, the FuncPred-CB model was proposed for dual-task functional peptide prediction. It leverages a pretrained BERT language model to automatically extract contextual semantic representations of amino acid sequences and employs a dual-channel architecture combining CNN and Bi-LSTM to deeply fuse local and global features. Experimental results demonstrate that superior performance across multiple metrics is achieved by FuncPred-CB. Finally, a physicochemical property analysis of the predicted functional peptide sequences was conducted. The results further validated the predictive effectiveness of the model and revealed functional tendencies of different sequences in anticancer and antimicrobial directions, offering valuable insights for subsequent experimental validation and drug development.</p>
<p>Despite its strong predictive performance, GP2FI has several limitations. First, the framework only provides &#x0201C;precursor-level&#x0201D; activity scores for small ribosomally encoded peptides (sREPs) translated via the canonical ribosome; it ingests complete metagenomic ORF sequences and currently does not model signal-peptide removal, proteolytic cleavage, post-translational modifications, or non-ribosomal peptide synthetase (NRPS) pathways. Dedicated maturation modules and wet-lab validation will be incorporated in future work to bridge this gap. In addition, the pipeline remains a two-stage system that requires manual hand-off between gene prediction and peptide function prediction. We plan to introduce joint training to establish a truly end-to-end workflow. The current model also exhibits suboptimal efficiency when handling very long sequences and in deployment scenarios, while physicochemical property analysis is not yet embedded within the predictive loop. Upcoming efforts will focus on integrating structural information to build more biologically interpretable models with enhanced explainability. We acknowledge that, although our study has significantly reduced reliance on manual features compared with traditional approaches, a subset of handcrafted prior features is still employed. In future work, we are committed to achieving high-efficiency prediction without any handcrafted features whatsoever.</p>
<p>Certainly, as <xref ref-type="bibr" rid="B28">Mc Neil and Lee (2025)</xref> emphasized, expanding and prospecting future functional-peptide research requires &#x0201C;novel immunomodulatory molecules to overcome resistance&#x0201D;&#x02014;a niche for which ACPs are ideal candidates. Rapid and accurate genome-wide identification of ACPs equips clinicians with a readily deployable &#x0201C;peptide arsenal&#x0201D; that can be immediately combined with ICIs, perfectly aligning with the review&#x00027;s vision of &#x0201C;personalized ICI-combination therapy.&#x0201D; In our future work, we plan to adopt the multi-model voting ensemble framework proposed by <xref ref-type="bibr" rid="B3">Abbas et al. (2025c)</xref>, originally designed for multi-class peptide tasks, to further boost the robustness and generalizability of our prediction system.</p></sec>
<sec sec-type="conclusions" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>In this study, a unified prediction framework, GP2FI, was proposed, which integrates two deep learning models: MHA-preconv for metagenomic gene prediction and FuncPred-CB for the identification of ACPs and AMPs. As a multitask integrated deep learning framework, GP2FI exhibits excellent performance and practical application potential in both gene prediction and functional peptide recognition. Compared to traditional methods, experimental results across multiple datasets demonstrate that strong performance advantages and broad adaptability in both coding gene detection and functional peptide screening are offered by GP2FI. Ongoing improvements in data integration, model efficiency, and biological interpretability will further strengthen its utility, providing comprehensive computational support for the efficient discovery of functional peptide-based therapeutics.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Gene prediction-related datasets can be obtained from NCBI RefSeq (<ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/refseq/">https://www.ncbi.nlm.nih.gov/refseq/</ext-link>) and GenBank (<ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/genbank/">https://www.ncbi.nlm.nih.gov/genbank/</ext-link>). The CAMI dataset is available for download at <ext-link ext-link-type="uri" xlink:href="https://data.cami-challenge.org/">https://data.cami-challenge.org/</ext-link>. The Sharon real metagenomic dataset can be accessed from the NCBI SRA database (<ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/sra">https://www.ncbi.nlm.nih.gov/sra</ext-link>). Cancer peptide datasets can be obtained from the following databases: DADP (<ext-link ext-link-type="uri" xlink:href="http://webs.iiitd.edu.in/raghava/dadp/">http://webs.iiitd.edu.in/raghava/dadp/</ext-link>), CAMP (<ext-link ext-link-type="uri" xlink:href="http://www.camp.bicnirrh.res.in/">http://www.camp.bicnirrh.res.in/</ext-link>), APD/APD2 (<ext-link ext-link-type="uri" xlink:href="https://aps.unmc.edu/">https://aps.unmc.edu/</ext-link>), CancerPPD (<ext-link ext-link-type="uri" xlink:href="http://crdd.osdd.net/raghava/cancerppd/">http://crdd.osdd.net/raghava/cancerppd/</ext-link>), UniProt (<ext-link ext-link-type="uri" xlink:href="https://www.uniprot.org/">https://www.uniprot.org/</ext-link>), and SwissProt (<ext-link ext-link-type="uri" xlink:href="https://www.uniprot.org/help/swiss-prot">https://www.uniprot.org/help/swiss-prot</ext-link>). 
Antimicrobial peptide datasets can be obtained from ADAM (<ext-link ext-link-type="uri" xlink:href="http://bioinform.info/adam/">http://bioinform.info/adam/</ext-link>), APD (<ext-link ext-link-type="uri" xlink:href="https://aps.unmc.edu/">https://aps.unmc.edu/</ext-link>), CAMP (<ext-link ext-link-type="uri" xlink:href="http://www.camp.bicnirrh.res.in/">http://www.camp.bicnirrh.res.in/</ext-link>), and LAMP (<ext-link ext-link-type="uri" xlink:href="http://biotechlab.fudan.edu.cn/database/lamp/">http://biotechlab.fudan.edu.cn/database/lamp/</ext-link>).</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>CM: Writing &#x02013; original draft, Software. QW: Data curation, Formal analysis, Writing &#x02013; review &#x00026; editing. GW: Data curation, Supervision, Formal analysis, Writing &#x02013; review &#x00026; editing. LY: Data curation, Writing &#x02013; review &#x00026; editing, Funding acquisition. YM: Project administration, Methodology, Writing &#x02013; review &#x00026; editing, Formal analysis, Funding acquisition.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec><sec sec-type="supplementary-material" id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fmicb.2026.1736391/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fmicb.2026.1736391/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Supplementary_file_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abbas</surname> <given-names>S. R.</given-names></name> <name><surname>Abbas</surname> <given-names>Z.</given-names></name> <name><surname>Zahir</surname> <given-names>A.</given-names></name> <name><surname>Lee</surname> <given-names>S. W.</given-names></name></person-group> (<year>2025a</year>). <article-title>Advancing genome-based precision medicine: a review on machine learning applications for rare genetic disorders</article-title>. <source>Brief. Bioinform</source>. <volume>26</volume>:<fpage>bbaf329</fpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbaf329</pub-id><pub-id pub-id-type="pmid">40668553</pub-id></mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abbas</surname> <given-names>S. R.</given-names></name> <name><surname>Seol</surname> <given-names>H.</given-names></name> <name><surname>Abbas</surname> <given-names>Z.</given-names></name> <name><surname>Lee</surname> <given-names>S. W.</given-names></name></person-group> (<year>2025b</year>). <article-title>Exploring the role of artificial intelligence in smart healthcare: a capability and function-oriented review</article-title>. <source>Healthcare</source> <volume>13</volume>:<fpage>1642</fpage>. doi: <pub-id pub-id-type="doi">10.3390/healthcare13141642</pub-id><pub-id pub-id-type="pmid">40724669</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abbas</surname> <given-names>Z.</given-names></name> <name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Lee</surname> <given-names>N.</given-names></name> <name><surname>Kazmi</surname> <given-names>S. A. W.</given-names></name> <name><surname>Lee</surname> <given-names>S. W.</given-names></name></person-group> (<year>2025c</year>). <article-title>A robust ensemble framework for anticancer peptide classification using multi-model voting approach</article-title>. <source>Comput. Biol. Med</source>. <volume>188</volume>:<fpage>109750</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.109750</pub-id><pub-id pub-id-type="pmid">40032410</pub-id></mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Abbas</surname> <given-names>Z.</given-names></name> <name><surname>Rehman</surname> <given-names>M. U.</given-names></name> <name><surname>Tayara</surname> <given-names>H.</given-names></name> <name><surname>Lee</surname> <given-names>S. W.</given-names></name> <name><surname>Chong</surname> <given-names>K. T.</given-names></name></person-group> (<year>2024</year>). <article-title>m5C-Seq: machine learning-enhanced profiling of RNA 5-methylcytosine modifications</article-title>. <source>Comput. Biol. Med</source>. <volume>182</volume>:<fpage>109087</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.109087</pub-id><pub-id pub-id-type="pmid">39232403</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Agrawal</surname> <given-names>P.</given-names></name> <name><surname>Bhagat</surname> <given-names>D.</given-names></name> <name><surname>Mahalwal</surname> <given-names>M.</given-names></name> <name><surname>Sharma</surname> <given-names>N.</given-names></name> <name><surname>Raghava</surname> <given-names>G. P. S.</given-names></name></person-group> (<year>2020</year>). <article-title>AntiCP 2.0: an updated model for predicting anticancer peptides</article-title>. <source>Brief. Bioinform</source>. <volume>22</volume>:<fpage>bbaa153</fpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbaa153</pub-id><pub-id pub-id-type="pmid">32770192</pub-id></mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Al-Ajlan</surname> <given-names>A.</given-names></name> <name><surname>El Allali</surname> <given-names>A.</given-names></name></person-group> (<year>2018a</year>). <article-title>&#x0201C;The effect of machine learning algorithms on metagenomics gene prediction,&#x0201D;</article-title> in <source>Proceedings of the 2018 5th International Conference on Bioinformatics Research and Applications</source>, <fpage>16</fpage>&#x02013;<lpage>21</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3309129.3309136</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Al-Ajlan</surname> <given-names>A.</given-names></name> <name><surname>El Allali</surname> <given-names>A.</given-names></name></person-group> (<year>2018b</year>). <article-title>CNN-MGP: convolutional neural networks for metagenomics gene prediction</article-title>. <source>Interdisc. Sci</source>. <volume>11</volume>, <fpage>628</fpage>&#x02013;<lpage>635</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s12539-018-0313-4</pub-id><pub-id pub-id-type="pmid">30588558</pub-id></mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bhadra</surname> <given-names>P.</given-names></name> <name><surname>Yan</surname> <given-names>J.</given-names></name> <name><surname>Li</surname> <given-names>J.</given-names></name> <name><surname>Fong</surname> <given-names>S.</given-names></name> <name><surname>Siu</surname> <given-names>S. W. I.</given-names></name></person-group> (<year>2018</year>). <article-title>AmPEP: sequence-based prediction of antimicrobial peptides using distribution patterns of amino acid properties and random forest</article-title>. <source>Sci. Rep</source>. <volume>8</volume>:<fpage>1697</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-018-19752-w</pub-id><pub-id pub-id-type="pmid">29374199</pub-id></mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Delcher</surname> <given-names>A. L.</given-names></name> <name><surname>Bratke</surname> <given-names>K. A.</given-names></name> <name><surname>Powers</surname> <given-names>E. C.</given-names></name> <name><surname>Salzberg</surname> <given-names>S. L.</given-names></name></person-group> (<year>2007</year>). <article-title>Identifying bacterial genes and endosymbiont DNA with Glimmer</article-title>. <source>Bioinformatics</source> <volume>23</volume>, <fpage>673</fpage>&#x02013;<lpage>679</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btm009</pub-id><pub-id pub-id-type="pmid">17237039</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>El Allali</surname> <given-names>A.</given-names></name> <name><surname>Rose</surname> <given-names>J. R.</given-names></name></person-group> (<year>2013</year>). <article-title>MGC: a metagenomic gene caller</article-title>. <source>BMC Bioinform</source>. <volume>14</volume>:<fpage>S6</fpage>. doi: <pub-id pub-id-type="doi">10.1186/1471-2105-14-S9-S6</pub-id><pub-id pub-id-type="pmid">23901840</pub-id></mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gabriel</surname> <given-names>L.</given-names></name> <name><surname>Becker</surname> <given-names>F.</given-names></name> <name><surname>Hoff</surname> <given-names>K. J.</given-names></name> <name><surname>Stanke</surname> <given-names>M.</given-names></name></person-group> (<year>2024</year>). <article-title>Tiberius: end-to-end deep learning with an HMM for gene prediction</article-title>. <source>Bioinformatics</source> <volume>40</volume>:<fpage>btae685</fpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btae685</pub-id><pub-id pub-id-type="pmid">39558581</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Geng</surname> <given-names>A.</given-names></name> <name><surname>Luo</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>A.</given-names></name> <name><surname>Zhang</surname> <given-names>Z.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name> <name><surname>Wei</surname> <given-names>L.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>ACP-CLB: an anticancer peptide prediction model based on multichannel discriminative processing and integration of large pretrained protein language models</article-title>. <source>J. Chem. Inf. Model</source>. <volume>65</volume>, <fpage>2336</fpage>&#x02013;<lpage>2349</lpage>. doi: <pub-id pub-id-type="doi">10.1021/acs.jcim.4c02072</pub-id><pub-id pub-id-type="pmid">39969847</pub-id></mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hoff</surname> <given-names>K. J.</given-names></name> <name><surname>Lingner</surname> <given-names>T.</given-names></name> <name><surname>Meinicke</surname> <given-names>P.</given-names></name> <name><surname>Tech</surname> <given-names>M.</given-names></name></person-group> (<year>2009</year>). <article-title>Orphelia: predicting genes in metagenomic sequencing reads</article-title>. <source>Nucleic Acids Res</source>. <volume>37</volume>, <fpage>W101</fpage>&#x02013;<lpage>W105</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkp327</pub-id><pub-id pub-id-type="pmid">19429689</pub-id></mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hoff</surname> <given-names>K. J.</given-names></name> <name><surname>Tech</surname> <given-names>M.</given-names></name> <name><surname>Lingner</surname> <given-names>T.</given-names></name> <name><surname>Daniel</surname> <given-names>R.</given-names></name> <name><surname>Morgenstern</surname> <given-names>B.</given-names></name> <name><surname>Meinicke</surname> <given-names>P.</given-names></name></person-group> (<year>2008</year>). <article-title>Gene prediction in metagenomic fragments: a large scale machine learning approach</article-title>. <source>BMC Bioinform</source>. <volume>9</volume>:<fpage>217</fpage>. doi: <pub-id pub-id-type="doi">10.1186/1471-2105-9-217</pub-id><pub-id pub-id-type="pmid">18442389</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Holst</surname> <given-names>F.</given-names></name> <name><surname>Bolger</surname> <given-names>A. M.</given-names></name> <name><surname>Kindel</surname> <given-names>F.</given-names></name> <name><surname>G&#x000FC;nther</surname> <given-names>C.</given-names></name> <name><surname>Ma&#x000DF;</surname> <given-names>J.</given-names></name> <name><surname>Triesch</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>Helixer: ab initio prediction of primary eukaryotic gene models combining deep learning and a hidden Markov model</article-title>. <source>Nat. Methods</source> <volume>2025</volume>, <fpage>1</fpage>&#x02013;<lpage>8</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41592-025-02939-1</pub-id><pub-id pub-id-type="pmid">41286201</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hyatt</surname> <given-names>D.</given-names></name> <name><surname>Chen</surname> <given-names>G.-L.</given-names></name> <name><surname>LoCascio</surname> <given-names>P. F.</given-names></name> <name><surname>Land</surname> <given-names>M. L.</given-names></name> <name><surname>Larimer</surname> <given-names>F. W.</given-names></name> <name><surname>Hauser</surname> <given-names>L. J.</given-names></name></person-group> (<year>2010</year>). <article-title>Prodigal: prokaryotic gene recognition and translation initiation site identification</article-title>. <source>BMC Bioinform</source>. <volume>11</volume>, <fpage>1</fpage>&#x02013;<lpage>11</lpage>. doi: <pub-id pub-id-type="doi">10.1186/1471-2105-11-119</pub-id><pub-id pub-id-type="pmid">20211023</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jhong</surname> <given-names>J.-H.</given-names></name> <name><surname>Chi</surname> <given-names>Y.-H.</given-names></name> <name><surname>Li</surname> <given-names>W.-C.</given-names></name> <name><surname>Lin</surname> <given-names>T.-H.</given-names></name> <name><surname>Huang</surname> <given-names>K.-Y.</given-names></name> <name><surname>Lee</surname> <given-names>T.-Y.</given-names></name></person-group> (<year>2018</year>). <article-title>dbAMP: an integrated resource for exploring antimicrobial peptides with functional activities and physicochemical properties on transcriptome and proteome data</article-title>. <source>Nucleic Acids Res</source>. <volume>47</volume>, <fpage>D285</fpage>&#x02013;<lpage>D297</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gky1030</pub-id><pub-id pub-id-type="pmid">30380085</pub-id></mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kang</surname> <given-names>X.</given-names></name> <name><surname>Dong</surname> <given-names>F.</given-names></name> <name><surname>Shi</surname> <given-names>C.</given-names></name> <name><surname>Liu</surname> <given-names>S.</given-names></name> <name><surname>Sun</surname> <given-names>J.</given-names></name> <name><surname>Chen</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>DRAMP 2.0, an updated data repository of antimicrobial peptides</article-title>. <source>Sci. Data</source> <volume>6</volume>:<fpage>148</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41597-019-0154-y</pub-id><pub-id pub-id-type="pmid">31409791</pub-id></mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Karag&#x000F6;z</surname> <given-names>M. A.</given-names></name> <name><surname>Nalbantoglu</surname> <given-names>O. U.</given-names></name></person-group> (<year>2021</year>). <article-title>Taxonomic classification of metagenomic sequences from Relative Abundance Index profiles using deep learning</article-title>. <source>Biomed. Signal Process. Control</source> <volume>67</volume>:<fpage>102539</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.bspc.2021.102539</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kelley</surname> <given-names>D. R.</given-names></name> <name><surname>Liu</surname> <given-names>B.</given-names></name> <name><surname>Delcher</surname> <given-names>A. L.</given-names></name> <name><surname>Pop</surname> <given-names>M.</given-names></name> <name><surname>Salzberg</surname> <given-names>S. L.</given-names></name></person-group> (<year>2011</year>). <article-title>Gene prediction with Glimmer for metagenomic sequences augmented by classification and clustering</article-title>. <source>Nucleic Acids Res</source>. <volume>40</volume>, <fpage>e9</fpage>&#x02013;<lpage>e9</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkr1067</pub-id><pub-id pub-id-type="pmid">22102569</pub-id></mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Larsen</surname> <given-names>T. S.</given-names></name> <name><surname>Krogh</surname> <given-names>A.</given-names></name></person-group> (<year>2003</year>). <article-title>EasyGene &#x02013; a prokaryotic gene finder that ranks ORFs by statistical significance</article-title>. <source>BMC Bioinform</source>. <volume>4</volume>:<fpage>21</fpage>. doi: <pub-id pub-id-type="doi">10.1186/1471-2105-4-21</pub-id></mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lertampaiporn</surname> <given-names>S.</given-names></name> <name><surname>Vorapreeda</surname> <given-names>T.</given-names></name> <name><surname>Hongsthong</surname> <given-names>A.</given-names></name> <name><surname>Thammarongtham</surname> <given-names>C.</given-names></name></person-group> (<year>2021</year>). <article-title>Ensemble-AMPPred: robust AMP Prediction and recognition using the ensemble learning method with a new hybrid feature for differentiating AMPs</article-title>. <source>Genes</source> <volume>12</volume>:<fpage>137</fpage>. doi: <pub-id pub-id-type="doi">10.3390/genes12020137</pub-id><pub-id pub-id-type="pmid">33494403</pub-id></mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>C.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name> <name><surname>Jia</surname> <given-names>C.</given-names></name> <name><surname>Zheng</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>AMPpred-MFA: an interpretable antimicrobial peptide predictor with a stacking architecture, multiple features, and multihead attention</article-title>. <source>J. Chem. Inf. Model</source>. <volume>64</volume>, <fpage>2393</fpage>&#x02013;<lpage>2404</lpage>. doi: <pub-id pub-id-type="doi">10.1021/acs.jcim.3c01017</pub-id><pub-id pub-id-type="pmid">37799091</pub-id></mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Guo</surname> <given-names>J.</given-names></name> <name><surname>Hu</surname> <given-names>G.</given-names></name> <name><surname>Zhu</surname> <given-names>H.</given-names></name></person-group> (<year>2013</year>). <article-title>Gene prediction in metagenomic fragments based on the SVM algorithm</article-title>. <source>BMC Bioinform</source>. <volume>14</volume>:<fpage>S5</fpage>. doi: <pub-id pub-id-type="doi">10.1186/1471-2105-14-S5-S12</pub-id><pub-id pub-id-type="pmid">23735199</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Yan</surname> <given-names>K.</given-names></name> <name><surname>Guo</surname> <given-names>Y.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name> <name><surname>Hesham</surname> <given-names>A. E.-L.</given-names></name> <name><surname>Liu</surname> <given-names>B.</given-names></name></person-group> (<year>2022</year>). <article-title>AMPpred-EL: an effective antimicrobial peptide prediction model based on ensemble learning</article-title>. <source>Comput. Biol. Med</source>. <volume>146</volume>:<fpage>105577</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.105577</pub-id><pub-id pub-id-type="pmid">35576825</pub-id></mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lv</surname> <given-names>Z.</given-names></name> <name><surname>Cui</surname> <given-names>F.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Xu</surname> <given-names>L.</given-names></name></person-group> (<year>2021</year>). <article-title>Anticancer peptides prediction with deep representation learning features</article-title>. <source>Brief. Bioinform</source>. <volume>22</volume>:<fpage>bbab008</fpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbab008</pub-id><pub-id pub-id-type="pmid">33529337</pub-id></mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>Y.</given-names></name> <name><surname>Guo</surname> <given-names>Z.</given-names></name> <name><surname>Xia</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Yu</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Identification of antimicrobial peptides from the human gut microbiome using deep learning</article-title>. <source>Nat. Biotechnol</source>. <volume>40</volume>, <fpage>921</fpage>&#x02013;<lpage>931</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41587-022-01226-0</pub-id><pub-id pub-id-type="pmid">35241840</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mc Neil</surname> <given-names>V.</given-names></name> <name><surname>Lee</surname> <given-names>S. W.</given-names></name></person-group> (<year>2025</year>). <article-title>Advancing cancer treatment: a review of immune checkpoint inhibitors and combination strategies</article-title>. <source>Cancers</source> <volume>17</volume>:<fpage>1408</fpage>. doi: <pub-id pub-id-type="doi">10.3390/cancers17091408</pub-id><pub-id pub-id-type="pmid">40361336</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Noguchi</surname> <given-names>H.</given-names></name> <name><surname>Taniguchi</surname> <given-names>T.</given-names></name> <name><surname>Itoh</surname> <given-names>T.</given-names></name></person-group> (<year>2008</year>). <article-title>MetaGeneAnnotator: detecting species-specific patterns of ribosomal binding site for precise gene prediction in anonymous prokaryotic and phage genomes</article-title>. <source>DNA Res</source>. <volume>15</volume>, <fpage>387</fpage>&#x02013;<lpage>396</lpage>. doi: <pub-id pub-id-type="doi">10.1093/dnares/dsn027</pub-id><pub-id pub-id-type="pmid">18940874</pub-id></mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Porto</surname> <given-names>W. F.</given-names></name> <name><surname>Pires</surname> <given-names>&#x000C1;. S.</given-names></name> <name><surname>Franco</surname> <given-names>O. L.</given-names></name></person-group> (<year>2012</year>). <article-title>CS-AMPPred: an updated SVM model for antimicrobial activity prediction in cysteine-stabilized peptides</article-title>. <source>PLoS ONE</source> <volume>7</volume>:<fpage>e51444</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0051444</pub-id><pub-id pub-id-type="pmid">23240023</pub-id></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rao</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name></person-group> (<year>2020</year>). <article-title>ACP-GCN: the identification of anticancer peptides based on graph convolution networks</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>176005</fpage>&#x02013;<lpage>176011</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2020.3023800</pub-id></mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rao</surname> <given-names>B.</given-names></name> <name><surname>Zhou</surname> <given-names>C.</given-names></name> <name><surname>Zhang</surname> <given-names>G.</given-names></name> <name><surname>Su</surname> <given-names>R.</given-names></name> <name><surname>Wei</surname> <given-names>L.</given-names></name></person-group> (<year>2019</year>). <article-title>ACPred-Fuse: fusing multi-view information improves the prediction of anticancer peptides</article-title>. <source>Brief. Bioinform</source>. <volume>21</volume>, <fpage>1846</fpage>&#x02013;<lpage>1855</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bib/bbz088</pub-id><pub-id pub-id-type="pmid">31729528</pub-id></mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Rho</surname> <given-names>M.</given-names></name> <name><surname>Tang</surname> <given-names>H.</given-names></name> <name><surname>Ye</surname> <given-names>Y.</given-names></name></person-group> (<year>2010</year>). <article-title>FragGeneScan: predicting genes in short and error-prone reads</article-title>. <source>Nucleic Acids Res</source>. <volume>38</volume>, <fpage>e191</fpage>&#x02013;<lpage>e191</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkq747</pub-id><pub-id pub-id-type="pmid">20805240</pub-id></mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>M.</given-names></name> <name><surname>Hu</surname> <given-names>H.</given-names></name> <name><surname>Pang</surname> <given-names>W.</given-names></name> <name><surname>Zhou</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>ACP-BC: a model for accurate identification of anticancer peptides based on fusion features of bidirectional long short-term memory and chemically derived information</article-title>. <source>Int. J. Mol. Sci</source>. <volume>24</volume>:<fpage>15447</fpage>. doi: <pub-id pub-id-type="doi">10.3390/ijms242015447</pub-id><pub-id pub-id-type="pmid">37895128</pub-id></mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Veltri</surname> <given-names>D.</given-names></name> <name><surname>Kamath</surname> <given-names>U.</given-names></name> <name><surname>Shehu</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>Deep learning improves antimicrobial peptide recognition</article-title>. <source>Bioinformatics</source> <volume>34</volume>, <fpage>2740</fpage>&#x02013;<lpage>2747</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/bty179</pub-id><pub-id pub-id-type="pmid">29590297</pub-id></mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Waghu</surname> <given-names>F. H.</given-names></name> <name><surname>Barai</surname> <given-names>R. S.</given-names></name> <name><surname>Gurung</surname> <given-names>P.</given-names></name> <name><surname>Idicula-Thomas</surname> <given-names>S.</given-names></name></person-group> (<year>2015</year>). <article-title>CAMPR3: a database on sequences, structures and signatures of antimicrobial peptides</article-title>. <source>Nucleic Acids Res</source>. <volume>44</volume>, <fpage>D1094</fpage>&#x02013;<lpage>D1097</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkv1051</pub-id><pub-id pub-id-type="pmid">26467475</pub-id></mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Z.</given-names></name></person-group> (<year>2004</year>). <article-title>APD: the antimicrobial peptide database</article-title>. <source>Nucleic Acids Res</source>. <volume>32</volume>, <fpage>D590</fpage>&#x02013;<lpage>D592</lpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/gkh025</pub-id><pub-id pub-id-type="pmid">14681488</pub-id></mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>L.</given-names></name> <name><surname>Zhou</surname> <given-names>C.</given-names></name> <name><surname>Su</surname> <given-names>R.</given-names></name> <name><surname>Zou</surname> <given-names>Q.</given-names></name></person-group> (<year>2019</year>). <article-title>PEPred-Suite: improved and robust prediction of therapeutic peptides using adaptive feature representation learning</article-title>. <source>Bioinformatics</source> <volume>35</volume>, <fpage>4272</fpage>&#x02013;<lpage>4280</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btz246</pub-id><pub-id pub-id-type="pmid">30994882</pub-id></mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>X.</given-names></name> <name><surname>Li</surname> <given-names>C.</given-names></name> <name><surname>Yuan</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>Q.</given-names></name> <name><surname>Liu</surname> <given-names>Y.</given-names></name> <name><surname>Zhu</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>ACP-DRL: an anticancer peptides recognition method based on deep representation learning</article-title>. <source>Front. Genet</source>. <volume>15</volume>:<fpage>1376486</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fgene.2024.1376486</pub-id><pub-id pub-id-type="pmid">38655048</pub-id></mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yan</surname> <given-names>K.</given-names></name> <name><surname>Lv</surname> <given-names>H.</given-names></name> <name><surname>Guo</surname> <given-names>Y.</given-names></name> <name><surname>Peng</surname> <given-names>W.</given-names></name> <name><surname>Liu</surname> <given-names>B.</given-names></name></person-group> (<year>2022</year>). <article-title>sAMPpred-GAT: prediction of antimicrobial peptide by graph attention network and predicted peptide structure</article-title>. <source>Bioinformatics</source> <volume>39</volume>:<fpage>btac715</fpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btac715</pub-id><pub-id pub-id-type="pmid">36342186</pub-id></mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>S.-W.</given-names></name> <name><surname>Jin</surname> <given-names>X.-Y.</given-names></name> <name><surname>Zhang</surname> <given-names>T.</given-names></name></person-group> (<year>2017</year>). <article-title>Gene prediction in metagenomic fragments with deep learning</article-title>. <source>Biomed Res. Int</source>. <volume>2017</volume>, <fpage>1</fpage>&#x02013;<lpage>9</lpage>. doi: <pub-id pub-id-type="doi">10.1155/2017/4740354</pub-id><pub-id pub-id-type="pmid">29250541</pub-id></mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>H.</given-names></name> <name><surname>Kang</surname> <given-names>L.</given-names></name> <name><surname>Gao</surname> <given-names>W.</given-names></name> <name><surname>Lu</surname> <given-names>Q.</given-names></name> <name><surname>Rao</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2025</year>). <article-title>deep-AMPpred: a deep learning method for identifying antimicrobial peptides and their functional activities</article-title>. <source>J. Chem. Inf. Model</source>. <volume>65</volume>, <fpage>997</fpage>&#x02013;<lpage>1008</lpage>. doi: <pub-id pub-id-type="doi">10.1021/acs.jcim.4c01913</pub-id><pub-id pub-id-type="pmid">39792442</pub-id></mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>J.</given-names></name> <name><surname>Troyanskaya</surname> <given-names>O. G.</given-names></name></person-group> (<year>2015</year>). <article-title>Predicting effects of noncoding variants with deep learning-based sequence model</article-title>. <source>Nat. Methods</source> <volume>12</volume>, <fpage>931</fpage>&#x02013;<lpage>934</lpage>. doi: <pub-id pub-id-type="doi">10.1038/nmeth.3547</pub-id><pub-id pub-id-type="pmid">26301843</pub-id></mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>L.</given-names></name> <name><surname>Ye</surname> <given-names>C.</given-names></name> <name><surname>Hu</surname> <given-names>X.</given-names></name> <name><surname>Yang</surname> <given-names>S.</given-names></name> <name><surname>Zhu</surname> <given-names>C.</given-names></name></person-group> (<year>2022</year>). <article-title>ACP-check: an anticancer peptide prediction model based on bidirectional long short-term memory and multi-features fusion strategy</article-title>. <source>Comput. Biol. Med</source>. <volume>148</volume>:<fpage>105868</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.105868</pub-id><pub-id pub-id-type="pmid">35868046</pub-id></mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/916147/overview">Bin Wei</ext-link>, Zhejiang University of Technology, China</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2058535/overview">Seung Won Lee</ext-link>, Sungkyunkwan University, Republic of Korea</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3286092/overview">Wenxuan Xing</ext-link>, Beijing Institute of Technology, China</p>
</fn>
</fn-group>
</back>
</article>