<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1513201</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2024.1513201</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>S-DCNN: prediction of ATP binding residues by deep convolutional neural network based on SMOTE</article-title>
<alt-title alt-title-type="left-running-head">Hao et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fgene.2024.1513201">10.3389/fgene.2024.1513201</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Hao</surname>
<given-names>Sixi</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1859274/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Cai-Yan</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2640029/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Hu</surname>
<given-names>Xiuzhen</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1513889/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Feng</surname>
<given-names>Zhenxing</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/820889/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhang</surname>
<given-names>Gaimei</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yang</surname>
<given-names>Caiyun</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Huimin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>College of Sciences</institution>, <institution>Inner Mongolia University of Technology</institution>, <addr-line>Hohhot</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Mathematics and Statistics</institution>, <institution>Xinyang College</institution>, <addr-line>Xinyang</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>School of Computer Science and Technology/Baotou Medical College</institution>, <addr-line>Baotou</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Department of Obstetrics and Gynecology</institution>, <institution>Hohhot First Hospital</institution>, <addr-line>Hohhot</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/778584/overview">Pu-Feng Du</ext-link>, Tianjin University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2537531/overview">Alberto Cabri</ext-link>, University of Milan, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/182351/overview">Hao Lin</ext-link>, University of Electronic Science and Technology of China, China</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2886688/overview">Qiwen Dong</ext-link>, East China Normal University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Xiuzhen Hu, <email>hxz@imut.edu.cn</email>; Zhenxing Feng, <email>zxfeng@imut.edu.cn</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>06</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>15</volume>
<elocation-id>1513201</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>10</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>12</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Hao, Li, Hu, Feng, Zhang, Yang and Hu.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Hao, Li, Hu, Feng, Zhang, Yang and Hu</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Background</title>
<p>The realization of many protein functions requires binding with ligands. As a significant protein-binding ligand, ATP plays a crucial role in various biological processes. Currently, the precise prediction of ATP binding residues remains challenging.</p>
</sec>
<sec>
<title>Methods</title>
<p>Based on the sequence information, this paper introduces a method called S-DCNN for predicting ATP binding residues, utilizing a deep convolutional neural network (DCNN) enhanced with the synthetic minority over-sampling technique (SMOTE).</p>
</sec>
<sec>
<title>Results</title>
<p>The incorporation of additional feature parameters such as dihedral angles, energy, and propensity factors into the standard parameter set resulted in a significant enhancement in prediction accuracy on the ATP-289 dataset. The S-DCNN achieved the highest Matthews correlation coefficient value of 0.5031 and an accuracy rate of 97.06% on an independent test set. Furthermore, when applied to the ATP-221 and ATP-388 datasets for validation, the S-DCNN outperformed existing methods on ATP-221 and performed comparably to other methods on ATP-388 during independent testing.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>Our experimental results underscore the efficacy of the S-DCNN in accurately predicting ATP binding residues, establishing it as a potent tool in the prediction of ATP binding residues.</p>
</sec>
</abstract>
<kwd-group>
<kwd>ATP binding residues</kwd>
<kwd>synthetic minority over-sampling technique</kwd>
<kwd>deep convolutional neural network</kwd>
<kwd>propensity factors</kwd>
<kwd>dihedral angle</kwd>
<kwd>energy</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational Genomics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Introduction</title>
<p>Adenosine 5&#x2032;-triphosphate (ATP) is an unstable high-energy phosphate compound. It interconverts with adenosine 5&#x2032;-diphosphate (ADP) to achieve energy storage and release in cells and ensures the energy supply of various life activities of cells. As an important ligand, ATP also plays a critical role in the realization of protein functions (<xref ref-type="bibr" rid="B1">Chauhan et al., 2009</xref>). For example, ATP binds with myosin to provide energy and promotes its combination with actin to form cross bridges, which is used to regulate muscle contraction (<xref ref-type="bibr" rid="B9">Holmes et al., 2003</xref>); the combination of ATP and sodium-potassium ATPase can regulate the concentration of intracellular sodium/potassium ions, thus maintaining the resting potential of the cell (<xref ref-type="bibr" rid="B14">Kanai et al., 2013</xref>). In fact, the protein-ATP interactions depend on ATP binding residues on proteins. Therefore, accurate prediction of ATP binding residues is of great value for understanding protein function, disease occurrence and molecular drug design.</p>
<p>At present, substantial advancements have been achieved in the prediction of ATP binding residues from protein sequences (<xref ref-type="bibr" rid="B8">Guo et al., 2005</xref>; <xref ref-type="bibr" rid="B1">Chauhan et al., 2009</xref>; <xref ref-type="bibr" rid="B21">Song et al., 2020b</xref>). In 2017, <xref ref-type="bibr" rid="B7">Ding et al. (2017)</xref> utilized the ATP-221 dataset devised by <xref ref-type="bibr" rid="B27">Yu et al. (2013)</xref>, extracting features from the discrete cosine transform of the position-specific scoring matrix (PSSM) and predicted relative solvent accessibility. They employed the random under-sampling (RUS) technique and weighted sparse representation-based classifier (WSRC) to predict protein-ATP binding sites, achieving a Matthews correlation coefficient (MCC) of 0.506 and an accuracy (ACC) of 96.8% on an independent test set. Similarly, in 2019, <xref ref-type="bibr" rid="B29">Zhao et al. (2019)</xref> introduced an SXGBsite prediction model, utilizing PSSM and predicted relative solvent accessibility as parameters with the extreme gradient boosting algorithm. The prediction performance yielded MCC and ACC values of 0.463 and 96.5%, respectively, on the independent test set. In the same year, <xref ref-type="bibr" rid="B18">Nguyen et al. (2019)</xref> developed DeepATP, a tool for predicting ATP-binding sites in membrane proteins, which combined evolutionary information in the form of PSSM with a two-dimensional convolutional neural network. In 2020, our research group (<xref ref-type="bibr" rid="B12">Hu et al., 2020</xref>) constructed a new dataset, ATP-289, and selected amino acids, hydrophilic-hydrophobic, polarity, predicted secondary structure, and relative solvent accessibility as feature parameters. By utilizing random undersampling with the support vector machine (SVM) algorithm, the MCC value reached 0.549 with 5-fold cross-validation. Additionally, <xref ref-type="bibr" rid="B20">Song et al. (2020a)</xref> utilized the ATP-388 dataset, choosing PSSM, predicted secondary structure, predicted relative solvent accessibility, and one-hot encoding as feature parameters. They applied class-weighted ensemble deep learning algorithms, achieving ACC and MCC values of 97.2% and 0.626, respectively, on the independent test set. In 2021, <xref ref-type="bibr" rid="B11">Hu et al. (2021)</xref> introduced the novel method DeepATPseq, which achieved ACC and MCC values of 57.42% and 0.655, respectively, on the independent ATP-388 test set. <xref ref-type="bibr" rid="B17">Nguyen et al. (2022)</xref> applied multiple convolutional window scanning filters of a convolutional neural network on PSSM to predict ATP-binding sites, and the resulting model outperformed other algorithms on the same datasets.</p>
<p>In summary, previous studies have primarily enhanced the prediction accuracy of ATP binding residues in three main areas. Firstly, sampling techniques were frequently applied to address the significant imbalance between positive and negative samples. Secondly, novel feature parameters and extraction methods were integrated into the prediction models. Lastly, a variety of traditional machine learning algorithms and deep learning methods were utilized for prediction tasks.</p>
<p>This study introduced the S-DCNN method to enhance the accuracy of predicting ATP binding residues. A balanced dataset was created using the SMOTE algorithm, which preserved information integrity. New parameters, including dihedral angles, energy, and propensity factors, were introduced. Furthermore, the DCNN algorithm with three optimized hyperparameters enhanced the prediction of ATP binding residues. Finally, the S-DCNN was applied to two additional datasets to validate the model.</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>Materials and methods</title>
<sec id="s2-1">
<title>Datasets</title>
<p>The ATP dataset utilized in this study was constructed by our group (<xref ref-type="bibr" rid="B12">Hu et al., 2020</xref>) through the following steps: initially, 1728 ATP protein chains were sourced from the semi-manual BioLip database (<xref ref-type="bibr" rid="B26">Yang et al., 2013</xref>); subsequently, these chains were filtered by sequence length (&#x3e;50 residues), resolution (&#x3c;3&#xa0;&#xc5;), and sequence identity (&#x3c;30%), resulting in the ATP-289 dataset of 289 protein chains, comprising 3901 ATP binding residues and 104153 ATP non-binding residues; the dataset was then partitioned randomly into training and testing sets, with the former containing 260 protein chains encompassing 3526 ATP binding residues and 92804 non-binding residues, and the latter comprising 29 protein chains with 375 ATP binding residues and 11349 ATP non-binding residues. We performed five-fold cross-validation using the training set to obtain the trained model and validated the model&#x2019;s effectiveness using an independent test set. The source codes and datasets in this study are available at <ext-link ext-link-type="uri" xlink:href="https://github.com/tlhsx/S-DCNN">https://github.com/tlhsx/S-DCNN</ext-link>.</p>
<p>Previous research has indicated that residues neighboring binding sites can influence ligand interactions with these sites (<xref ref-type="bibr" rid="B13">Hu et al., 2016</xref>; <xref ref-type="bibr" rid="B16">Liu et al., 2019</xref>). To address this, protein sequences were segmented into fragments using the sliding window method, ensuring each amino acid resided at the fragment center by adding (L &#x2212; 1)/2 pseudo-amino acids at both sequence ends. Here, L denotes the fragment length. If the residue at position (L &#x2b; 1)/2 of a fragment, i.e., its central residue, was a binding residue, the fragment was defined as a positive sample; otherwise, it was defined as a negative sample. Following previous studies (<xref ref-type="bibr" rid="B27">Yu et al., 2013</xref>; <xref ref-type="bibr" rid="B7">Ding et al., 2017</xref>; <xref ref-type="bibr" rid="B29">Zhao et al., 2019</xref>; <xref ref-type="bibr" rid="B12">Hu et al., 2020</xref>; <xref ref-type="bibr" rid="B11">Hu et al., 2021</xref>), the fragment length L was set to 17.</p>
</sec>
<sec id="s2-2">
<title>Statistical analysis and reclassification of predicted dihedral angle</title>
<p>The secondary structure of proteins can reflect the trend of the backbone chain, and the dihedral angle is the main descriptor of the secondary structure, which can reflect the local structural information of proteins and is a very effective feature for predicting protein-ligand binding residues (<xref ref-type="bibr" rid="B3">Chen et al., 2011</xref>; <xref ref-type="bibr" rid="B6">Cui et al., 2019</xref>; <xref ref-type="bibr" rid="B15">Liu et al., 2020</xref>). Here, for the first time, we applied reclassified dihedral angles to the prediction of ATP binding residues. First, the values of the phi (&#x3c6;) and psi (&#x3c8;) angles were obtained from the primary sequence using ANGLOR software (<xref ref-type="bibr" rid="B24">Wu and Zhang, 2008</xref>), with both angles ranging over [&#x2212;180&#xb0;, 180&#xb0;]; this range was then divided into 24 intervals of 15&#xb0; each for both the &#x3c6; and &#x3c8; angles; finally, the difference in probability of the &#x3c6; and &#x3c8; angles between the positive and negative samples was calculated for each interval, as shown in <xref ref-type="fig" rid="F1">Figures 1A, B</xref>. The formula for calculating the probability difference is expressed in <xref ref-type="disp-formula" rid="e1">Equation 1</xref> as follows:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:mo>&#x394;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2212;</mml:mo>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where, <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>24</mml:mn>
</mml:msubsup>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> <italic>n</italic>
<sub>
<italic>ij</italic>
</sub> represents the number of samples whose angle falls in the <italic>i</italic>
<sup>th</sup> interval among the positive or negative samples; <italic>i</italic> (<italic>i</italic> &#x3d; 1, 2, &#x2026; , 24) indexes the divided intervals; <italic>j</italic> (&#x2b; or &#x2212;) denotes the positive or negative samples.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>The difference value of probability between positive and negative samples. Note: <bold>(A, B)</bold> represent the &#x3c6; and &#x3c8; angles, respectively. The <italic>x</italic>-axis represents the divided 24 intervals, the <italic>y</italic>-axis represents the difference value of probability between positive and negative samples.</p>
</caption>
<graphic xlink:href="fgene-15-1513201-g001.tif"/>
</fig>
<p>In <xref ref-type="fig" rid="F1">Figure 1</xref>, it was found that there exist significant differences in the probabilities of the &#x3c6; and &#x3c8; angles between the positive and negative samples. Using 0 as the threshold, we divided the &#x3c6; and &#x3c8; angles into three intervals, which were represented by functions g(x) and h(x) (i.e., <xref ref-type="disp-formula" rid="e2">Equations 2</xref>, <xref ref-type="disp-formula" rid="e3">3</xref>). Through the above analysis, we selected the reclassification information of the &#x3c6; and &#x3c8; angles as features.<disp-formula id="e2">
<mml:math id="m3">
<mml:mrow>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="&#x7c;">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>I</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mn>180</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mn>90</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi mathvariant="italic">II</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="(" close="]" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mn>90</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mn>60</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi mathvariant="italic">III</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="(" close="]" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mn>60</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mn>180</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
<disp-formula id="e3">
<mml:math id="m4">
<mml:mrow>
<mml:mi mathvariant="normal">h</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi mathvariant="normal">x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="" separators="&#x7c;">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi mathvariant="italic">I</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mn>180</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mn>60</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi mathvariant="italic">II</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="(" close="]" separators="&#x7c;">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mn>60</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mn>0</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi mathvariant="italic">III</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi mathvariant="italic">x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="(" close="]" separators="&#x7c;">
<mml:mrow>
<mml:msup>
<mml:mn>0</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mn>180</mml:mn>
<mml:mo>&#x2218;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
</sec>
<sec id="s2-3">
<title>Statistical analysis and reclassification of energy values</title>
<p>In accordance with the principles of physics, the stability of molecular structures increases as energy decreases (<xref ref-type="bibr" rid="B23">Wang et al., 2021</xref>). In consideration of the specificity of ATP binding to proteins, we analyzed the Laplace energy values of the 20 amino acids between positive and negative samples in <xref ref-type="fig" rid="F2">Figure 2</xref>. The analysis revealed varying energy probabilities among the 20 amino acids between the positive and negative samples. Consequently, the amino acids were regrouped into four categories: the first group comprised G, I, S, T and V, with markedly higher values in the positive set than in the negative set; the second group included C, H and M, where the positive set&#x2019;s values slightly surpassed those of the negative set; the third group encompassed F, K, N, R, W and Y, in which the values of the negative set were slightly higher than those of the positive set; the fourth group consisted of A, D, E, L, P and Q, with notably higher values in the negative set than in the positive set. Subsequently, the energy reclassification details were utilized as feature parameters for ATP binding residue identification.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Energy probability of 20 amino acids in positive and negative samples. Note: The <italic>x</italic>-axis represents 20 amino acids, the <italic>y</italic>-axis represents the probability value, and P and N represent the positive and negative samples, respectively.</p>
</caption>
<graphic xlink:href="fgene-15-1513201-g002.tif"/>
</fig>
</sec>
<sec id="s2-4">
<title>Propensity factors feature</title>
<p>Researchers have analyzed the influence of binding residues and their neighboring residues on the protein-ATP binding process at the sequence fragment level. In protein-ligand interactions, the amino acids&#x2019; specific preferences in crucial binding residues that directly engage with the ligand play a vital role in the binding process. Hence, we introduced a novel parameter extraction method termed propensity factors. Originally suggested by <xref ref-type="bibr" rid="B4">Chou and Fasman (1974)</xref>, propensity factors have found utility in predicting protein secondary structures and ion ligand-binding sites (<xref ref-type="bibr" rid="B5">Chou and Fasman, 1979</xref>; <xref ref-type="bibr" rid="B25">Xu et al., 2022</xref>). The formula was expressed in <xref ref-type="disp-formula" rid="e4">Equation 4</xref> as follows:<disp-formula id="e4">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where, <inline-formula id="inf2">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf3">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf4">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf5">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>t</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <italic>n</italic>
<sub>
<italic>ij</italic>
</sub> represents the number of amino acid <italic>i</italic> in binding residues or non-binding residues; <italic>N</italic>
<sub>
<italic>j</italic>
</sub> represents the number of binding residues or non-binding residues; <italic>i</italic> (<italic>i</italic> &#x3d; 1, 2, &#x2026; , 20) represents 20 amino acids; <italic>j</italic> (<italic>j</italic> &#x3d; 1, 2) represents binding residues and non-binding residues. The propensity factors of 20 amino acids were statistically analyzed, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>The propensity factors of 20 amino acids of binding residues and non-binding residues. Note: letters on the radius represent 20 amino acids; red triangles and blue dots represent amino acid propensity factor values of binding residues and non-binding residues, respectively.</p>
</caption>
<graphic xlink:href="fgene-15-1513201-g003.tif"/>
</fig>
<p>The propensity factors of amino acids D, G, H, K, R, S, and T were notably higher in the binding residues than in the non-binding residues. Hence, the propensity factors, serving as a novel extraction method, effectively capture the preferences of the binding residues.</p>
</sec>
<sec id="s2-5">
<title>Basic features</title>
<p>Utilizing sequence information, we extracted amino acids and derived predicted secondary structure information, relative solvent accessibility, and the hydrophilic-hydrophobic profile as fundamental features. These parameters, extensively employed in prior research, have demonstrated exceptional predictive capabilities (<xref ref-type="bibr" rid="B3">Chen et al., 2011</xref>; <xref ref-type="bibr" rid="B28">Zhang et al., 2012</xref>; <xref ref-type="bibr" rid="B27">Yu et al., 2013</xref>; <xref ref-type="bibr" rid="B13">Hu et al., 2016</xref>; <xref ref-type="bibr" rid="B7">Ding et al., 2017</xref>; <xref ref-type="bibr" rid="B29">Zhao et al., 2019</xref>; <xref ref-type="bibr" rid="B12">Hu et al., 2020</xref>; <xref ref-type="bibr" rid="B20">Song et al., 2020a</xref>; <xref ref-type="bibr" rid="B11">Hu et al., 2021</xref>). The hydrophilic-hydrophobic properties were used to classify the 20 amino acids into six distinct categories (<xref ref-type="bibr" rid="B19">P&#xe1;nek et al., 2005</xref>). Secondary structure and solvent accessibility predictions were generated through ANGLOR software, categorizing secondary structure into &#x3b1;-helix, &#x3b2;-sheet and coil. Following our previous work (<xref ref-type="bibr" rid="B12">Hu et al., 2020</xref>), relative solvent accessibility predictions were partitioned into four intervals: (0, 0.2], (0.2, 0.45], (0.45, 0.6], (0.6, 0.85].</p>
</sec>
<sec id="s2-6">
<title>Composition and site conservation information</title>
<p>Previous studies observed significant disparities in amino acid frequencies between positive and negative samples, prompting the utilization of amino acid composition as a parameter (<xref ref-type="bibr" rid="B12">Hu et al., 2020</xref>; <xref ref-type="bibr" rid="B23">Wang et al., 2021</xref>; <xref ref-type="bibr" rid="B22">Sun et al., 2022</xref>). Here, from the amino acid composition, secondary structure, relative solvent accessibility, &#x3c6; angle, &#x3c8; angle and energy, we extracted 21, 4, 5, 4, 4 and 5-dimensional composition information, respectively.</p>
<p>Previous studies have shown that the position weight matrix can well reflect the site conservation of amino acids in protein sequences (<xref ref-type="bibr" rid="B12">Hu et al., 2020</xref>; <xref ref-type="bibr" rid="B25">Xu et al., 2022</xref>). Here, the matrix elements were expressed in <xref ref-type="disp-formula" rid="e5">Equation 5</xref> as follows:<disp-formula id="e5">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>ln</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>where, <inline-formula id="inf6">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msqrt>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msqrt>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msqrt>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msqrt>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf7">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>21</mml:mn>
</mml:msubsup>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <italic>p</italic>
<sub>
<italic>0,j</italic>
</sub> represents the background probability, and <italic>n</italic>
<sub>
<italic>i,j</italic>
</sub> represents the frequency of the <italic>j</italic>
<sup>th</sup> amino acid at the <italic>i</italic>
<sup>th</sup> site, <italic>j</italic> indexes the 20 kinds of amino acids and the vacancy, and <italic>q</italic> represents the number of classifications, which is 21 here. Two standard scoring matrices were obtained from the positive and negative training sets, and a 2L-dimensional feature vector was obtained for each segment. Similarly, the predicted secondary structure (<italic>q</italic> &#x3d; 4), relative solvent accessibility (<italic>q</italic> &#x3d; 5), energy (<italic>q</italic> &#x3d; 5), &#x3c6; angle (<italic>q</italic> &#x3d; 4) and &#x3c8; angle (<italic>q</italic> &#x3d; 4) were also extracted by the same method, and a total of 6 &#xd7; 2L-dimensional site conservation information was obtained.</p>
</sec>
<sec id="s2-7">
<title>Information entropy</title>
<p>The intermolecular hydrophobic effect is a complex process that is mainly determined by the entropy effect (<xref ref-type="bibr" rid="B23">Wang et al., 2021</xref>). Information entropy is an effective method for extracting hydrophilic-hydrophobic information (<xref ref-type="bibr" rid="B23">Wang et al., 2021</xref>; <xref ref-type="bibr" rid="B22">Sun et al., 2022</xref>; <xref ref-type="bibr" rid="B25">Xu et al., 2022</xref>). Here, the information entropy formula was expressed in <xref ref-type="disp-formula" rid="e6">Equation 6</xref> as follows:<disp-formula id="e6">
<mml:math id="m13">
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>q</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>log</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where, <inline-formula id="inf8">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>p</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</inline-formula>, <italic>n</italic>
<sub>
<italic>j</italic>
</sub> represents the frequency of occurrence of the <italic>j</italic>
<sup>th</sup> classification in a segment, <italic>N</italic> is the segment length, and <italic>q</italic> represents the number of hydrophilic-hydrophobic classifications including the vacancy; here, <italic>q</italic> &#x3d; 7.</p>
</sec>
<sec id="s2-8">
<title>Algorithm</title>
<sec id="s2-8-1">
<title>SMOTE algorithm</title>
<p>The number of non-binding residues was far greater than that of binding residues, and the serious imbalance of data would lead to a high false positive rate. For this reason, researchers often used random undersampling to process the dataset, which randomly selected from the non-binding residue samples the same number of samples as there were binding residue samples to construct a balanced set. The disadvantage of this method was the loss of non-binding residue sample information. To overcome the above limitations, we employed an over-sampling technique: SMOTE (<xref ref-type="bibr" rid="B2">Chawla et al., 2002</xref>). It generated new binding residue samples from the existing binding residue samples until their number matched that of the non-binding residue samples, thereby constructing a balanced training set. For each binding residue sample, the SMOTE algorithm calculated the distance (i.e., Euclidean distance) between that point and the other binding residue sample points and selected the k nearest binding residue samples; then a sample point was randomly selected from these k sample points, and the two points defined a line segment; finally, a new sample point was generated by interpolation on the line segment, where k took its default value. This technique ensured that sample information would not be lost and the data retained its integrity. The SMOTE algorithm differed from random over-sampling in that each newly generated sample was obtained by analysis of the binding residue samples rather than by direct copying, so it not only conformed to the general characteristics of the binding residue samples but also differed from each existing binding residue sample, which could effectively alleviate the classification over-fitting problem caused by the small decision interval. 
In the sample space, SMOTE generated new samples according to the following <xref ref-type="disp-formula" rid="e7">Equation 7</xref>:<disp-formula id="e7">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>where, <inline-formula id="inf9">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the ATP binding residue sample in the training set, <inline-formula id="inf10">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the newly generated ATP binding residue sample, and <inline-formula id="inf11">
<mml:math id="m18">
<mml:mrow>
<mml:msup>
<mml:mi>x</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> represents a sample of ATP binding residue randomly selected from k neighbors of <inline-formula id="inf12">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>d</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.</p>
</sec>
</sec>
<sec id="s2-9">
<title>Deep convolutional neural network (DCNN)</title>
<p>As one of the most important branches of the deep learning framework, a DCNN usually consists of an input layer, convolutional layers, pooling layers, fully connected layers and an output layer. Potential complex information is detected from the raw input data, and then, through a series of high-dimensional and high-level projection mappings, a deeper representation of the classified objects is obtained. The alternating arrangement of convolutional and pooling layers gives the convolutional neural network better fault tolerance and parallel processing ability, and greatly enhances the generalization ability and adaptability of the model. It has been widely used in various fields.</p>
<p>The DCNN model framework in this paper was implemented with Keras, using TensorFlow as the backend. Here, batch normalization was employed to avoid vanishing gradients and speed up the convergence of the network. In order to prevent over-fitting of the model, a dropout layer was employed. The ReLU nonlinear activation function was used to improve the expressive ability of the model and greatly shorten the learning cycle. To effectively avoid over-fitting problems caused by continued training, the early stopping module was used. Adam and cross-entropy were used as optimizer and loss function, respectively. The output layer applied the sigmoid function so that the output for each classified object was a probability value between 0 and 1. The hyperparameters of the DCNN algorithm influenced the training speed and performance of the predictor. Based on previous research, we mainly optimized the following three hyperparameters: the number of convolutional layers, the number of filters and the batch size. Here, the dropout was set as 0.2; the number of convolutional layers ranged from 1 to 6; the number of filters and the batch size both ranged from 2 to 128. A detailed description of the DCNN architecture can be viewed at <ext-link ext-link-type="uri" xlink:href="https://github.com/tlhsx/S-DCNN">https://github.com/tlhsx/S-DCNN</ext-link>.</p>
</sec>
<sec id="s2-10">
<title>Validation methods and evaluation metrics</title>
<p>The validation methods in this paper were 5-fold cross-validation and independent testing. For the evaluation of the prediction results, we adopted the evaluation indicators commonly used in the identification of ATP binding residues: sensitivity (<italic>S</italic>
<sub>
<italic>n</italic>
</sub>), specificity (<italic>S</italic>
<sub>
<italic>p</italic>
</sub>), accuracy (<italic>ACC</italic>), and Matthews correlation coefficient (<italic>MCC</italic>) (i.e., <xref ref-type="disp-formula" rid="e8">Equations 8</xref>&#x2013;<xref ref-type="disp-formula" rid="e11">11</xref>) (<xref ref-type="bibr" rid="B30">Zou et al., 2023</xref>).<disp-formula id="e8">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
<disp-formula id="e9">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mi>S</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
<disp-formula id="e10">
<mml:math id="m22">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
<disp-formula id="e11">
<mml:math id="m23">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>where <italic>TP</italic> and <italic>FN</italic> are the numbers of ATP binding residues that were predicted correctly and incorrectly, respectively, and <italic>TN</italic> and <italic>FP</italic> are the numbers of ATP non-binding residues that were predicted correctly and incorrectly, respectively. In addition, the flowchart of the proposed method is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Flowchart of the proposed method for predicting ATP binding residues. Note: B, F, PP, and E represent basic features, propensity factors, dihedral angle, and energy, respectively; DCNN, RUS-DCNN, and S-DCNN represent DCNN predictors using samples without preprocessing, with random undersampling, and with SMOTE, respectively; S-RF and S-SVM stand for RF and SVM predictors based on SMOTE, respectively.</p>
</caption>
<graphic xlink:href="fgene-15-1513201-g004.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>Results</title>
<sec id="s3-1">
<title>Prediction results of basic feature parameters</title>
<p>The basic feature parameters were input into the S-DCNN predictor, and the results of the 5-fold cross-validation were shown in <xref ref-type="table" rid="T1">Table 1</xref>. Here, the S<sub>n</sub> and MCC values were 43.39% and 0.4101, respectively.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>The prediction results of 5-fold cross-validation.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Features</th>
<th align="center">S<sub>n</sub>(%)</th>
<th align="center">S<sub>p</sub>(%)</th>
<th align="center">ACC(%)</th>
<th align="center">MCC</th>
<th align="center">Hyperparameter</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">B</td>
<td align="center">43.39</td>
<td align="center">97.82</td>
<td align="center">95.83</td>
<td align="center">0.4101</td>
<td align="center">4,16,16</td>
</tr>
<tr>
<td align="center">B&#x2b;E</td>
<td align="center">44.36</td>
<td align="center">97.83</td>
<td align="center">95.90</td>
<td align="center">0.4176</td>
<td align="center">4,16,16</td>
</tr>
<tr>
<td align="center">B&#x2b;PP</td>
<td align="center">46.15</td>
<td align="center">97.91</td>
<td align="center">96.04</td>
<td align="center">0.4363</td>
<td align="center">4,16,16</td>
</tr>
<tr>
<td align="center">B&#x2b;F</td>
<td align="center">47.82</td>
<td align="center">97.93</td>
<td align="center">96.12</td>
<td align="center">0.4509</td>
<td align="center">4,16,16</td>
</tr>
<tr>
<td align="center">B&#x2b;F&#x2b;E</td>
<td align="center">48.47</td>
<td align="center">97.96</td>
<td align="center">96.15</td>
<td align="center">0.4597</td>
<td align="center">4,16,16</td>
</tr>
<tr>
<td align="center">B&#x2b;F&#x2b;PP</td>
<td align="center">49.32</td>
<td align="center">97.96</td>
<td align="center">96.18</td>
<td align="center">0.4663</td>
<td align="center">4,16,16</td>
</tr>
<tr>
<td align="center">B&#x2b;F&#x2b;PP&#x2b;E</td>
<td align="center">50.90</td>
<td align="center">98.01</td>
<td align="center">96.31</td>
<td align="center">0.4773</td>
<td align="center">4,16,16</td>
</tr>
<tr>
<td align="center">(B&#x2b;F&#x2b;PP&#x2b;E)&#x2a;</td>
<td align="center">58.82</td>
<td align="center">98.40</td>
<td align="center">96.97</td>
<td align="center">0.5681</td>
<td align="center">3,16,32</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Note: B, F, PP, and E represent basic features, propensity factors, dihedral angle and energy, respectively; () &#x2a; represents the prediction results after optimization of hyperparameters; the three hyperparameters are the number of convolution layers, filters and batch size, respectively.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-2">
<title>Prediction results of adding dihedral angle, energy and propensity factors</title>
<p>To improve the prediction performance, the dihedral angle, energy and propensity factors were introduced. The extracted dihedral angle, energy, propensity factor feature parameters and basic feature parameters were fused and input to the S-DCNN predictor, and the results were shown in <xref ref-type="table" rid="T1">Table 1</xref> on 5-fold cross-validation.</p>
<p>In <xref ref-type="table" rid="T1">Table 1</xref>, when the feature parameters PP, E or F were respectively added to the feature B, the prediction results of B &#x2b; F were relatively better. Then, the parameter PP or E was added to the above parameter set of B &#x2b; F, and the results with parameter set of B &#x2b; F &#x2b; PP were relatively better. When the feature parameters PP, E and F were added at the same time, the best prediction results were obtained. The S<sub>n</sub>, S<sub>p</sub>, ACC and MCC values with feature set of B &#x2b; F &#x2b; PP &#x2b; E reached 50.9%, 98.01%, 96.31% and 0.4773, respectively.</p>
</sec>
<sec id="s3-3">
<title>Optimization of hyperparameters</title>
<p>The 5-fold cross-validation prediction results for the three hyperparameters were shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. <xref ref-type="fig" rid="F5">Figure 5A</xref> was a bar chart of the MCC and S<sub>n</sub> values changing with the number of convolution layers. When the number of layers was 3, the S<sub>n</sub> and MCC values reached their peaks at the same time, so the optimal number of layers was 3. From <xref ref-type="fig" rid="F5">Figures 5B, C</xref>, the optimal number of filters and batch size were 16 and 32, respectively.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Bar chart of S<sub>n</sub> and MCC values changing with hyperparameters. Note: <bold>(A&#x2013;C)</bold> represent the optimization of the number of convolution layers, filters and batch size, respectively, and the <italic>y</italic>-axis is the value of MCC and S<sub>n</sub>.</p>
</caption>
<graphic xlink:href="fgene-15-1513201-g005.tif"/>
</fig>
<p>The prediction results after optimization of the hyperparameters were shown in the (B &#x2b; F &#x2b; PP &#x2b; E)&#x2a; row of <xref ref-type="table" rid="T1">Table 1</xref>. The S<sub>n</sub>, S<sub>p</sub>, ACC and MCC values reached 58.82%, 98.40%, 96.97%, and 0.5681, respectively.</p>
</sec>
<sec id="s3-4">
<title>Prediction results of DCNN algorithm with different preprocessing methods</title>
<p>To assess the efficacy of the SMOTE algorithm in predicting ATP binding residues, we conducted a comparative analysis between SMOTE, random undersampling and samples without preprocessing (the latter two referred to as RUS-DCNN and DCNN, respectively). In RUS-DCNN, for result stability, negative set samples were randomly selected ten times, with the average outcome of these ten selections serving as the final prediction. The prediction results of 5-fold cross-validation after optimization of the hyperparameters were listed in <xref ref-type="table" rid="T2">Table 2</xref>. In <xref ref-type="table" rid="T2">Table 2</xref>, the MCC values of RUS-DCNN, DCNN and S-DCNN all exceeded 0.438, and the ACC values of DCNN and S-DCNN both reached 96.97%.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Comparison of prediction results.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Methods</th>
<th align="center">S<sub>n</sub>(%)</th>
<th align="center">S<sub>p</sub>(%)</th>
<th align="center">ACC(%)</th>
<th align="center">MCC</th>
<th align="center">Hyperparameter</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="2" align="left">DCNN</td>
<td align="left">29.75</td>
<td align="left">99.49</td>
<td align="left">96.97</td>
<td align="left">0.4385</td>
<td align="center">3,32,32</td>
</tr>
<tr>
<td align="left">&#x2212;29.07</td>
<td align="left">&#x2b;1.09</td>
<td align="left">0</td>
<td align="left">&#x2212;0.1296</td>
<td align="left"/>
</tr>
<tr>
<td rowspan="2" align="left">RUS-DCNN</td>
<td align="left">79.26</td>
<td align="left">74.24</td>
<td align="left">76.75</td>
<td align="left">0.5357</td>
<td align="center">2,32,32</td>
</tr>
<tr>
<td align="left">&#x2b;20.44</td>
<td align="left">&#x2212;24.16</td>
<td align="left">&#x2212;20.22</td>
<td align="left">&#x2212;0.0324</td>
<td align="left"/>
</tr>
<tr>
<td align="left">S-DCNN</td>
<td align="left">58.82 (49.20)</td>
<td align="left">98.40 (98.64)</td>
<td align="left">96.97 (97.06)</td>
<td align="left">0.5681 (0.5031)</td>
<td align="center">3,16,32</td>
</tr>
<tr>
<td rowspan="2" align="left">S-SVM</td>
<td align="left">52.69 (44.27)</td>
<td align="left">96.65 (97.95)</td>
<td align="left">95.06 (96.23)</td>
<td align="left">0.4171 (0.4097)</td>
<td align="center">&#x2014;</td>
</tr>
<tr>
<td align="left">&#x2212;6.13 (&#x2212;4.93)</td>
<td align="left">&#x2212;1.75 (&#x2212;0.69)</td>
<td align="left">&#x2212;1.91 (&#x2212;0.83)</td>
<td align="left">&#x2212;0.151 (&#x2212;0.0934)</td>
<td align="left"/>
</tr>
<tr>
<td rowspan="2" align="left">S-RF</td>
<td align="left">41.76 (44.53)</td>
<td align="left">99.21 (98.94)</td>
<td align="left">97.10 (97.20)</td>
<td align="left">0.5140 (0.4950)</td>
<td align="center">&#x2014;</td>
</tr>
<tr>
<td align="left">&#x2212;17.06 (&#x2212;4.67)</td>
<td align="left">&#x2b;0.81 (&#x2b;0.3)</td>
<td align="left">&#x2b;0.13 (&#x2b;0.14)</td>
<td align="left">&#x2212;0.0541 (&#x2212;0.0081)</td>
<td align="left"/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Note: values in brackets are the prediction results of independent testing; DCNN, RUS-DCNN, and S-DCNN represent DCNN predictors using the raw dataset without preprocessing, with random undersampling, and with SMOTE, respectively; S-RF and S-SVM stand for RF and SVM predictors based on SMOTE, respectively; the second row of each method represents the difference between the results of the method and the S-DCNN method, &#x201c;&#x2b;&#x201d; and &#x201c;&#x2212;&#x201d; represent the increase and decrease of the prediction performance over the S-DCNN method, respectively; the three hyperparameters are the number of convolution layers, filters and batch size, respectively.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-5">
<title>Prediction results of different algorithms based on SMOTE</title>
<p>To verify the superiority of the S-DCNN algorithm, we computed the results of the SVM and random forest (RF) algorithms with SMOTE (i.e., S-SVM and S-RF) through 5-fold cross-validation, as detailed in <xref ref-type="table" rid="T2">Table 2</xref>. Specifically, the RF model utilized 500 decision trees, the SVM model employed a radial basis function kernel, and other parameters remained at default values.</p>
<p>To test the generalization ability of the prediction model, an independent testing set was utilized to predict ATP binding residues, with the corresponding results given in brackets in <xref ref-type="table" rid="T2">Table 2</xref>. For the evaluation metrics MCC and ACC, S-SVM, S-RF, and S-DCNN achieved values exceeding 0.409 and 96.2%, respectively. Notably, S-DCNN demonstrated superior performance in terms of S<sub>n</sub> and MCC. Moreover, the prediction model&#x2019;s performance was assessed using the area under the Receiver Operating Characteristic (ROC) curve (AUC). <xref ref-type="fig" rid="F6">Figure 6</xref> illustrates the ROC curves for various algorithms based on SMOTE on the ATP-289 independent testing set, where S-SVM, S-RF, and S-DCNN yielded AUC values of 0.8585, 0.8841, and 0.9088, respectively.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>ROC curves of different algorithms based on SMOTE on ATP-289 independent testing set.</p>
</caption>
<graphic xlink:href="fgene-15-1513201-g006.tif"/>
</fig>
</sec>
<sec id="s3-6">
<title>Comparison with previous results</title>
<p>To further verify the prediction performance of S-DCNN, we applied S-DCNN on another two frequently used datasets. The first set was constructed by <xref ref-type="bibr" rid="B27">Yu et al. (2013)</xref>, in which the training set had 221 protein chains (ATP-221), and the independent testing set had 50 protein chains (ATP-50). The other set was constructed by <xref ref-type="bibr" rid="B10">Hu et al. (2018)</xref>, in which the training set had 388 protein chains (ATP-388), and the independent testing set had 41 protein chains (ATP-41).</p>
</sec>
<sec id="s3-7">
<title>Prediction results on the 5-fold cross-validation</title>
<p>Using 5-fold cross-validation with optimized hyperparameters, the S-DCNN method was applied to the ATP-221 and ATP-388 datasets. The corresponding two prediction results were shown in <xref ref-type="table" rid="T3">Tables 3</xref>, <xref ref-type="table" rid="T4">4</xref>, respectively. In <xref ref-type="table" rid="T3">Table 3</xref>, the S-DCNN achieved an ACC of 97.0% on the ATP-221 dataset, surpassing other methods by 0.6%&#x2013;0.8%. The MCC of the S-DCNN also exhibited significant improvement, with an increase ranging from 0.036 to 0.125. In <xref ref-type="table" rid="T4">Table 4</xref>, the ACC and MCC values of the S-DCNN on the ATP-388 dataset reached 97.04% and 0.5887, respectively. To make a better comparison, we also listed the prediction results of the previous methods on the ATP-221 and ATP-388 datasets.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Comparison of prediction performance on ATP-221 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Method</th>
<th align="center">S<sub>n</sub>(%)</th>
<th align="center">S<sub>p</sub>(%)</th>
<th align="center">ACC(%)</th>
<th align="center">MCC</th>
<th align="center">Hyperparameter</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">S-DCNN</td>
<td align="center">58.4 (50.2)</td>
<td align="center">98.5 (98.8)</td>
<td align="center">97.0 (97.0)</td>
<td align="center">0.573 (0.545)</td>
<td align="center">3,32,32</td>
</tr>
<tr>
<td align="center">SXGBsite</td>
<td align="center">40.3 (43.7)</td>
<td align="center">98.6 (98.5)</td>
<td align="center">96.4 (96.5)</td>
<td align="center">0.448 (0.463)</td>
<td align="center">&#x2212;</td>
</tr>
<tr>
<td align="center">EC-RUS</td>
<td align="center">58.6 (45.4)</td>
<td align="center">97.9 (98.8)</td>
<td align="center">96.4 (96.8)</td>
<td align="center">0.537 (0.506)</td>
<td align="center">&#x2212;</td>
</tr>
<tr>
<td align="center">TargetS</td>
<td align="center">48.4 (50.1)</td>
<td align="center">98.2 (98.3)</td>
<td align="center">96.2 (96.5)</td>
<td align="center">0.492 (0.502)</td>
<td align="center">&#x2212;</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Note: values in brackets are the prediction results of independent testing; SXGBsite is data obtained from Reference (<xref ref-type="bibr" rid="B29">Zhao et al., 2019</xref>); EC-RUS is data obtained from Reference (<xref ref-type="bibr" rid="B7">Ding et al., 2017</xref>); TargetS is data obtained from Reference (<xref ref-type="bibr" rid="B27">Yu et al., 2013</xref>); the three hyperparameters are the number of convolution layers, filters and batch size, respectively.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Comparison of prediction performance on ATP-388 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Method</th>
<th align="center">S<sub>n</sub>(%)</th>
<th align="center">S<sub>p</sub>(%)</th>
<th align="center">ACC(%)</th>
<th align="center">MCC</th>
<th align="center">Hyperparameter</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">S-DCNN</td>
<td align="right">58.97 (50.95)</td>
<td align="right">98.55 (98.99)</td>
<td align="right">97.04 (96.78)</td>
<td align="right">0.5887 (0.5850)</td>
<td align="center">3,32,16</td>
</tr>
<tr>
<td align="center">S-SITEatp</td>
<td align="right">69.88 (67.51)</td>
<td align="right">94.47 (92.65)</td>
<td align="right">93.53 (91.51)</td>
<td align="right">0.4550 (0.4160)</td>
<td align="center">&#x2212;</td>
</tr>
<tr>
<td align="center">NsitePred</td>
<td align="right">(46.74)</td>
<td align="right">(97.70)</td>
<td align="right">(95.39)</td>
<td align="right">(0.4560)</td>
<td align="center">&#x2212;</td>
</tr>
<tr>
<td align="center">TargetATPsit</td>
<td align="right">(41.25)</td>
<td align="right">(99.49)</td>
<td align="right">(96.84)</td>
<td align="right">(0.5590)</td>
<td align="center">&#x2212;</td>
</tr>
<tr>
<td align="center">ATPbinding</td>
<td align="right">59.00 (49.40)</td>
<td align="right">98.80 (99.50)</td>
<td align="right">97.30 (97.20)</td>
<td align="right">0.6130 (0.6260)</td>
<td align="center">&#x2212;</td>
</tr>
<tr>
<td align="center">DeepATPseq</td>
<td align="right">52.20 (57.42)</td>
<td align="right">99.03 (99.22)</td>
<td align="right">97.39 (97.32)</td>
<td align="right">0.6130 (0.6550)</td>
<td align="center">&#x2212;</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Note: values in brackets are the prediction results of independent testing; the S-SITEatp, NsitePred and TargetATPsit data were obtained from Reference (<xref ref-type="bibr" rid="B10">Hu et al., 2018</xref>); the ATPbinding data were obtained from Reference (<xref ref-type="bibr" rid="B20">Song et al., 2020a</xref>); the DeepATPseq data were obtained from Reference (<xref ref-type="bibr" rid="B11">Hu et al., 2021</xref>); the three hyperparameters are the number of convolution layers, filters and batch size, respectively.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s3-8">
<title>Prediction results of independent testing</title>
<p>The prediction results of independent testing are listed in <xref ref-type="table" rid="T3">Tables 3</xref>, <xref ref-type="table" rid="T4">4</xref>, with values displayed in brackets. Notably, <xref ref-type="table" rid="T3">Table 3</xref> showcased the enhanced prediction performance of the S-DCNN method. Specifically, on the independent testing set ATP-50, S-DCNN achieved an ACC value of 97.0%, surpassing the other methods by 0.2%&#x2013;0.5%. Concurrently, the S<sub>n</sub> and MCC values of S-DCNN exhibited notable enhancements, reaching 50.2% and 0.545, respectively, marking a 0.1%&#x2013;6.5% increase and a 3.9%&#x2013;8.2% improvement compared to the alternative methods. The S<sub>p</sub> value of S-DCNN was equal to or slightly higher than that of the other methods. In <xref ref-type="table" rid="T4">Table 4</xref>, the ACC and MCC values of S-DCNN on the independent testing set ATP-41 reached 96.78% and 0.5850, respectively. In addition, we drew the ROC curves of the S-DCNN method on the ATP-50 and ATP-41 sets, as shown in <xref ref-type="fig" rid="F7">Figure 7</xref>. The AUC values of the S-DCNN method on the ATP-50 and ATP-41 sets were 0.9138 and 0.8973, respectively.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>ROC curves of the S-DCNN method on the ATP-50 <bold>(A)</bold> and ATP-41 <bold>(B)</bold> sets.</p>
</caption>
<graphic xlink:href="fgene-15-1513201-g007.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>Discussion</title>
<p>The comparative analysis in <xref ref-type="table" rid="T1">Table 1</xref> revealed that introducing the feature parameters PP, E, and F simultaneously resulted in all four evaluation metrics reaching their maximum values. When compared to the prediction results using parameter B, there were notable increases in the values of S<sub>n</sub>, S<sub>p</sub>, ACC, and MCC. This suggests that incorporating dihedral angles, energy, and propensity factors is beneficial for accurately predicting ATP binding residues. Furthermore, after optimizing the hyperparameters, the prediction results were notably enhanced, with increases of 7.92% in S<sub>n</sub>, 0.39% in S<sub>p</sub>, 0.66% in ACC, and 9.08% in MCC. These results demonstrate the significant performance improvement achieved by optimizing the hyperparameters of the DCNN algorithm.</p>
<p>The data presented in <xref ref-type="table" rid="T2">Table 2</xref> highlight that S-DCNN yielded superior prediction results for the evaluation metrics ACC and MCC. Notably, the S<sub>n</sub> and MCC values of S-DCNN exhibited significant increases of 29.07% and 3.24%, respectively, in comparison to the DCNN outcomes. Similarly, compared to the prediction outcomes of RUS-DCNN, S-DCNN demonstrated substantial enhancements in S<sub>p</sub>, ACC, and MCC values by 24.16%, 20.22%, and 3.24%, respectively. This underscores the efficacy of employing SMOTE-based DCNN in improving prediction performance. Furthermore, in 5-fold cross-validation, S-DCNN demonstrated superior performance over S-SVM with increments in S<sub>n</sub>, S<sub>p</sub>, ACC, and MCC by 6.13%, 1.75%, 1.91%, and 15.1%, respectively, and over S-RF with increases in S<sub>n</sub> and MCC by 17.06% and 5.41%, respectively. For independent testing, S-DCNN displayed better performance than S-RF, with increases in S<sub>n</sub> and MCC by 4.67% and 0.81%, respectively, and than S-SVM, with enhancements in S<sub>n</sub>, S<sub>p</sub>, ACC, and MCC by 4.95%, 0.69%, 0.83%, and 9.34%, respectively. Notably, the comparison of AUC values highlighted S-DCNN as the best-performing predictor, affirming its robustness in identifying ATP binding residues.</p>
<p>The prediction results from the 5-fold cross-validation and independent testing are shown in <xref ref-type="table" rid="T3">Tables 3</xref>, <xref ref-type="table" rid="T4">4</xref>. Compared to SXGBsite (<xref ref-type="bibr" rid="B29">Zhao et al., 2019</xref>), EC-RUS (<xref ref-type="bibr" rid="B7">Ding et al., 2017</xref>), and TargetS (<xref ref-type="bibr" rid="B27">Yu et al., 2013</xref>), S-DCNN showcased improvements in ACC values by 0.6%, 0.6%, and 0.8%, respectively. Notably, when compared with S-SITEatp (<xref ref-type="bibr" rid="B10">Hu et al., 2018</xref>), the S<sub>p</sub>, ACC, and MCC values of S-DCNN increased significantly by 4.08%, 3.51%, and 13.37%, respectively. Moreover, the S<sub>n</sub> value of S-DCNN closely resembled that of ATPbinding (<xref ref-type="bibr" rid="B21">Song et al., 2020b</xref>) and surpassed that of DeepATPseq (<xref ref-type="bibr" rid="B11">Hu et al., 2021</xref>) by 6.77%. The results of independent testing, as detailed in <xref ref-type="table" rid="T3">Tables 3</xref>, <xref ref-type="table" rid="T4">4</xref> (values in brackets), indicated enhancements in ACC values compared to SXGBsite, EC-RUS, and TargetS by 0.5%, 0.2%, and 0.5%, respectively. As shown in <xref ref-type="table" rid="T4">Table 4</xref>, S-DCNN was superior to S-SITEatp, NsitePred, and TargetATPsit (<xref ref-type="bibr" rid="B10">Hu et al., 2018</xref>) in terms of the evaluation metric MCC. Furthermore, it outperformed NsitePred, TargetATPsit, and ATPbinding in the evaluation of S<sub>n</sub>. The performance of S-DCNN closely rivaled that of DeepATPseq. Moreover, the enhanced prediction performance of S-DCNN across diverse datasets was evident from <xref ref-type="fig" rid="F7">Figures 7A, B</xref>, highlighting its robustness. In summary, the S-DCNN method demonstrated consistent reliability.</p>
</sec>
<sec sec-type="conclusion" id="s5">
<title>Conclusion</title>
<p>Precise prediction of ATP binding residues is critical for understanding protein function. In this paper, we proposed a novel method, S-DCNN, for the prediction of ATP binding residues. Utilizing sequence information, we conducted statistical analysis on dihedral angles, energy, and propensity factors to extract new feature parameters. By optimizing hyperparameters in the S-DCNN predictor, we achieved significantly improved prediction results. Our approach in the S-DCNN involved different data optimization methods. The SMOTE algorithm was employed to prevent information loss in non-binding residue samples, while the DCNN algorithm captured in-depth representation from complex feature parameters with enhanced fault tolerance. Comparative analysis of the prediction results among the DCNN, SVM, and RF algorithms based on SMOTE demonstrated the superiority of the S-DCNN algorithm. Furthermore, applying the S-DCNN predictor to two additional datasets yielded further enhancements in ATP binding residue prediction. In conclusion, the S-DCNN predictor stands out as a robust tool for accurate ATP binding residue prediction. In the next step, we will further improve prediction accuracy and build a web server with a user-friendly interface to predict ATP binding residues.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding authors.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>SH: Methodology, Software, Writing&#x2013;original draft. C-YL: Validation, Writing&#x2013;review and editing. XH: Supervision, Validation, Writing&#x2013;review and editing, Investigation. ZF: Writing&#x2013;original draft, Writing&#x2013;review and editing, Supervision. GZ: Data curation, Formal Analysis, Writing&#x2013;review and editing. CY: Data curation, Formal Analysis, Writing&#x2013;review and editing. HH: Writing&#x2013;review and editing, Investigation.</p>
</sec>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research, authorship, and/or publication of this article. This work was supported by the National Natural Science Foundation of China (61961032), the Natural Science Foundation of Inner Mongolia of China (2024MS06027), the Operation expenses basic scientific research of Inner Mongolia of China (JY20230067), Inner Mongolia College students innovation and entrepreneurship training program project (S202119127007) and Baotou Medical College Science Foundation Project (BYJJ-ZROM 202209).</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chauhan</surname>
<given-names>J. S.</given-names>
</name>
<name>
<surname>Mishra</surname>
<given-names>N. K.</given-names>
</name>
<name>
<surname>Raghava</surname>
<given-names>G. P.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Identification of ATP binding residues of a protein from its primary sequence</article-title>. <source>BMC Bioinforma.</source> <volume>10</volume>, <fpage>434</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-10-434</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chawla</surname>
<given-names>N. V.</given-names>
</name>
<name>
<surname>Bowyer</surname>
<given-names>K. W.</given-names>
</name>
<name>
<surname>Hall</surname>
<given-names>L. O.</given-names>
</name>
<name>
<surname>Kegelmeyer</surname>
<given-names>W. P.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>SMOTE: synthetic minority over-sampling technique</article-title>. <source>J. Artif. Intell. Res.</source> <volume>16</volume>, <fpage>321</fpage>&#x2013;<lpage>357</lpage>. <pub-id pub-id-type="doi">10.1613/jair.953</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Mizianty</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Kurgan</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>ATPsite: sequence-based prediction of ATP-binding residues</article-title>. <source>Proteome Sci.</source> <volume>9</volume> (<issue>S1</issue>), <fpage>S4</fpage>. <pub-id pub-id-type="doi">10.1186/1477-5956-9-S1-S4</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chou</surname>
<given-names>P. Y.</given-names>
</name>
<name>
<surname>Fasman</surname>
<given-names>G. D.</given-names>
</name>
</person-group> (<year>1974</year>). <article-title>Conformational parameters for amino acids in helical, beta-sheet, and random coil regions calculated from proteins</article-title>. <source>Biochemistry</source> <volume>13</volume> (<issue>2</issue>), <fpage>211</fpage>&#x2013;<lpage>222</lpage>. <pub-id pub-id-type="doi">10.1021/bi00699a001</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chou</surname>
<given-names>P. Y.</given-names>
</name>
<name>
<surname>Fasman</surname>
<given-names>G. D.</given-names>
</name>
</person-group> (<year>1979</year>). <article-title>Prediction of beta-turns</article-title>. <source>Biophysical J.</source> <volume>26</volume> (<issue>3</issue>), <fpage>367</fpage>&#x2013;<lpage>383</lpage>. <pub-id pub-id-type="doi">10.1016/S0006-3495(79)85259-5</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cui</surname>
<given-names>Y. F.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>Q. W.</given-names>
</name>
<name>
<surname>Hong</surname>
<given-names>D. C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X. K.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Predicting protein-ligand binding residues with deep convolutional neural networks</article-title>. <source>BMC Bioinforma.</source> <volume>20</volume> (<issue>1</issue>), <fpage>93</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-019-2672-1</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>Y. J.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Identification of protein-ligand binding sites by sequence information and ensemble classifier</article-title>. <source>J. Chem. Inf. Model.</source> <volume>57</volume> (<issue>12</issue>), <fpage>3149</fpage>&#x2013;<lpage>3161</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.7b00307</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y. X.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Z. R.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>A novel statistical ligand-binding site predictor: application to ATP-binding sites</article-title>. <source>Protein Eng. Des. Sel.</source> <volume>18</volume> (<issue>2</issue>), <fpage>65</fpage>&#x2013;<lpage>70</lpage>. <pub-id pub-id-type="doi">10.1093/protein/gzi006</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Holmes</surname>
<given-names>K. C.</given-names>
</name>
<name>
<surname>Angert</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Kull</surname>
<given-names>F. J.</given-names>
</name>
<name>
<surname>Jahn</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Schroder</surname>
<given-names>R. R.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>Electron cryo-microscopy shows how strong binding of myosin to actin releases nucleotide</article-title>. <source>Nature</source> <volume>425</volume> (<issue>6956</issue>), <fpage>423</fpage>&#x2013;<lpage>427</lpage>. <pub-id pub-id-type="doi">10.1038/nature02005</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>D. J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>ATPbind: accurate protein-ATP binding site prediction by combining sequence-profiling and structure-based comparisons</article-title>. <source>J. Chem. Inf. Model.</source> <volume>58</volume> (<issue>2</issue>), <fpage>501</fpage>&#x2013;<lpage>510</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.7b00397</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>L. L.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>Y. S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>K. W.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>G. J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Accurate prediction of protein-ATP binding residues using position-specific frequency matrix</article-title>. <source>Anal. Biochem.</source> <volume>626</volume>, <fpage>114241</fpage>. <pub-id pub-id-type="doi">10.1016/j.ab.2021.114241</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>X. Z.</given-names>
</name>
<name>
<surname>Ge</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z. X.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Recognizing five molecular ligand&#x2010;binding sites with similar chemical structure</article-title>. <source>J. Comput. Chem.</source> <volume>41</volume> (<issue>2</issue>), <fpage>110</fpage>&#x2013;<lpage>118</lpage>. <pub-id pub-id-type="doi">10.1002/jcc.26077</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>X. Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>Q. W.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Protein ligand-specific binding residue predictions by an ensemble classifier</article-title>. <source>BMC Bioinforma.</source> <volume>17</volume> (<issue>1</issue>), <fpage>470</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-016-1348-3</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kanai</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ogawa</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Vilsen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Cornelius</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Toyoshima</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Crystal structure of a Na<sup>&#x2b;</sup>-bound Na<sup>&#x2b;</sup>, K<sup>&#x2b;</sup>-ATPase preceding the E1P state</article-title>. <source>Nature</source> <volume>502</volume> (<issue>7470</issue>), <fpage>201</fpage>&#x2013;<lpage>206</lpage>. <pub-id pub-id-type="doi">10.1038/nature12578</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X. Z.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z. X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Recognizing ion ligand-binding residues by random forest algorithm based on optimized dihedral angle</article-title>. <source>Front. Bioeng. Biotechnol.</source> <volume>8</volume>, <fpage>493</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2020.00493</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X. Z.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z. X.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X. J.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Prediction of acid radical ion binding residues by K-nearest neighbors classifier</article-title>. <source>BMC Mol. Cell Biol.</source> <volume>20</volume> (<issue>Suppl. 3</issue>), <fpage>52</fpage>. <pub-id pub-id-type="doi">10.1186/s12860-019-0238-8</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nguyen</surname>
<given-names>T. T. D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>Q. T.</given-names>
</name>
<name>
<surname>Ou</surname>
<given-names>Y. Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Using multiple convolutional window scanning of convolutional neural network for an efficient prediction of ATP&#x2010;binding sites in transport proteins</article-title>. <source>Proteins Struct. Funct. Bioinforma.</source> <volume>90</volume> (<issue>7</issue>), <fpage>1486</fpage>&#x2013;<lpage>1492</lpage>. <pub-id pub-id-type="doi">10.1002/prot.26329</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nguyen</surname>
<given-names>T. T. D.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>N. Q. K.</given-names>
</name>
<name>
<surname>Kusuma</surname>
<given-names>R. M. I.</given-names>
</name>
<name>
<surname>Ou</surname>
<given-names>Y. Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Prediction of ATP-binding sites in membrane proteins using a two-dimensional convolutional neural network</article-title>. <source>J. Mol. Graph. Model.</source> <volume>92</volume>, <fpage>86</fpage>&#x2013;<lpage>93</lpage>. <pub-id pub-id-type="doi">10.1016/j.jmgm.2019.07.003</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>P&#xe1;nek</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Eidhammer</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Aasland</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>A new method for identification of protein(sub)families in a set of proteins based on hydropathy distribution in proteins</article-title>. <source>Proteins Struct. Funct. Bioinforma.</source> <volume>58</volume> (<issue>4</issue>), <fpage>923</fpage>&#x2013;<lpage>934</lpage>. <pub-id pub-id-type="doi">10.1002/prot.20356</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>J. Z.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>Y. C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>G. X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R. Q.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>L. Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020a</year>). <article-title>A novel prediction method for ATP-binding sites from protein primary sequences based on fusion of deep convolutional neural network and ensemble learning</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>21485</fpage>&#x2013;<lpage>21495</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2020.2968847</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>J. Z.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>G. X.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>C. Y.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>J. Q.</given-names>
</name>
</person-group> (<year>2020b</year>). <article-title>A novel sequence-based prediction method for ATP-binding sites using fusion of SMOTE algorithm and random forests classifier</article-title>. <source>Biotechnol. Biotechnol. Equip.</source> <volume>34</volume> (<issue>1</issue>), <fpage>1336</fpage>&#x2013;<lpage>1346</lpage>. <pub-id pub-id-type="doi">10.1080/13102818.2020.1840436</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X. Z.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z. X.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H. B.</given-names>
</name>
<name>
<surname>Lv</surname>
<given-names>H. T.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Z. Y.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Predicting Ca<sup>2&#x2b;</sup> and Mg<sup>2&#x2b;</sup> ligand binding sites by deep neural network algorithm</article-title>. <source>BMC Bioinforma.</source> <volume>22</volume> (<issue>Suppl. 12</issue>), <fpage>324</fpage>. <pub-id pub-id-type="doi">10.1186/s12859-021-04250-0</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X. Z.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z. X.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Recognition of ion ligand binding sites based on amino acid features with the fusion of energy, physicochemical and structural features</article-title>. <source>Curr. Pharm. Des.</source> <volume>27</volume> (<issue>8</issue>), <fpage>1093</fpage>&#x2013;<lpage>1102</lpage>. <pub-id pub-id-type="doi">10.2174/1381612826666201029100636</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>S. T.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>ANGLOR: a composite machine-learning algorithm for protein backbone torsion angle prediction</article-title>. <source>PLoS One</source> <volume>3</volume> (<issue>10</issue>), <fpage>e3400</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0003400</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>X. Z.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z. X.</given-names>
</name>
<name>
<surname>Pang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>You</surname>
<given-names>X. X.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Recognition of metal ion ligand-binding residues by adding correlation features and propensity factors</article-title>. <source>Front. Genet.</source> <volume>12</volume>, <fpage>793800</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2021.793800</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>J. Y.</given-names>
</name>
<name>
<surname>Roy</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>BioLiP: a semi-manually curated database for biologically relevant ligand-protein interactions</article-title>. <source>Nucleic Acids Res.</source> <volume>41</volume> (<issue>D1</issue>), <fpage>D1096</fpage>&#x2013;<lpage>D1103</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gks966</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>H. B.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>J. Y.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Designing template-free predictor for targeting protein-ligand binding sites with classifier ensemble and spatial clustering</article-title>. <source>IEEE/ACM Trans. Comput. Biol. Bioinforma.</source> <volume>10</volume> (<issue>4</issue>), <fpage>994</fpage>&#x2013;<lpage>1008</lpage>. <pub-id pub-id-type="doi">10.1109/TCBB.2013.104</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y. N.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S. S.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>Y. X.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>H. B.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Predicting protein-ATP binding sites from primary sequence through fusing bi-profile sampling of multi-view features</article-title>. <source>BMC Bioinforma.</source> <volume>13</volume>, <fpage>118</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2105-13-118</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>Z. Q.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y. H.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>SXGBsite: prediction of protein-ligand binding sites using sequence information and extreme gradient boosting</article-title>. <source>Genes</source> <volume>10</volume> (<issue>12</issue>), <fpage>965</fpage>. <pub-id pub-id-type="doi">10.3390/genes10120965</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zou</surname>
<given-names>X. D.</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>L. P.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>P. L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>K. J.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Accurately identifying hemagglutinin using sequence information and machine learning methods</article-title>. <source>Front. Med.</source> <volume>10</volume>, <fpage>1281880</fpage>. <pub-id pub-id-type="doi">10.3389/fmed.2023.1281880</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>