<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" dtd-version="1.3" xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1736037</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2025.1736037</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Neural network&#x2013;based approach for improving the evaluation of antibody&#x2013;antigen docking poses</article-title>
<alt-title alt-title-type="left-running-head">Meta et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2025.1736037">10.3389/fphy.2025.1736037</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Meta</surname>
<given-names>Alessandro</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3279555"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing&#x2013;original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Ruocco</surname>
<given-names>Giancarlo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/377836"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing&#x2013;original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Milanetti</surname>
<given-names>Edoardo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/921766"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing&#x2013;original draft</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>Department of Physics, Sapienza University</institution>, <city>Rome</city>, <country country="IT">Italy</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>Center for Life Nano &#x26; Neuro Science, Istituto Italiano di Tecnologia</institution>, <city>Rome</city>, <country country="IT">Italy</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>Link Campus University</institution>, <city>Rome</city>, <country country="IT">Italy</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Edoardo Milanetti, <email xlink:href="mailto:edoardo.milanetti@uniroma1.it">edoardo.milanetti@uniroma1.it</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-06">
<day>06</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1736037</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>27</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>05</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Meta, Ruocco and Milanetti.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Meta, Ruocco and Milanetti</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-06">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>The role of artificial intelligence (AI)&#x2013;based approaches in computational biology and molecular biophysics has become increasingly central over the past decade; however, many challenges remain unresolved, such as the accurate prediction of protein&#x2013;protein complexes, the complete solution of which would have a significant impact both on our understanding of cellular mechanisms and on the development of therapeutic and diagnostic strategies. Here, we present a protocol based on multiple minimal neural network (NN)&#x2013;based approaches, trained on a set of carefully selected physicochemical features, to discriminate docking decoy poses (structurally distant from the experimental complex) from native-like poses (structurally close to the native conformation) within a specific class of biologically relevant protein&#x2013;protein complexes, namely antibody&#x2013;antigen systems in which the antigen is a protein. A specific version of the proposed method, trained on a set of antibody&#x2013;antigen interface descriptors, some of which are derived from graph theory to capture the geometric complexity of intermolecular interactions, was compared with ITScore-PP, the docking score provided by HDOCK. This NN-based approach demonstrates the ability not only to distinguish native-like poses from decoys, but also, more challengingly, to discriminate intermediate poses from native-like ones. Furthermore, it was also able to predict the DockQ score, a widely used metric for assessing docking pose quality, showing a larger absolute Pearson correlation coefficient than ITScore-PP. The ability of our NN-based approach, which relies solely on structural interface features, to identify accurate dockings highlights its potential as a valuable tool for improving the ranking of antibody&#x2013;antigen docking poses and underscores the importance of appropriate feature selection in protein&#x2013;protein interaction modeling.</p>
</abstract>
<kwd-group>
<kwd>AI-driven approaches</kwd>
<kwd>antibody&#x2013;antigen systems</kwd>
<kwd>binding modes</kwd>
<kwd>binding properties</kwd>
<kwd>CDRs</kwd>
<kwd>docking scores</kwd>
<kwd>decoy docking poses</kwd>
<kwd>docking poses</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was partially funded by grants from ERC-2019-Synergy Grant (ASTRA, n. 855,923); EIC-2022-PathfinderOpen (ivBM-4PAP, n. 101098989); Project &#x2018;National Center for Gene Therapy and Drugs based on RNA Technology&#x2019; (CN00000041) financed by NextGeneration EU PNRRMUR&#x2014;M4C2&#x2014;Action 1.4&#x2014;Call &#x2018;Potenziamento strutture di ricerca e creazione di campioni nazionali di R&#x26;S&#x2019; (CUP J33C22001130001); MUR PRIN 2022 (CUP: B53D2300399 0006) to EM.</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="3"/>
<equation-count count="25"/>
<ref-count count="51"/>
<page-count count="00"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Complex Physical Systems</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>The field of protein science has experienced a profound transformation in recent years, largely fueled by the rapid development of artificial intelligence (AI) and machine learning approaches [<xref ref-type="bibr" rid="B1">1</xref>, <xref ref-type="bibr" rid="B2">2</xref>]. The continuous growth of experimental datasets, together with increasingly sophisticated learning algorithms and advances in high-performance computing infrastructures, especially GPU-based platforms, has led to unprecedented progress in tackling complex questions in computational biology, bioinformatics, and molecular biophysics [<xref ref-type="bibr" rid="B3">3</xref>].</p>
<p>One of the most striking breakthroughs enabled by AI has been the prediction of tertiary protein structures [<xref ref-type="bibr" rid="B4">4</xref>]. Algorithms such as AlphaFold2 [<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B6">6</xref>] and RoseTTAFold [<xref ref-type="bibr" rid="B7">7</xref>] have fundamentally changed the landscape of structural biology by providing near-experimental accuracy in structural predictions, with a significant impact on protein modeling and rational drug design. Traditional drug discovery is both time-consuming and expensive, but emerging computational methods, including AI-driven approaches, have demonstrated their potential to substantially accelerate the process while reducing costs [<xref ref-type="bibr" rid="B8">8</xref>].</p>
<p>Notably, the most significant advances in protein design for therapeutic purposes, including monoclonal antibody engineering, depend not only on accurate single-protein structure prediction but also on the ability to model protein&#x2013;protein interactions [<xref ref-type="bibr" rid="B9">9</xref>&#x2013;<xref ref-type="bibr" rid="B13">13</xref>]. These interactions are central to understanding cellular mechanisms, both physiological and pathological, and are crucial for structure-based drug design strategies. Although the AlphaFold3 algorithm [<xref ref-type="bibr" rid="B14">14</xref>] has shown remarkable improvements in predicting biomolecular interactions, further approaches are required to fully exploit both computational power and predictive structural models. In particular, biomolecular binding interfaces display diverse physicochemical properties depending on the molecular partners involved (e.g., protein&#x2013;protein versus protein&#x2013;nucleic acid interfaces), highlighting the need for problem-specific feature engineering [<xref ref-type="bibr" rid="B15">15</xref>, <xref ref-type="bibr" rid="B16">16</xref>].</p>
<p>Therefore, despite significant progress, predicting the structure of protein&#x2013;protein complexes remains a challenging task, particularly in the case of antibody&#x2013;antigen systems [<xref ref-type="bibr" rid="B17">17</xref>, <xref ref-type="bibr" rid="B18">18</xref>], which are extensively studied due to their importance in both therapeutic and diagnostic applications. AI-based methods offer unique advantages in this context [<xref ref-type="bibr" rid="B19">19</xref>, <xref ref-type="bibr" rid="B20">20</xref>], providing data-driven strategies that can complement physics-based approaches and capture subtle structural patterns associated with molecular recognition.</p>
<p>Over the past decade, antibodies have emerged as powerful therapeutic agents, benefiting from technological advances that allow their structure and function to be characterized with increasing precision. Effective antibody design requires a deep understanding of the structural determinants of antibody&#x2013;antigen recognition. While experimental methods such as X-ray crystallography, cryo-electron microscopy, NMR, and mutagenesis provide high-resolution insights, they are resource-intensive and time-demanding. Computational approaches, particularly molecular docking, represent a valuable and efficient alternative. Several docking platforms, including ClusPro [<xref ref-type="bibr" rid="B21">21</xref>], LightDock [<xref ref-type="bibr" rid="B22">22</xref>], ZDOCK [<xref ref-type="bibr" rid="B23">23</xref>], HDOCK [<xref ref-type="bibr" rid="B24">24</xref>], and HADDOCK [<xref ref-type="bibr" rid="B25">25</xref>], have been developed to generate docking poses of antibody&#x2013;antigen complexes [<xref ref-type="bibr" rid="B26">26</xref>, <xref ref-type="bibr" rid="B27">27</xref>]. However, identifying near-native conformations remains challenging, as current scoring functions are often optimized for binding affinity rather than structural accuracy. Deep learning methods are increasingly being explored to overcome these limitations by directly extracting informative patterns from structural data [<xref ref-type="bibr" rid="B28">28</xref>, <xref ref-type="bibr" rid="B29">29</xref>]. In this context, we present a study emphasizing the role of careful feature selection and combination strategies in describing antibody&#x2013;antigen interfaces for predictive modeling using both supervised and unsupervised machine learning methods.</p>
<p>Here, we explore the application of minimal yet effective machine learning (ML) techniques, in particular using Neural Network (NN), to the analysis and discrimination of docking poses in antibody&#x2013;antigen complexes. We take into account both supervised and unsupervised approaches, considering in particular the principal component analysis (PCA), to evaluate their ability to distinguish between native-like and fully decoy docking conformations. Furthermore, we demonstrate that a simple NN trained on a set of interface descriptors, some of which are derived from graph-theoretical representations, can not only separate native-like from decoy poses but also correlate strongly with DockQ score [<xref ref-type="bibr" rid="B30">30</xref>], a widely used metric for evaluating docking quality (which is defined as a linear combination of rescaled CAPRI-standard evaluation metrics [<xref ref-type="bibr" rid="B31">31</xref>] (see <xref ref-type="disp-formula" rid="e1">Equations 1</xref>&#x2013;<xref ref-type="disp-formula" rid="e3">3</xref>)). Finally, we compare the performance of this minimal NN-based framework with the docking score produced by HDOCK, which has already been used to test the predictive capability of antibody&#x2013;antigen structural models [<xref ref-type="bibr" rid="B26">26</xref>], highlighting its potential as a complementary strategy to improve the ranking of antibody&#x2013;antigen docking poses. In this context, the choice of the docking method is not central, since the methodological requirement is solely to generate both native-like and decoy docking poses, which serve as the basis for training the predictive algorithm, regardless of the success rate of the docking method employed. More specifically, we analyze a dataset of approximately 2,200 experimentally resolved antibody&#x2013;antigen complexes. 
For each complex, docking was performed using HDOCK to generate a pool of docking poses, which were then classified as decoys or native-like according to the DockQ score. Overall, the presented approach demonstrates how feature engineering combined with AI-driven approaches can effectively classify and predict the quality descriptor of docking poses of antibody&#x2013;antigen conformations, thereby supporting future developments in structure-based antibody design.</p>
</sec>
<sec sec-type="results" id="s2">
<label>2</label>
<title>Results</title>
<p>Despite the significant progress that ML techniques have brought to the field of computational biology, improving the evaluation of docking poses remains a challenge that is not yet fully solved [<xref ref-type="bibr" rid="B32">32</xref>&#x2013;<xref ref-type="bibr" rid="B35">35</xref>]. Here, we show that the appropriate selection of features capable of capturing the geometric properties of the interface between predicted dimeric structures, when used in simple NN models, can help improve the assessment of docking poses provided by the docking score.</p>
<p>In particular, we employed a set of antibody&#x2013;antigen complexes (considering only protein antigens), since incorrectly predicted poses may involve regions other than the complementarity-determining regions (CDRs), which consist of six hypervariable loops and exhibit physicochemical properties that differ considerably from those of the native conformations.</p>
<p>This work focuses on discussing the selection of the number of parameters in a simple NN to achieve generalizable discrimination between decoy and native-like docking poses, as well as accurate prediction of the DockQ score, which is typically used to evaluate the quality of a docking pose. The results are discussed in the following sections.</p>
<sec id="s2-1">
<label>2.1</label>
<title>Dataset analysis and definition of native-like and decoy docking poses</title>
<p>As a first analysis, we investigated the composition of the dataset used, focusing in particular on the docking poses classified as well-predicted (i.e., structurally similar to the experimentally resolved native conformation of the complex) and on those incorrectly predicted (i.e., with structures that are considerably different from the corresponding experimental native conformation). To this end, we employed the DockQ score (see Methods for a more detailed description), which is able to classify native-like poses and decoy poses according to a threshold value.</p>
<p>In <xref ref-type="fig" rid="F1">Figure 1a</xref>, we report the DockQ distribution for all docking poses generated by the HDOCK method. In particular, for each antibody&#x2013;antigen docking simulation, we considered the top 10 docking poses ranked by score. The distribution shows that the two main peaks of the probability density function (PDF) are centered at low DockQ values, which are less than 0.24, and at high DockQ values, which are higher than 0.81, indicating that only a small fraction of poses are predicted as native-like (DockQ<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.81</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>), while the majority correspond to decoy poses (DockQ <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.24</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). A cartoon representation of the structural alignment between the docking pose and the native structure, for different ranges of DockQ values, is shown in <xref ref-type="fig" rid="F1">Figure 1c</xref>. This clearly highlights the difficulty of docking algorithms in accurately predicting the native conformation of interacting proteins. Very high DockQ values (close to 1) typically correspond to very small Root Mean Square Deviation (RMSD) values, which can be interpreted as structural fluctuations within thermal noise of experimentally determined native conformations [<xref ref-type="bibr" rid="B36">36</xref>, <xref ref-type="bibr" rid="B37">37</xref>]. Therefore, the ability of the approaches proposed in the following sections must rely on identifying, based on specific interfacial structural properties, the decoy docking poses. In this way, the algorithm can be trained to discriminate between decoy and native-like conformations in a fully general manner, even when the predicted antibody&#x2013;antigen complex exhibits an interface significantly different from those included in the training set.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Docking pose dataset analysis and class definition. <bold>(a)</bold> Probability density function of DockQ values for all docking poses in the dataset. The inset shows the fraction (and absolute number) of poses in the three classes: Decoy (DockQ <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24), Intermediate (0.24 <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> DockQ <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81), and Native-like (DockQ <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81). On the right, the class proportions (and absolute numbers) are reported for the DNL and DINL datasets, respectively. <bold>(b)</bold> Probability density function of the docking score (ITScore-PP) provided by the HDOCK method. Red and blue curves represent the distributions for the decoy and native-like classes, respectively. <bold>(c)</bold> Cartoon representation (example) of antibody&#x2013;antigen docking structures across different DockQ ranges. The experimentally resolved antibody and antigen structures are shown in blue and gray, respectively, while the antigen structures placed by the docking algorithm are shown in red (decoy, DockQ <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.24), yellow (intermediate, 0.24 <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> DockQ <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81), and pink (native-like, DockQ <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81).</p>
</caption>
<graphic xlink:href="fphy-13-1736037-g001.tif">
<alt-text content-type="machine-generated">Panel a presents 3 bar graphs showing the composition fractions of decoy, intermediate, and native-like structures for all the docking poses and for the DNL and DINL datasets alongside the probability distribution of DockQ in the full dataset. Panel b displays the probability density function plots of ITScore-PP, comparing decoy and native-like structures with overlapping red and blue peaks. Panel c illustrates three docking poses in varying colors, categorized by increasing DockQ scores: less than 0.24, between 0.24 and 0.81, and greater than 0.81, where the ligands are colored in red, yellow, and pink, accordingly to the accuracy of the docking.</alt-text>
</graphic>
</fig>
<p>In particular, according to the DockQ values calculated for each docking pose, the overall dataset is composed as follows: 19,406 decoy docking poses, 790 intermediate docking poses, and 1,684 native-like docking poses (see <xref ref-type="fig" rid="F1">Figure 1a</xref>; <xref ref-type="table" rid="T1">Table 1</xref>). As shown by the bimodal trend in the distribution in <xref ref-type="fig" rid="F1">Figure 1a</xref>, in most cases the docking method returns a pose that is structurally distant from the reference structure (i.e., the experimentally resolved complex). However, for 59% of the complexes in the dataset, the top-ranked pose generated by the algorithm is classified as native-like, in some cases with a very high DockQ score, making the docking model and the experimentally determined native structure nearly indistinguishable. This behavior may be due to the algorithm&#x2019;s prior knowledge of the native structure (or its homologs), as well as particularly easy cases for the algorithm to predict. Nevertheless, this does not hinder the aims of the present work, which first seeks to classify docking poses according to their DockQ value and subsequently to predict the descriptor. In light of this, the development of computational methods capable of identifying decoy docking poses is crucial, as it helps reduce the space of possible binding conformations (by removing these from the candidate solutions) that require further investigation. A deeper insight into the overall dataset is presented in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Overall dataset distribution.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="2" align="left">All the docking poses</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Decoy docking poses</td>
<td align="left">88.69%</td>
</tr>
<tr>
<td align="left">Intermediate docking poses</td>
<td align="left">3.61%</td>
</tr>
<tr>
<td align="left">Native-like docking poses</td>
<td align="left">7.70%</td>
</tr>
</tbody>
</table>
<table>
<thead valign="top">
<tr>
<th colspan="2" align="left">All the native complexes</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Complexes with at least one decoy docking pose</td>
<td align="left">99.95%</td>
</tr>
<tr>
<td align="left">Complexes with at least one intermediate docking pose</td>
<td align="left">20.02%</td>
</tr>
<tr>
<td align="left">Complexes with at least one native-like docking pose</td>
<td align="left">73.17%</td>
</tr>
</tbody>
</table>
<table>
<thead valign="top">
<tr>
<th colspan="2" align="left">The top-ranked docking poses</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Top-ranked best docking poses</td>
<td align="left">57.54%</td>
</tr>
<tr>
<td align="left">Native-like top-ranked docking poses</td>
<td align="left">59.28%</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The second column indicates the percentage of the class presented in the first column. For the first group of classes, the frequency is computed over all the docking poses; for the second group, over all the native complexes; and for the third, over the top-ranked docking poses.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>In this study, we defined two different subsets. The first, referred to as the Decoy&#x2013;Native-like dataset (DNL dataset), includes only decoy and native-like docking poses and is used for the classification approach. The second, which comprises all three classes (decoy, intermediate, and native-like) and is referred to as the DINL dataset, is used to predict the DockQ value of a generic docking pose. The DNL dataset consists of 1,587 decoy docking poses and an equal number of native-like docking poses. Conversely, the DINL dataset, in which the DockQ value of each docking pose is taken into account, is composed of 1,000 decoy poses, 790 intermediate poses, and 1,000 native-like docking poses (see <xref ref-type="fig" rid="F1">Figure 1a</xref>).</p>
<p>Each docking pose is characterized by a docking score, ITScore-PP [<xref ref-type="bibr" rid="B38">38</xref>], which is a numerical value used to rank the predicted binding modes of molecules&#x2014;more negative scores indicate more stable and likely interactions. The distribution of ITScore-PP values is shown in <xref ref-type="fig" rid="F1">Figure 1b</xref> for the native-like and decoy groups separately. The difference between the two distributions is evident, and the classification based on the ITScore-PP descriptor provided by the HDOCK docking method yields an Area Under the Receiver Operating Characteristic (ROC) Curve (AUC) of 0.78 (<xref ref-type="disp-formula" rid="e24">Equation 24</xref>).</p>
<p>The aim of this work is to investigate how a minimal neural network&#x2013;based approach can improve the classification of docking poses into native-like and decoy categories when an appropriate selection of binding properties is adopted, and how it can directly predict the DockQ value on which this classification is based.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Correlation analysis among the features</title>
<p>To evaluate the docking poses, an initial set of 21 features was defined and is listed and described in <xref ref-type="table" rid="T2">Table 2</xref> (see <xref ref-type="disp-formula" rid="e4">Equations 4</xref>&#x2013;<xref ref-type="disp-formula" rid="e19">19</xref> for further details). The selected features primarily describe geometric properties at the interface between antibody&#x2013;antigen docking poses, some of which are based on graph theory to better capture the complexity of the geometric organization of the residues involved in intermolecular interactions.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Table of features (see methods for a more detailed description).</p>
</caption>
<table>
<tbody valign="top">
<tr>
<td align="center">1</td>
<td align="center">pca_stretch_ratio</td>
<td align="left">Ratio between the values of the first and second components of the explained variance of a PCA performed on the residues coordinates. It represents the stretching of the complex shape</td>
</tr>
<tr>
<td align="center">2</td>
<td align="center">pca_flatten_ratio</td>
<td align="left">Ratio between the values of the second and third components of the explained variance of a PCA performed on the residues coordinates. It represents the flatness of the complex shape</td>
</tr>
<tr>
<td align="center">3</td>
<td align="center">pca_alignment_score</td>
<td align="left">Absolute cosine of the angle between the main principal components of a PCA performed both on the antibody and the antigen residues coordinates</td>
</tr>
<tr>
<td align="center">4</td>
<td align="center">bs_sasa_ratio</td>
<td align="left">The fraction of the complex SASA (solvent-accessible surface area) involved in the binding sites</td>
</tr>
<tr>
<td align="center">5</td>
<td align="center">bs_size</td>
<td align="left">Number of residues in the binding site</td>
</tr>
<tr>
<td align="center">6</td>
<td align="center">ab_bs_size</td>
<td align="left">Number of residues in the antibody binding site</td>
</tr>
<tr>
<td align="center">7</td>
<td align="center">ag_bs_size</td>
<td align="left">Number of residues in the antigen binding site</td>
</tr>
<tr>
<td align="center">8</td>
<td align="center">pca_normalized_centroid_distance</td>
<td align="left">Distance between the centroids of the antibody and the antigen, normalized through the main principal component of the PCA performed on the coordinates of the whole complex residues</td>
</tr>
<tr>
<td align="center">9</td>
<td align="center">pca_stretch_ratio_bs</td>
<td align="left">Equivalent to feature 1 for the binding sites residues</td>
</tr>
<tr>
<td align="center">10</td>
<td align="center">pca_flatten_ratio_bs</td>
<td align="left">Equivalent to feature 2 for the binding sites residues</td>
</tr>
<tr>
<td align="center">11</td>
<td align="center">bs_mean_hydrophobicity</td>
<td align="left">Average hydrophobicity of the binding sites residues</td>
</tr>
<tr>
<td align="center">12</td>
<td align="center">bs_delta_hydrophobicity</td>
<td align="left">Absolute difference of average hydrophobicity between the antibody and the antigen binding sites</td>
</tr>
<tr>
<td align="center">13</td>
<td align="center">edge_density</td>
<td align="left">Edge density of the unweighted network composed by the complex residues interactions</td>
</tr>
<tr>
<td align="center">14</td>
<td align="center">mean_degree</td>
<td align="left">Average degree of the unweighted network</td>
</tr>
<tr>
<td align="center">15</td>
<td align="center">mean_strength</td>
<td align="left">Average strength of the weighted network (<inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:mi>w</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for any couple of interacting residues)<break/>
</td>
</tr>
<tr>
<td align="center">16</td>
<td align="center">network_diameter</td>
<td align="left">Diameter of the weighted network</td>
</tr>
<tr>
<td align="center">17</td>
<td align="center">network_radius</td>
<td align="left">Radius of the weighted network</td>
</tr>
<tr>
<td align="center">18</td>
<td align="center">mean_assortativity</td>
<td align="left">Average degree assortativity of the networks</td>
</tr>
<tr>
<td align="center">19</td>
<td align="center">unweighted_mean_clustering</td>
<td align="left">Average clustering coefficient of the unweighted network</td>
</tr>
<tr>
<td align="center">20</td>
<td align="center">weighted_mean_clustering</td>
<td align="left">Average clustering coefficient of the weighted network</td>
</tr>
<tr>
<td align="center">21</td>
<td align="center">network_transitivity</td>
<td align="left">Transitivity of the networks</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>An initial Pearson correlation analysis was performed to remove pairs of features showing high correlation (absolute Pearson correlation coefficient <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.75</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, for both positive and negative correlations). The correlation matrices for all pairs of features, both for the initial 21 features and for the 15 features remaining after filtering, are shown in <xref ref-type="fig" rid="F2">Figure 2a</xref>. In particular, in order to remove highly correlated feature pairs while minimizing feature pruning, the absolute Pearson correlation coefficients were mapped onto a graph in which pairs of strongly correlated features were connected. The resulting problem is equivalent to a minimum vertex cover problem, which was solved exactly using integer linear programming (ILP), given the small number of highly correlated features (see <xref ref-type="disp-formula" rid="e20">Equation 20</xref>). In addition, for both matrices, the corresponding graphs are displayed, where each node represents a feature and each edge between two features is weighted (using a red-to-blue color scale) according to their Pearson correlation.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Correlation analysis and PCA. <bold>(a)</bold> Pearson correlation matrices for all initial features (top) and for the selected features after removing highly correlated pairs (bottom). For each matrix, a fully connected graph is shown, where each node represents a feature and each edge is weighted by the corresponding Pearson correlation value. <bold>(b)</bold> Projection of the 15-dimensional feature vectors onto the essential plane defined by PC1 and PC2. Red and blue points represent decoy and native-like docking poses, respectively. Probability density functions for PC2 (top) and PC1 (right) are shown. <bold>(c)</bold> Explained variance ratio for each of the 15 principal components. <bold>(d)</bold> Feature loadings for PC1 (top) and PC2 (bottom). <bold>(e)</bold> ROC curves for the decoy vs. native-like distributions along PC1 (blue), PC2 (cyan), and for the docking score ITScore-PP (orange).</p>
</caption>
<graphic xlink:href="fphy-13-1736037-g002.tif">
<alt-text content-type="machine-generated">Five-panel data visualization showing various plots: a) Features correlation heatmap with the corresponding network diagrams for the full set and the pruned set of features. b) Scatter plot of two principal components with overlaid density plots for decoy and native-like categories. c) Bar plot of explained variance fraction for up to PC15. d) Bar charts of PC1 and PC2 loadings across 21 features. e) ROC curves comparing ITScorePP, PC1, and PC2, showing true positive vs. false positive rates.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s2-3">
<label>2.3</label>
<title>Unsupervised classification of native-like and decoy docking poses through PCA</title>
<p>The selection of 15 largely independent features, after appropriate normalization (see Methods), allowed us to perform a Principal Component Analysis (PCA). For each docking pose in the DNL dataset, which is used to classify docking poses as decoy or native-like, a vector of 15 normalized features was associated. The PCA results are shown in <xref ref-type="fig" rid="F2">Figure 2b</xref>, and the proportion of variance explained by each eigenvector is reported in <xref ref-type="fig" rid="F2">Figure 2c</xref>, showing that the first two principal components account for 32% of the total variance. The unsupervised PCA approach was employed here to explore a potential blind classification of the two docking pose classes (decoy and native-like). Each point in <xref ref-type="fig" rid="F2">Figure 2b</xref> represents the projection of the 15-dimensional feature vector (associated with a single docking pose) onto the essential plane defined by the first two principal components (PC1&#x2013;PC2). Points are colored red and blue according to their membership in the decoy or native-like class, respectively.</p>
<p>The analysis of the two distributions (decoy and native-like) along PC1 does not reveal a clear separation between the two classes, as evidenced by the strong overlap between distributions. This is also confirmed by the ROC curve shown in <xref ref-type="fig" rid="F2">Figure 2e</xref>, with an AUC of 0.52, which is effectively close to random classification. By contrast, the distributions of decoy and native-like poses along PC2 are noticeably more separated, yielding an ROC AUC of 0.68 (see <xref ref-type="fig" rid="F2">Figure 2e</xref>). Therefore, the use of PC2 alone, in a fully unsupervised manner, provides a moderate but non-negligible discriminative power between decoy and native-like classes.</p>
<p>The loading analysis in this context reveals the contribution of each feature to the definition of each principal component. In particular, the interface properties most relevant for the separation between decoy and native-like poses are pca_flatten_ratio, pca_alignment_score, pca_stretch_ratio_bs and pca_flatten_ratio_bs (features 2, 3, 9 and 10, see <xref ref-type="table" rid="T2">Table 2</xref>), which show a more pronounced difference compared to the corresponding loadings of PC1. The first two features are related to the geometry of the antibody&#x2013;antigen interaction. Specifically, the first feature reflects the globularity of the complex, which increases when the interface lies in proximity to the CDR, while the second describes the relative orientation of the two molecules. Instead, the last two features capture the circularity and concavity of the binding interface, with higher PC2 values corresponding to a flatter interaction surface. A comparison between the ROC curves of PC1 and PC2 (with ROC AUCs of 0.52 and 0.68, respectively) and that of the HDOCK docking score (ROC AUC of 0.80, which was calculated using the DNL dataset) is reported in <xref ref-type="fig" rid="F2">Figure 2e</xref>, highlighting the need to develop supervised methods to better evaluate each docking pose based on interfacial geometric properties.</p>
</sec>
<sec id="s2-4">
<label>2.4</label>
<title>A minimal neural network&#x2013;based approach to classify native-like and decoy docking poses</title>
<p>In this section, a minimal neural network (see Methods and <xref ref-type="disp-formula" rid="e21">Equations 21</xref>&#x2013;<xref ref-type="disp-formula" rid="e23">23</xref>) is employed with the aim of improving the classification between native-like and decoy docking poses as provided by the docking score. The goal is to investigate the contribution of neural network&#x2013;based approaches to enhancing docking pose evaluation. As a first step, the training and test sets were randomly selected. Subsequently, in order to make the procedure as general as possible, multiple training and test datasets were generated so as to be maximally distinct with respect to the features selected for this study.</p>
<p>The first approach was therefore performed by considering one training set and multiple test groups, both drawn from the DNL dataset (see Methods for further details). The predictive performance, in terms of the Area Under the ROC Curve (AUC), was analyzed as a function of the number of network parameters, varying both the total amount of available data and the ratio between training and test samples. The results of this preliminary analysis are shown in <xref ref-type="fig" rid="F3">Figure 3a</xref>, which highlights that, for a number of parameters equal to 70, a plateau in the test AUC curve (in red) is observed in almost all cases considered. By selecting the total number of available complexes&#x2014;after verifying that this amount is sufficient to capture the information required to discriminate between the two classes&#x2014;together with a 0.5 ratio between training and test data and a total of 70 network parameters, we obtained an average test set ROC AUC value of 0.90. This value exceeds the ROC AUC calculated using only the docking score. The comparison between the corresponding ROC curves is also reported in <xref ref-type="fig" rid="F3">Figure 3c</xref>. An analogous analysis performed on the discrimination between native-like and intermediate docking poses in the DINL dataset yields a ROC AUC of 0.77, indicating a promising classification performance on this substantially more challenging and subtle task, where the structural differences between the pose classes are markedly smaller than in the native-like vs. decoy scenario.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Performance of the neural network (NN) in docking pose classification. <bold>(a)</bold> Each plot shows the ROC AUC as a function of the number of parameters used in the NN for the DNL dataset. From left to right, the test set proportion increases, while from bottom to top, the number of complexes used increases. <bold>(b)</bold> The scatterplot displays the first two principal components (PC1 and PC2) obtained from the PCA of normalized feature vectors, along with the probability density function of PC1. Values below the mean (zero) are colored in purple, while those above the mean are colored in green. On the right, the number of complexes classified as decoy and native-like are reported for the first (PC1 <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) and second (PC1 <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) groups, respectively. <bold>(c)</bold> ROC curves are shown for classifications based on the first two principal components of the features (blue and light blue for PC1 and PC2, respectively), the docking score (ITScore-PP, orange), the NN trained and tested on randomly selected sets (green), and the NN trained and tested on sets defined according to differences in docking poses along PC1 (red).</p>
</caption>
<graphic xlink:href="fphy-13-1736037-g003.tif">
<alt-text content-type="machine-generated">Panel of graphs and plots: a) Grid of plots showing the ROC AUC of the classifier as a function of the number of parameters. The grid plots are sorted according to the variation in number of parameters and in proportion of the test set. b) Probability density function and scatter plot of Principal Component Analysis (PCA), groupings based on PC1, alongside a bar chart of sample numbers for groups with PC1 &#x003C; 0 and PC1 &#x003E; 0, categorized as decoy and native-like. c) Receiver Operating Characteristic curves showing true positive rate versus false positive rate for different classifiers, including NN classifier and ITScorePP.</alt-text>
</graphic>
</fig>
<p>The importance of the selected descriptors, as indicated by the PC2 loadings, can be assessed by training the NN after removing the descriptors with loadings greater than 0.20 (i.e., those contributing most to the separation between decoy and native-like docking poses according to PC2). This procedure results in a classification performance on the test set that is 14% lower than the performance obtained when retaining all selected features.</p>
<p>In addition, we propose training the NN on training and test sets that are as different as possible in terms of the selected features, in order to make the NN-based classification procedure as generalizable as possible. To this end, the entire dataset was split into two parts (training and test sets, and then swapped) according to the value of PC1, i.e., the projection of the feature vector onto the first principal component of each docking pose. Docking poses with PC1 values below the mean were assigned to one group, while those with higher PC1 values were assigned to the other.</p>
<p>The choice of PC1 as the reference distribution for defining the two groups was motivated by two considerations: (i) PC1, by definition, is the eigenvector associated with the largest proportion of explained variance, thus carrying the highest amount of information; and (ii) the distributions of the PC1 values for the decoy and native-like docking poses show no clear separation (and therefore no intrinsic discriminative power between the two classes), unlike PC2 (see <xref ref-type="fig" rid="F2">Figure 2</xref>). This ensures that, before and after splitting by the PC1 mean, the relative proportion of decoy and native-like poses within each subset remains approximately the same, see <xref ref-type="fig" rid="F3">Figure 3b</xref>.</p>
<p>The results are shown in <xref ref-type="fig" rid="F3">Figure 3c</xref>, which illustrates a neural network discriminative capability between decoy and native-like docking poses that is intermediate between the ITScore-PP docking score provided by HDOCK (ROC AUC of 0.80) and the NN previously trained on randomly selected training and test subsets. A ROC AUC of 0.90 was measured for the NN trained and tested on randomly selected sets, decreasing to values that span between 0.81 and 0.82, when the training and test sets are separated based on the PC1 values associated with each docking pose. In particular, the improved classification capability of the proposed NN-based approach is further confirmed by the steep initial rise of the ROC curves corresponding to the NN-based methods, observed in the early phase (at low true positive rate and false positive rate values). This result highlights the ability of an NN-based approach, when coupled with properly selected features, to improve docking classification performance even when the training and test sets are deliberately constructed to have different underlying properties.</p>
</sec>
<sec id="s2-5">
<label>2.5</label>
<title>The use of neural networks to improve the evaluation of docking poses</title>
<p>In the previous section, we demonstrated the importance of employing simple NN models for the classification of docking poses into decoy and native-like categories in antibody&#x2013;antigen complexes. Furthermore, we emphasized the crucial role of accurate feature selection, which, combined with supervised machine learning methods, can significantly improve predictive performance. Here, a minimal feedforward neural network is trained to directly correlate (rather than classify) with the DockQ value, which is one of the standard metrics used to assess the quality of a docking pose. For this purpose, the DINL dataset was taken into account (see Methods for further details).</p>
<p>In this case as well, the scatterplot of the first two principal components obtained from the PCA of the feature vectors of all docking poses is shown in <xref ref-type="fig" rid="F4">Figure 4a</xref>, where each point (docking pose) is colored according to its corresponding DockQ value. Given the inherent difficulty of capturing, through unsupervised approaches such as PCA, the relationship between the interface descriptors of predicted antibody&#x2013;antigen complexes and their structural deviation from the corresponding experimentally resolved native structures (quantified by DockQ), we developed a neural network (NN) model trained on the DINL dataset.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Performance of the neural network (NN) in DockQ prediction. <bold>(a)</bold> The Pearson correlation coefficient between the predicted DockQ (pDockQ) and the measured DockQ is shown as a function of the number of parameters used to train the NN across different test sets from the DINL dataset. The inset reports the trend of the mean squared error (MSE) for both training and test sets as a function of the number of parameters. <bold>(b)</bold> The Pearson correlation coefficient between the predicted DockQ (pDockQ) and the measured DockQ is shown as a function of the training set size used to train the NN across different test sets from the DINL dataset. The inset reports the difference in mean squared error (<inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>MSE) between training and test sets as a function of the training set size. <bold>(c)</bold> For each panel, the scatterplot (blue points) is combined with a boxplot (in gray) of the measured DockQ, together with the four descriptors: PC1, PC2, docking score (ITScore-PP), and the NN-predicted DockQ (pDockQ). <bold>(d)</bold> Probability density function (PDF) of pDockQ for different DockQ ranges.</p>
</caption>
<graphic xlink:href="fphy-13-1736037-g004.tif">
<alt-text content-type="machine-generated">Four-panel figure showing data visualizations. a) Line graph showing the Pearson correlation as a function of the number of NN parameters, an inset shows the behavior of the MSE. b) Line graph showing the Pearson correlation as a function of the training set size, an inset shows the behavior of the MSE. c) Box plots with data points showing DockQ against PC1, PC2, ITScore-PP and pDockQ. d) Three-dimensional plot showing probability density function with DockQ and pDockQ axes.</alt-text>
</graphic>
</fig>
<p>By randomly selecting the training and test sets (see Methods for further details), we statistically analyzed how the Pearson correlation (<xref ref-type="disp-formula" rid="e25">Equation 25</xref>) in the test set between the experimental DockQ and the predicted DockQ (pDockQ) varies as a function of the number of NN parameters. The results for an NN trained on 80% of the available poses, reported in <xref ref-type="fig" rid="F4">Figure 4a</xref>, show that a substantial performance gain is achieved by increasing the number of parameters up to approximately 70. Beyond this point, the correlation between DockQ and pDockQ increases much more slowly, while the mean squared error (MSE) on the training set reaches a plateau for <inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:mo>&#x2248;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>86,400 parameters (<xref ref-type="fig" rid="F4">Figure 4a</xref>).</p>
<p>Furthermore, the variation in correlation between pDockQ (computed with an 86,400-parameter NN) and DockQ has been studied as a function of the training set size, spanning from 50% of the DINL dataset to 93%, alongside the difference in MSE (<inline-formula id="inf20">
<mml:math id="m20">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>MSE) between the training and test sets. For both measures, the results, reported in <xref ref-type="fig" rid="F4">Figure 4b</xref>, show an optimal average value for the 80% training set proportion.</p>
<p>For the NN trained with 86,400 parameters, on a set composed of 80% of the docking poses, the correlation between DockQ and pDockQ in the DINL set is 0.59.</p>
<p>The comparison between neural network predictions (pDockQ) and the docking score (ITScore-PP) was performed by evaluating the correlation with the DockQ score. In addition, we used projections onto the first two principal components (PC1 and PC2) of each docking score as potential predictors of DockQ. The scatter plots showing the relationship between DockQ and each proposed predictor (supervised and unsupervised) are reported in <xref ref-type="fig" rid="F4">Figure 4c</xref>.</p>
<p>In particular, the correlations between DockQ and PC1, PC2, ITScore-PP (the docking score), and pDockQ are &#x2212;0.04, &#x2212;0.27, &#x2212;0.41, and 0.59, respectively (<xref ref-type="table" rid="T3">Table 3</xref>). This indicates that the NN-based approach, which uses as input the 15 selected features, substantially improves the quantitative evaluation of docking poses compared to the original docking score. Of particular note is the correlation between DockQ and the second principal component (PC2) of the PCA performed on the features. As a fully unsupervised descriptor, PC2 provides insight into the features that contribute most to the definition of the component (loadings), thereby offering the opportunity to further refine NN-based models through preliminary feature selection procedures.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Results recap. The reported correlation value refers to the Pearson correlation coefficient.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Descriptor</th>
<th align="center">DNL ROC AUC</th>
<th align="center">Split sets ROC AUC</th>
<th align="center">Corr. with DockQ (p-value)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">PC1</td>
<td align="center">0.52</td>
<td align="center">0.54</td>
<td align="center">&#x2212;0.04 (0.03)</td>
</tr>
<tr>
<td align="center">PC2</td>
<td align="center">0.68</td>
<td align="center">0.62</td>
<td align="center">&#x2212;0.27 <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">ITScore-PP</td>
<td align="center">0.80</td>
<td align="center">0.84</td>
<td align="center">&#x2212;0.41 <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="center">pDockQ</td>
<td align="center">0.90</td>
<td align="center">0.81</td>
<td align="center">0.59 <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>7</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The p-value refers to the null hypothesis that the distributions underlying the samples are uncorrelated and normally distributed.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To better illustrate the ability of the neural network&#x2013;based predictive method to estimate DockQ values even in intermediate cases (0.24 <inline-formula id="inf21">
<mml:math id="m21">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> DockQ <inline-formula id="inf22">
<mml:math id="m22">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> 0.81), a probability density function (PDF) was computed for each DockQ range (analogous to the boxplot analysis shown in <xref ref-type="fig" rid="F4">Figure 4c</xref>). The distributions of the pDockQ descriptor are progressively shifted with increasing DockQ ranges (see <xref ref-type="fig" rid="F4">Figure 4d</xref>), thereby confirming the method&#x2019;s ability not only to classify docking poses as native-like or decoy&#x2014;as also supported by this DockQ estimation procedure&#x2014;but also to correlate with intermediate DockQ values, with slightly lower yet satisfactory accuracy compared to the docking score.</p>
<p>Furthermore, to assess the overall quality of the pDockQ descriptor, it has been benchmarked in terms of Pearson correlation coefficient against both ITScore-PP and the predicted binding free energy <inline-formula id="inf23">
<mml:math id="m23">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>, obtained using an MM/GBSA-based predictor [<xref ref-type="bibr" rid="B39">39</xref>] via the HawkDock server [<xref ref-type="bibr" rid="B40">40</xref>]. This comparison has been performed on a randomly selected small subset of the DINL dataset composed of 84 docking poses (30 decoy, 24 intermediate and 30 native-like poses, in order to maintain the proportions of the DINL dataset). For this analysis, the pDockQ values have been computed by an 86,400-parameter NN trained on all the DINL docking poses that do not share the reference native complex with any test set element. While <inline-formula id="inf24">
<mml:math id="m24">
<mml:mrow>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:mi>G</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and pDockQ show a comparable performance (respectively &#x2212;0.62 and 0.67), both have a significantly larger Pearson correlation coefficient in magnitude than ITScore-PP (&#x2212;0.43). Although the small size of the test set of this assessment does not allow a definitive statement, the pDockQ approach results are promising, even when compared with one of the state-of-the-art methods reported in the literature.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s3">
<label>3</label>
<title>Conclusion</title>
<p>In this work, we addressed the role of minimal neural networks (NNs) in tackling the still unresolved problem of accurately evaluating docking poses. Specifically, for a set of experimentally determined antibody&#x2013;antigen complexes, structural predictions were generated using the HDOCK docking method. Each predicted pose is associated with a docking score that is intended to reflect its reliability based on an internal scoring function. The main idea of this study is to improve the assessment of docking scores through the use of neural networks. Each docking pose was evaluated by structural comparison with the experimentally resolved native complex using the DockQ metric, which is commonly employed to assess the performance of molecular docking prediction methods. Threshold values of DockQ were then used to classify docking poses as decoy or native-like. A set of physicochemical features, some of which are derived from graph theory to capture the complexity of residue&#x2013;residue interactions at the antibody&#x2013;antigen interface, was defined with the aim of training one NN for the classification between decoy and native-like poses, and another NN for the direct prediction of DockQ. The results show that, unlike the unsupervised descriptors obtained from the principal components (PCA) of normalized features, the two trained NNs significantly improved both the classification between native-like and decoy poses, as well as between intermediate and native-like docking poses, and the direct prediction of DockQ compared to the docking score provided by HDOCK. These findings highlight the importance of neural network&#x2013;based approaches, combined with the selection of chemically and physically relevant features, in improving the evaluation of docking poses and in describing antibody&#x2013;antigen binding interactions.</p>
</sec>
<sec sec-type="methods" id="s4">
<label>4</label>
<title>Methods</title>
<sec id="s4-1">
<label>4.1</label>
<title>Dataset of antibody-antigen complexes</title>
<p>The initial dataset consisted of 9,780 experimentally resolved antibody&#x2013;antigen complexes retrieved from the SAbDab database [<xref ref-type="bibr" rid="B41">41</xref>]. A first filtering step was applied to retain only complexes in which the antigen was classified as a protein or peptide and consisted of a single chain (thus preserving only monomeric antigens), resulting in 9,486 structures. Structures containing missing residues were either repaired or removed from the dataset, yielding 9,463 structures.</p>
<p>A multiple sequence alignment among all antibody&#x2013;antigen complexes in the dataset was performed to remove redundancy. Specifically, for each complex we considered a single sequence obtained by concatenating the antibody sequence (heavy and light chains) with the sequence of the corresponding antigen. These sequences were then processed with CD-HIT [<xref ref-type="bibr" rid="B42">42</xref>&#x2013;<xref ref-type="bibr" rid="B44">44</xref>] using a sequence identity cutoff of 0.9, resulting in 2,517 centroids, which represent the most representative sequences in the entire dataset. Since the study focuses on the calculation of interface properties, it was crucial to ensure that the interfaces were complete, i.e., without missing residues in the binding region. Therefore, complexes with incomplete interfaces were excluded, reducing the dataset from 2,517 to 2,244 structures.</p>
<p>Finally, energy minimization was performed on all structures, resulting in a final set of 2,188 properly minimized complexes.</p>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Docking simulation of antibody&#x2013;antigen complexes and decoy pose selection</title>
<p>Each antibody&#x2013;antigen complex with a known experimental structure was split into two separate structures, antibody and antigen, which were then subjected to molecular docking simulations using the HDOCK method (thus considering the interacting structures in their bound conformations). For each antibody&#x2013;antigen docking simulation, the top ten poses proposed by the method were retained. Each docking pose was evaluated using the DockQ metric, which is defined according to the following formula:<disp-formula id="e1">
<mml:math id="m25">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>Q</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">nat</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">scaled</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">scaled</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>with<disp-formula id="e2">
<mml:math id="m26">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">scaled</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>8.5</mml:mn>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x30a;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>and<disp-formula id="e3">
<mml:math id="m27">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">scaled</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1.5</mml:mn>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x30a;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <inline-formula id="inf25">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">nat</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf26">
<mml:math id="m29">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf27">
<mml:math id="m30">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>R</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are the CAPRI-standard classification metrics [<xref ref-type="bibr" rid="B30">30</xref>, <xref ref-type="bibr" rid="B31">31</xref>].</p>
<p>In particular, for the DNL dataset, we selected for each experimental complex the &#x201c;decoy&#x201d; docking pose as the one associated with the lowest DockQ value among the ten poses considered (ensuring in all cases that DockQ <inline-formula id="inf28">
<mml:math id="m31">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
<mml:mn>0.24</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>), and identified the &#x201c;native-like&#x201d; pose as the one with the highest DockQ value among the ten poses generated by HDOCK (with DockQ <inline-formula id="inf29">
<mml:math id="m32">
<mml:mrow>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>0.81</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>). Furthermore, 1,000 docking poses classified as decoy (based on their DockQ values) and 1,000 docking poses classified as native-like (also based on DockQ) were randomly selected from the docking poses obtained after the previous filtering steps. These poses were used to define the DINL dataset, which served as the training set for the neural network designed to predict DockQ values.</p>
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Feature description</title>
<p>The features used throughout the whole paper can be divided into three groups: complex geometry features, interface features, and complex graph features. The first group comprises all the measures related to the geometrical arrangement of the <inline-formula id="inf30">
<mml:math id="m33">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-carbon atoms of the antibody&#x2013;antigen complex. The first group is composed of<list list-type="bullet">
<list-item>
<p>pca_stretch_ratio:</p>
</list-item>
</list>
<disp-formula id="e4">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>where <inline-formula id="inf31">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf32">
<mml:math id="m36">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf33">
<mml:math id="m37">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the first, second and third component of the explained variance of a PCA performed on the coordinates related to the <inline-formula id="inf34">
<mml:math id="m38">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-carbons of the whole complex;<list list-type="bullet">
<list-item>
<p>pca_flatten_ratio:</p>
</list-item>
</list>
<disp-formula id="e5">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
<list list-type="bullet">
<list-item>
<p>pca_alignment_score:</p>
</list-item>
</list>
<disp-formula id="e6">
<mml:math id="m40">
<mml:mrow>
<mml:mi>&#x3b8;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x22c5;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <inline-formula id="inf35">
<mml:math id="m41">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf36">
<mml:math id="m42">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>v</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are the two unit vectors on the direction of the main principal component of the PCAs performed separately on the antibody and the antigen;<list list-type="bullet">
<list-item>
<p>pca_normalized_centroid_distance:</p>
</list-item>
</list>
<disp-formula id="e7">
<mml:math id="m43">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>D</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>where <inline-formula id="inf37">
<mml:math id="m44">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the distance between the centroids of the antibody and the antigen.</p>
<p>The second group is composed of features accounting for several properties of the antibody&#x2013;antigen binding site (BS). For this group, we defined as BS residues those residues whose <inline-formula id="inf38">
<mml:math id="m45">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-carbon is within <inline-formula id="inf39">
<mml:math id="m46">
<mml:mrow>
<mml:mn>12</mml:mn>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x30a;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to an <inline-formula id="inf40">
<mml:math id="m47">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-carbon atom from a different molecule (i.e. the antibody residues closer than <inline-formula id="inf41">
<mml:math id="m48">
<mml:mrow>
<mml:mn>12</mml:mn>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x30a;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to an antigen residue and vice versa). The second group features are<list list-type="bullet">
<list-item>
<p>bs_sasa_ratio:</p>
</list-item>
</list>
<disp-formula id="e8">
<mml:math id="m49">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>S</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>S</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">tot</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>SASA</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mtext>SASA</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>SASA</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>SASA</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <inline-formula id="inf42">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>SASA</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf43">
<mml:math id="m51">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mtext>SASA</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the solvent-accessible surface area (SASA), respectively, of the unbound antibody and antigen, while <inline-formula id="inf44">
<mml:math id="m52">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>SASA</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf45">
<mml:math id="m53">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mtext>SASA</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represent the SASA values of the corresponding unbound antibody and antigen BS residues;<list list-type="bullet">
<list-item>
<p>bs_size: number of BS residues;</p>
</list-item>
<list-item>
<p>ab_bs_size: number of antibody BS residues;</p>
</list-item>
<list-item>
<p>ag_bs_size: number of antigen BS residues;</p>
</list-item>
<list-item>
<p>pca_stretch_ratio_bs:</p>
</list-item>
</list>
<disp-formula id="e9">
<mml:math id="m54">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>where <inline-formula id="inf46">
<mml:math id="m55">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf47">
<mml:math id="m56">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf48">
<mml:math id="m57">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> are the first, second and third component of the explained variance of a PCA performed on the coordinates related to the <inline-formula id="inf49">
<mml:math id="m58">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-carbons of the BS residues;<list list-type="bullet">
<list-item>
<p>pca_flatten_ratio_bs:</p>
</list-item>
</list>
<disp-formula id="e10">
<mml:math id="m59">
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>B</mml:mi>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
<list list-type="bullet">
<list-item>
<p>bs_mean_hydrophobicity: average hydrophobicity of the BS residues, according to the water orientation probability hydropathy scale (WOPHS) [<xref ref-type="bibr" rid="B45">45</xref>];</p>
</list-item>
<list-item>
<p>bs_delta_hydrophobicity: absolute difference in average hydrophobicity (according to the WOPHS) between the antibody and antigen BS residues.</p>
</list-item>
</list>
</p>
<p>The SASA values are measured using the Shrake&#x2013;Rupley &#x201c;rolling ball&#x201d; algorithm with a probe radius of <inline-formula id="inf50">
<mml:math id="m60">
<mml:mrow>
<mml:mn>1.40</mml:mn>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x30a;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and a density of 100 points/<inline-formula id="inf51">
<mml:math id="m61">
<mml:mrow>
<mml:msup>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x30a;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> via the <italic>Biopython</italic> library [<xref ref-type="bibr" rid="B46">46</xref>].</p>
<p>The third group features are common graph theory descriptors measured on two networks: an unweighted network, where all nodes corresponding to residues whose <inline-formula id="inf52">
<mml:math id="m62">
<mml:mrow>
<mml:mi>&#x3b1;</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-carbons are within 12<inline-formula id="inf53">
<mml:math id="m63">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>A</mml:mi>
</mml:mrow>
<mml:mo>&#x30a;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> are linked, and a weighted network, where to any edge <inline-formula id="inf54">
<mml:math id="m64">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> of the unweighted network is assigned a weight <inline-formula id="inf55">
<mml:math id="m65">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>t</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The following features belong to the third group:<list list-type="bullet">
<list-item>
<p>edge_density: edge density of the unweighted network</p>
</list-item>
</list>
<disp-formula id="e11">
<mml:math id="m66">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>where N is the number of nodes, <inline-formula id="inf56">
<mml:math id="m67">
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the adjacency matrix, i.e. <inline-formula id="inf57">
<mml:math id="m68">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> if <inline-formula id="inf58">
<mml:math id="m69">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf59">
<mml:math id="m70">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> are connected, 0 otherwise, and <inline-formula id="inf60">
<mml:math id="m71">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the degree of node <inline-formula id="inf61">
<mml:math id="m72">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>.<list list-type="bullet">
<list-item>
<p>mean_degree: average degree of the unweighted network</p>
</list-item>
</list>
<disp-formula id="e12">
<mml:math id="m73">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
<list list-type="bullet">
<list-item>
<p>mean_strength: average strength of the weighted network</p>
</list-item>
</list>
<disp-formula id="e13">
<mml:math id="m74">
<mml:mrow>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
<list list-type="bullet">
<list-item>
<p>network_diameter: diameter of the weighted network</p>
</list-item>
</list>
<disp-formula id="e14">
<mml:math id="m75">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>where <inline-formula id="inf62">
<mml:math id="m76">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the length of the shortest path between nodes <inline-formula id="inf63">
<mml:math id="m77">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf64">
<mml:math id="m78">
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> on the weighted network and it is measured via Dijkstra&#x2019;s algorithm;<list list-type="bullet">
<list-item>
<p>network_radius: radius of the weighted network</p>
</list-item>
</list>
<disp-formula id="e15">
<mml:math id="m79">
<mml:mrow>
<mml:mi>r</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>min</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>e</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:math>
<label>(15)</label>
</disp-formula>
<list list-type="bullet">
<list-item>
<p>mean_assortativity: average degree assortativity of the networks</p>
</list-item>
</list>
<disp-formula id="e16">
<mml:math id="m80">
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>n</mml:mi>
<mml:mi>m</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(16)</label>
</disp-formula>where <inline-formula id="inf65">
<mml:math id="m81">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is the frequency of edges linking nodes with degree <inline-formula id="inf66">
<mml:math id="m82">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf67">
<mml:math id="m83">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf68">
<mml:math id="m84">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the probability of a link to connect to a node with degree <inline-formula id="inf69">
<mml:math id="m85">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>, i.e. <inline-formula id="inf70">
<mml:math id="m86">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf71">
<mml:math id="m87">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the standard deviation of the distribution <inline-formula id="inf72">
<mml:math id="m88">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>.<list list-type="bullet">
<list-item>
<p>unweighted_mean_clustering: average clustering coefficient of the unweighted network</p>
</list-item>
</list>
<disp-formula id="e17">
<mml:math id="m89">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>u</mml:mi>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:math>
<label>(17)</label>
</disp-formula>
<list list-type="bullet">
<list-item>
<p>weighted_mean_clustering: average clustering coefficient of the weighted network</p>
</list-item>
</list>
<disp-formula id="e18">
<mml:math id="m90">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>C</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>w</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msup>
<mml:mo>;</mml:mo>
</mml:mrow>
</mml:math>
<label>(18)</label>
</disp-formula>
<list list-type="bullet">
<list-item>
<p>network_transitivity: transitivity of the networks</p>
</list-item>
</list>
<disp-formula id="e19">
<mml:math id="m91">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>h</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mi>h</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(19)</label>
</disp-formula>
</p>
<p>The network-related features are measured via the <italic>NetworkX</italic> library [<xref ref-type="bibr" rid="B47">47</xref>].</p>
<p>The set of 21 features has been reduced in order to avoid redundancy due to the presence of highly correlated features. In this instance, the smallest number of features such that any remaining pair has absolute Pearson correlation <inline-formula id="inf73">
<mml:math id="m92">
<mml:mrow>
<mml:mo>&#x3c;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>0.75 was removed. Linking the highly correlated features in an undirected unweighted network, this problem turns out to be equivalent to a minimum vertex cover problem (pruning the smallest number of nodes such that each remaining node is isolated), and is therefore exactly solvable via integer linear programming (ILP). The corresponding ILP formulation, with <inline-formula id="inf74">
<mml:math id="m93">
<mml:mrow>
<mml:mi>E</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> the set of edges,<disp-formula id="e20">
<mml:math id="m94">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mtext>Given:&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="">
<mml:mrow>
<mml:mtable class="cases">
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>if&#x2009;node&#x2009;</mml:mtext>
<mml:mi>i</mml:mi>
<mml:mtext>&#x2009;is&#x2009;removed</mml:mtext>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="left">
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mtext>otherwise</mml:mtext>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mtext>Minimize:&#x2009;</mml:mtext>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right"/>
<mml:mtd columnalign="left">
<mml:mtext>Subject&#x2009;to:&#x2009;</mml:mtext>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mo>&#x2200;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>E</mml:mi>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(20)</label>
</disp-formula>was solved via the <italic>PuLP</italic> modeler [<xref ref-type="bibr" rid="B48">48</xref>]. The remaining features were 15: pca_stretch_ratio, pca_flatten_ratio, pca_alignment_score, bs_sasa_ratio, ab_bs_size, ag_bs_size, pca_stretch_ratio_bs, pca_flatten_ratio_bs, bs_mean_hydrophobicity, mean_strength, network_radius, mean_assortativity, weighted_mean_clustering, network_transitivity.</p>
<p>For processing, each feature was normalized via the <italic>scikit-learn</italic> library [<xref ref-type="bibr" rid="B49">49</xref>], such that all the features share the same weight. The PCA was performed via <italic>scikit-learn</italic>, as well.</p>
</sec>
<sec id="s4-4">
<label>4.4</label>
<title>Neural network architecture and optimization</title>
<p>Every NN in this work has been defined via <italic>TensorFlow</italic> [<xref ref-type="bibr" rid="B50">50</xref>] and has the same structure: a two-hidden-layer feed-forward NN. Each hidden layer has a <italic>ReLU</italic> activation function; furthermore, the output layer of the NNs used in <xref ref-type="sec" rid="s2-5">Section 2.5</xref> is provided with a sigmoid activation function, in order to retrieve <inline-formula id="inf75">
<mml:math id="m95">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>Q</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">[</mml:mo>
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">]</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. While varying the number of parameters, the proportion of nodes in the first and second hidden layers is kept fixed at <inline-formula id="inf76">
<mml:math id="m96">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>:</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>. Therefore, naming <inline-formula id="inf77">
<mml:math id="m97">
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> the number of first layer nodes, one can retrieve the number of parameters <inline-formula id="inf78">
<mml:math id="m98">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>:<disp-formula id="e21">
<mml:math id="m99">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mi>M</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(21)</label>
</disp-formula>where <inline-formula id="inf79">
<mml:math id="m100">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>15</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula> is the number of input features. The NN weights are fitted via AdamW algorithm with learning rate 0.001, through 300 epochs for the classifiers (<xref ref-type="sec" rid="s2-4">Section 2.4</xref>) and 400 epochs for the predictor NNs (<xref ref-type="sec" rid="s2-5">Section 2.5</xref>). In <xref ref-type="sec" rid="s2-4">Section 2.4</xref> binary cross-entropy was used as loss function:<disp-formula id="e22">
<mml:math id="m101">
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">pred</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo stretchy="false">&#x2016;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">true</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">pred</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mi>l</mml:mi>
<mml:mi>n</mml:mi>
<mml:mfenced open="[" close="]">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="italic">true</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
<label>(22)</label>
</disp-formula>where <inline-formula id="inf80">
<mml:math id="m102">
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:mn>0,1</mml:mn>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> is the set of the possible classifications, i.e., &#x201c;Decoy&#x201d; or &#x201c;Native-like&#x201d;. In <xref ref-type="sec" rid="s2-5">Section 2.5</xref>, the mean squared error (MSE) was used as the loss function instead:<disp-formula id="e23">
<mml:math id="m103">
<mml:mrow>
<mml:mtext>MSE</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(23)</label>
</disp-formula>where <inline-formula id="inf81">
<mml:math id="m104">
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> is the number of docking predictions in the dataset and <inline-formula id="inf82">
<mml:math id="m105">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf83">
<mml:math id="m106">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are the values of pDockQ and DockQ associated with the <inline-formula id="inf84">
<mml:math id="m107">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>-th prediction. In <xref ref-type="sec" rid="s2-5">Section 2.5</xref>, in order to obtain the pDockQ values for the whole DINL dataset, it has been split into several complementary subsets, according to the proportion of the training set. The pDockQ values of each subset have been computed using the others as training set.</p>
</sec>
<sec id="s4-5">
<label>4.5</label>
<title>Statistical analysis</title>
<p>The area under the receiver operating characteristic curve (ROC AUC) was used to assess the quality of the classifications throughout <xref ref-type="sec" rid="s2-4">Section 2.4</xref>. Given two classes (Positive and Negative) and the distribution of a measure for each of the classes, the ROC curve is the parametric curve <inline-formula id="inf85">
<mml:math id="m108">
<mml:mrow>
<mml:mtext>ROC</mml:mtext>
<mml:mo>&#x2009;&#x2009;&#x2009;</mml:mo>
<mml:mtext>curve</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>t</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> representing the variation of the false positive rate <inline-formula id="inf86">
<mml:math id="m109">
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and the true positive rate <inline-formula id="inf87">
<mml:math id="m110">
<mml:mrow>
<mml:mi>t</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> as a function of the measure threshold <inline-formula id="inf88">
<mml:math id="m111">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> used to split the classes, where<disp-formula id="e24">
<mml:math id="m112">
<mml:mrow>
<mml:mtable class="aligned">
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>t</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>&#x23;&#x2009;true&#x2009;positives</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>&#x23;&#x2009;positives</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd columnalign="right">
<mml:mi>f</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mtd>
<mml:mtd columnalign="left">
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>&#x23;&#x2009;false&#x2009;positives</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>&#x23;&#x2009;negatives</mml:mtext>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(24)</label>
</disp-formula>
</p>
<p>The ROC AUC equals the probability that, given a random negative element and a random positive element, the positive element corresponds to a measure larger than the negative one. In this instance the Negative and the Positive classes were &#x201c;Decoy&#x201d; and &#x201c;Native-like&#x201d;. The ROC curves and the ROC AUCs were computed via the <italic>scikit-learn</italic> library [<xref ref-type="bibr" rid="B49">49</xref>].</p>
<p>Regarding the regression tasks (<xref ref-type="sec" rid="s2-5">Section 2.5</xref>), the assessment was done via Pearson correlation coefficient <inline-formula id="inf89">
<mml:math id="m113">
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> between any measure <inline-formula id="inf90">
<mml:math id="m114">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and the <inline-formula id="inf91">
<mml:math id="m115">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> score of the docking prediction:<disp-formula id="e25">
<mml:math id="m116">
<mml:mrow>
<mml:mi>&#x3c1;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x2329;</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>Q</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">&#x27e8;</mml:mo>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>Q</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x27e9;</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>D</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>k</mml:mi>
<mml:mi>Q</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(25)</label>
</disp-formula>The validity of the Pearson correlation was assessed by performing a p-value test of the null hypothesis that the distributions underlying the samples are uncorrelated and normally distributed. Both the Pearson correlation coefficient and the p-value were computed via the <italic>SciPy</italic> library [<xref ref-type="bibr" rid="B51">51</xref>].</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>AM: Investigation, Writing &#x2013; original draft, Data curation, Software. GR: Writing &#x2013; original draft, Conceptualization, Supervision. EM: Supervision, Conceptualization, Investigation, Writing &#x2013; original draft.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The handling editor SS declared a past co-authorship with the author GR.</p>
<p>The authors EM, GR declared that they were an editorial board member of Frontiers at the time of submission. This had no impact on the peer review process and the final decision.</p>
</sec>
<sec sec-type="ai-statement" id="s9">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/72820/overview">Sauro Succi</ext-link>, Italian Institute of Technology (IIT), Italy</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2370574/overview">Sebastiano Pilati</ext-link>, University of Camerino, Italy</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3269328/overview">Matthew McFee</ext-link>, University of Toronto, Canada</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Peace Chinedu-Nzereogu</surname>
<given-names>O</given-names>
</name>
<name>
<surname>Atoyebi</surname>
<given-names>TO</given-names>
</name>
<name>
<surname>Adebayo</surname>
<given-names>MA</given-names>
</name>
<name>
<surname>Kenneth Maduike</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Alebel Dejene</surname>
<given-names>T</given-names>
</name>
</person-group>
<collab>Tochukwu Excellent Okechukwu, and Yetunde Victoria</collab>. <article-title>Harnessing ai-driven crispr bioinformatics: transforming precision diagnostics for antimicrobial resistance and chemical pathology</article-title>. (<year>2025</year>).</mixed-citation>
</ref>
<ref id="B2">
<label>2.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>A comprehensive review and comparison of existing computational methods for protein function prediction</article-title>. <source>Brief Bioinform</source> (<year>2024</year>) <volume>25</volume>(<issue>4</issue>):<fpage>bbae289</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbae289</pub-id>
<pub-id pub-id-type="pmid">39003530</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<label>3.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bettanti</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Beccari</surname>
<given-names>AR</given-names>
</name>
<name>
<surname>Biccarino</surname>
<given-names>M</given-names>
</name>
</person-group>. <article-title>Exploring the future of biopharmaceutical drug discovery: can advanced ai platforms overcome current challenges?</article-title> <source>Discover Artif Intelligence</source> (<year>2024</year>) <volume>4</volume>(<issue>1</issue>):<fpage>102</fpage>. <pub-id pub-id-type="doi">10.1007/s44163-024-00188-3</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<label>4.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Callaway</surname>
<given-names>E</given-names>
</name>
</person-group>. <article-title>&#x2019;it will change everything&#x2019;: deepmind&#x2019;s ai makes gigantic leap in solving protein structures</article-title>. <source>Nature</source> (<year>2020</year>) <volume>588</volume>(<issue>7837</issue>):<fpage>203</fpage>&#x2013;<lpage>5</lpage>. <pub-id pub-id-type="doi">10.1038/d41586-020-03348-4</pub-id>
<pub-id pub-id-type="pmid">33257889</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<label>5.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jumper</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Evans</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Pritzel</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Green</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Figurnov</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Ronneberger</surname>
<given-names>O</given-names>
</name>
<etal/>
</person-group> <article-title>Highly accurate protein structure prediction with alphafold</article-title>. <source>Nature</source> (<year>2021</year>) <volume>596</volume>(<issue>7873</issue>):<fpage>583</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-021-03819-2</pub-id>
<pub-id pub-id-type="pmid">34265844</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<label>6.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wayment-Steele</surname>
<given-names>HK</given-names>
</name>
<name>
<surname>Ojoawo</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Otten</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Apitz</surname>
<given-names>JM</given-names>
</name>
<name>
<surname>Pitsawong</surname>
<given-names>W</given-names>
</name>
<name>
<surname>H&#xf6;mberger</surname>
<given-names>M</given-names>
</name>
<etal/>
</person-group> <article-title>Predicting multiple conformations <italic>via</italic> sequence clustering and alphafold2</article-title>. <source>Nature</source> (<year>2024</year>) <volume>625</volume>(<issue>7996</issue>):<fpage>832</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-023-06832-9</pub-id>
<pub-id pub-id-type="pmid">37956700</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<label>7.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Baek</surname>
<given-names>M</given-names>
</name>
<name>
<surname>DiMaio</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Anishchenko</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Dauparas</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ovchinnikov</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>GR</given-names>
</name>
<etal/>
</person-group> <article-title>Accurate prediction of protein structures and interactions using a three-track neural network</article-title>. <source>Science</source> (<year>2021</year>) <volume>373</volume>(<issue>6557</issue>):<fpage>871</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1126/science.abj8754</pub-id>
<pub-id pub-id-type="pmid">34282049</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<label>8.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>T-Y</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>C</given-names>
</name>
</person-group>. <article-title>Application of computational biology and artificial intelligence in drug design</article-title>. <source>Int Journal Molecular Sciences</source> (<year>2022</year>) <volume>23</volume>(<issue>21</issue>):<fpage>13568</fpage>. <pub-id pub-id-type="doi">10.3390/ijms232113568</pub-id>
<pub-id pub-id-type="pmid">36362355</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<label>9.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Norman</surname>
<given-names>RA</given-names>
</name>
<name>
<surname>Ambrosetti</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Bonvin</surname>
<given-names>AMJJ</given-names>
</name>
<name>
<surname>Colwell</surname>
<given-names>LJ</given-names>
</name>
<name>
<surname>Kelm</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Kumar</surname>
<given-names>S</given-names>
</name>
<etal/>
</person-group> <article-title>Computational approaches to therapeutic antibody design: established methods and emerging trends</article-title>. <source>Brief Bioinformatics</source> (<year>2020</year>) <volume>21</volume>(<issue>5</issue>):<fpage>1549</fpage>&#x2013;<lpage>67</lpage>. <pub-id pub-id-type="doi">10.1093/bib/bbz095</pub-id>
<pub-id pub-id-type="pmid">31626279</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<label>10.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ambrosetti</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Olsen</surname>
<given-names>TH</given-names>
</name>
<name>
<surname>Paolo Olimpieri</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Jim&#xe9;nez-Garc&#xed;a</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Milanetti</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Marcatili</surname>
<given-names>P</given-names>
</name>
<etal/>
</person-group> <article-title>proabc-2: prediction of antibody contacts v2 and its application to information-driven docking</article-title>. <source>Bioinformatics</source> (<year>2020</year>) <volume>36</volume>(<issue>20</issue>):<fpage>5107</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btaa644</pub-id>
<pub-id pub-id-type="pmid">32683441</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<label>11.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Milanetti</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Miotto</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Di Rienzo</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Monti</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Gosti</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Ruocco</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>2d zernike polynomial expansion: finding the protein-protein binding regions</article-title>. <source>Comput Struct Biotechnol J</source> (<year>2021</year>) <volume>19</volume>:<fpage>29</fpage>&#x2013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2020.11.051</pub-id>
<pub-id pub-id-type="pmid">33363707</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<label>12.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Di Rienzo</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Milanetti</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Lepore</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Olimpieri</surname>
<given-names>PP</given-names>
</name>
<name>
<surname>Tramontano</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Superposition-free comparison and clustering of antibody binding sites: implications for the prediction of the nature of their antigen</article-title>. <source>Scientific Reports</source> (<year>2017</year>) <volume>7</volume>(<issue>1</issue>):<fpage>45053</fpage>. <pub-id pub-id-type="doi">10.1038/srep45053</pub-id>
<pub-id pub-id-type="pmid">28338016</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<label>13.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Di Rienzo</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Miotto</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Desantis</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Grassmann</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Ruocco</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Milanetti</surname>
<given-names>E</given-names>
</name>
</person-group>. <article-title>Dynamical changes of sars-cov-2 spike variants in the highly immunogenic regions impact the viral antibodies escaping</article-title>. <source>Proteins: Struct Funct Bioinformatics</source> (<year>2023</year>) <volume>91</volume>(<issue>8</issue>):<fpage>1116</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1002/prot.26497</pub-id>
<pub-id pub-id-type="pmid">37078559</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<label>14.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Abramson</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Adler</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Dunger</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Evans</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Green</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Pritzel</surname>
<given-names>A</given-names>
</name>
<etal/>
</person-group> <article-title>Accurate structure prediction of biomolecular interactions with alphafold 3</article-title>. <source>Nature</source> (<year>2024</year>) <volume>630</volume>(<issue>8016</issue>):<fpage>493</fpage>&#x2013;<lpage>500</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-024-07487-w</pub-id>
<pub-id pub-id-type="pmid">38718835</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<label>15.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nooren</surname>
<given-names>IMA</given-names>
</name>
<name>
<surname>Thornton</surname>
<given-names>JM</given-names>
</name>
</person-group>. <article-title>Diversity of protein&#x2013;protein interactions</article-title>. <source>The EMBO Journal</source> (<year>2003</year>). <pub-id pub-id-type="doi">10.1093/emboj/cdg359</pub-id>
<pub-id pub-id-type="pmid">12853464</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<label>16.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grassmann</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Miotto</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Desantis</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Di Rienzo</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Tartaglia</surname>
<given-names>GG</given-names>
</name>
<name>
<surname>Pastore</surname>
<given-names>A</given-names>
</name>
<etal/>
</person-group> <article-title>Computational approaches to predict protein&#x2013;protein interactions in crowded cellular environments</article-title>. <source>Chem Rev</source> (<year>2024</year>) <volume>124</volume>(<issue>7</issue>):<fpage>3932</fpage>&#x2013;<lpage>77</lpage>. <pub-id pub-id-type="doi">10.1021/acs.chemrev.3c00550</pub-id>
<pub-id pub-id-type="pmid">38535831</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<label>17.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>F</given-names>
</name>
</person-group>. <article-title>Antibinder: utilizing bidirectional attention and hybrid encoding for precise antibody&#x2013;antigen interaction prediction</article-title>. <source>Brief Bioinform</source> (<year>2024</year>) <volume>26</volume>(<issue>1</issue>):<fpage>bbaf008</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbaf008</pub-id>
<pub-id pub-id-type="pmid">39831890</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<label>18.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>De Lauro</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Di Rienzo</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Miotto</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Olimpieri</surname>
<given-names>PP</given-names>
</name>
<name>
<surname>Milanetti</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Ruocco</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Shape complementarity optimization of antibody&#x2013;antigen interfaces: the application to sars-cov-2 spike protein</article-title>. <source>Front Mol Biosci</source> (<year>2022</year>) <volume>9</volume>:<fpage>874296</fpage>. <pub-id pub-id-type="doi">10.3389/fmolb.2022.874296</pub-id>
<pub-id pub-id-type="pmid">35669567</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<label>19.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>W</given-names>
</name>
<etal/>
</person-group> <article-title>Mvsf-ab: accurate antibody&#x2013;antigen binding affinity prediction <italic>via</italic> multi-view sequence feature learning</article-title>. <source>Bioinformatics</source> (<year>2025</year>) <volume>41</volume>(<issue>5</issue>):<fpage>btae579</fpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btae579</pub-id>
<pub-id pub-id-type="pmid">39363630</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<label>20.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Michalewicz</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Barahona</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Bravi</surname>
<given-names>B</given-names>
</name>
</person-group>. <article-title>Antipasti: interpretable prediction of antibody binding affinity exploiting normal modes and deep learning</article-title>. <source>Structure</source> (<year>2024</year>) <volume>32</volume>(<issue>12</issue>):<fpage>2422</fpage>&#x2013;<lpage>34</lpage>. <pub-id pub-id-type="doi">10.1016/j.str.2024.10.001</pub-id>
<pub-id pub-id-type="pmid">39461331</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<label>21.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kozakov</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Hall</surname>
<given-names>DR</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Porter</surname>
<given-names>KA</given-names>
</name>
<name>
<surname>Padhorny</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Yueh</surname>
<given-names>C</given-names>
</name>
<etal/>
</person-group> <article-title>The cluspro web server for protein&#x2013;protein docking</article-title>. <source>Nat Protocols</source> (<year>2017</year>) <volume>12</volume>(<issue>2</issue>):<fpage>255</fpage>&#x2013;<lpage>78</lpage>. <pub-id pub-id-type="doi">10.1038/nprot.2016.169</pub-id>
<pub-id pub-id-type="pmid">28079879</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<label>22.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jim&#xe9;nez-Garc&#xed;a</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Roel-Touris</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Barradas-Bautista</surname>
<given-names>D</given-names>
</name>
</person-group>. <article-title>The lightdock server: artificial intelligence-powered modeling of macromolecular interactions</article-title>. <source>Nucleic Acids Research</source> (<year>2023</year>) <volume>51</volume>(<issue>W1</issue>):<fpage>W298</fpage>&#x2013;<lpage>W304</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkad327</pub-id>
<pub-id pub-id-type="pmid">37140054</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<label>23.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Weng</surname>
<given-names>Z</given-names>
</name>
</person-group>. <article-title>Zdock: an initial-stage protein-docking algorithm</article-title>. <source>Proteins: Struct Funct Bioinformatics</source> (<year>2003</year>) <volume>52</volume>(<issue>1</issue>):<fpage>80</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1002/prot.10389</pub-id>
<pub-id pub-id-type="pmid">12784371</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<label>24.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yan</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>H</given-names>
</name>
<name>
<surname>He</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>S-Y</given-names>
</name>
</person-group>. <article-title>The hdock server for integrated protein&#x2013;protein docking</article-title>. <source>Nat Protocols</source> (<year>2020</year>) <volume>15</volume>(<issue>5</issue>):<fpage>1829</fpage>&#x2013;<lpage>52</lpage>. <pub-id pub-id-type="doi">10.1038/s41596-020-0312-x</pub-id>
<pub-id pub-id-type="pmid">32269383</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<label>25.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dominguez</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Boelens</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Bonvin</surname>
<given-names>AMJJ</given-names>
</name>
</person-group>. <article-title>Haddock: a protein-protein docking approach based on biochemical or biophysical information</article-title>. <source>J Am Chem Soc</source> (<year>2003</year>) <volume>125</volume>(<issue>7</issue>):<fpage>1731</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1021/ja026939x</pub-id>
<pub-id pub-id-type="pmid">12580598</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<label>26.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Gong</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Abag-docking benchmark: a non-redundant structure benchmark dataset for antibody&#x2013;antigen computational docking</article-title>. <source>Brief Bioinform</source> (<year>2024</year>) <volume>25</volume>(<issue>2</issue>):<fpage>bbae048</fpage>. <pub-id pub-id-type="doi">10.1093/bib/bbae048</pub-id>
<pub-id pub-id-type="pmid">38385879</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<label>27.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ambrosetti</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Jim&#xe9;nez-Garc&#xed;a</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Roel-Touris</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Bonvin</surname>
<given-names>AMJJ</given-names>
</name>
</person-group>. <article-title>Modeling antibody-antigen complexes by information-driven docking</article-title>. <source>Structure</source> (<year>2020</year>) <volume>28</volume>(<issue>1</issue>):<fpage>119</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1016/j.str.2019.10.011</pub-id>
<pub-id pub-id-type="pmid">31727476</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<label>28.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vittorio</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Lunghini</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Morerio</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Gadioli</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Orlandini</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Silva</surname>
<given-names>P</given-names>
</name>
<etal/>
</person-group> <article-title>Addressing docking pose selection with structure-based deep learning: recent advances, challenges and opportunities</article-title>. <source>Comput Struct Biotechnol J</source> (<year>2024</year>) <volume>23</volume>:<fpage>2141</fpage>&#x2013;<lpage>51</lpage>. <pub-id pub-id-type="doi">10.1016/j.csbj.2024.05.024</pub-id>
<pub-id pub-id-type="pmid">38827235</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<label>29.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dong</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Deepumqa-x: comprehensive and insightful estimation of model accuracy for protein single-chain and complex</article-title>. <source>Nucleic Acids Res</source> (<year>2025</year>) <volume>53</volume>(<issue>W1</issue>):<fpage>W219</fpage>&#x2013;<lpage>W227</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkaf380</pub-id>
<pub-id pub-id-type="pmid">40322921</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<label>30.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Basu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Wallner</surname>
<given-names>B</given-names>
</name>
</person-group>. <article-title>Dockq: a quality measure for protein-protein docking models</article-title>. <source>PLOS ONE</source> (<year>2016</year>) <volume>11</volume>(<issue>8</issue>):<fpage>e0161879</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0161879</pub-id>
<pub-id pub-id-type="pmid">27560519</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<label>31.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lensink</surname>
<given-names>MF</given-names>
</name>
<name>
<surname>Wodak</surname>
<given-names>SJ</given-names>
</name>
</person-group>. <article-title>Docking, scoring, and affinity prediction in capri</article-title>. <source>Proteins</source> (<year>2013</year>) <volume>81</volume>(<issue>12</issue>):<fpage>2082</fpage>&#x2013;<lpage>95</lpage>. <pub-id pub-id-type="doi">10.1002/prot.24428</pub-id>
<pub-id pub-id-type="pmid">24115211</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<label>32.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Collins</surname>
<given-names>KW</given-names>
</name>
<name>
<surname>Copeland</surname>
<given-names>MM</given-names>
</name>
<name>
<surname>Brysbaert</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Wodak</surname>
<given-names>SJ</given-names>
</name>
<name>
<surname>Bonvin</surname>
<given-names>AMJJ</given-names>
</name>
<name>
<surname>Kundrotas</surname>
<given-names>PJ</given-names>
</name>
<etal/>
</person-group> <article-title>Capri-q: the capri resource evaluating the quality of predicted structures of protein complexes</article-title>. <source>J Mol Biol</source> (<year>2024</year>) <volume>436</volume>(<issue>17</issue>):<fpage>168540</fpage>. <pub-id pub-id-type="doi">10.1016/j.jmb.2024.168540</pub-id>
<pub-id pub-id-type="pmid">39237205</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<label>33.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Graber</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Stockinger</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Meyer</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Mishra</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Horn</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Buller</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>Resolving data bias improves generalization in binding affinity prediction</article-title>. <source>Nat Mach Intell</source> (<year>2025</year>) <volume>7</volume>:<fpage>1713</fpage>&#x2013;<lpage>25</lpage>. <pub-id pub-id-type="doi">10.1038/s42256-025-01124-5</pub-id>
<pub-id pub-id-type="pmid">41143208</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<label>34.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pellicani</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Dal Ben</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Perali</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Pilati</surname>
<given-names>S</given-names>
</name>
</person-group>. <article-title>Machine learning scoring functions for drug discovery from experimental and computer-generated protein&#x2013;ligand structures: towards per-target scoring functions</article-title>. <source>Molecules</source> (<year>2023</year>) <volume>28</volume>(<issue>4</issue>):<fpage>1661</fpage>. <pub-id pub-id-type="doi">10.3390/molecules28041661</pub-id>
<pub-id pub-id-type="pmid">36838647</pub-id>
</mixed-citation>
</ref>
<ref id="B35">
<label>35.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>N</given-names>
</name>
</person-group>. <article-title>Predicting or pretending: artificial intelligence for protein-ligand interactions lack of sufficiently large and unbiased datasets</article-title>. <source>Front Pharmacol</source> (<year>2020</year>) <volume>11</volume>:<fpage>69</fpage>. <pub-id pub-id-type="doi">10.3389/fphar.2020.00069</pub-id>
<pub-id pub-id-type="pmid">32161539</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<label>36.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ahmad</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Jadhav</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Shinde</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Jagtap</surname>
<given-names>V</given-names>
</name>
<name>
<surname>Girase</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Patel</surname>
<given-names>H</given-names>
</name>
</person-group>. <article-title>Optimizing bedaquiline for cardiotoxicity by structure based virtual screening, dft analysis and molecular dynamic simulation studies to identify selective mdr-tb inhibitors</article-title>. <source>In Silico Pharmacol</source> (<year>2021</year>) <volume>9</volume>(<issue>1</issue>):<fpage>23</fpage>. <pub-id pub-id-type="doi">10.1007/s40203-021-00086-x</pub-id>
<pub-id pub-id-type="pmid">33854869</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<label>37.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Alandijany</surname>
<given-names>TA</given-names>
</name>
<name>
<surname>El-Daly</surname>
<given-names>MM</given-names>
</name>
<name>
<surname>Ahmed</surname>
<given-names>MT</given-names>
</name>
<name>
<surname>Bajrai</surname>
<given-names>LH</given-names>
</name>
<name>
<surname>Khateb</surname>
<given-names>AM</given-names>
</name>
<name>
<surname>Alsaady</surname>
<given-names>IM</given-names>
</name>
<etal/>
</person-group> <article-title>Investigating the mechanism of action of anti-dengue compounds as potential binders of zika virus rna-dependent rna polymerase</article-title>. <source>Viruses</source> (<year>2023</year>) <volume>15</volume>(<issue>7</issue>):<fpage>1501</fpage>. <pub-id pub-id-type="doi">10.3390/v15071501</pub-id>
<pub-id pub-id-type="pmid">37515188</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<label>38.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>SY</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>An iterative knowledge-based scoring function for protein-protein recognition</article-title>. <source>Proteins</source> (<year>2008</year>) <volume>72</volume>(<issue>2</issue>):<fpage>557</fpage>&#x2013;<lpage>79</lpage>. <pub-id pub-id-type="doi">10.1002/prot.21949</pub-id>
<pub-id pub-id-type="pmid">18247354</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<label>39.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hou</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W</given-names>
</name>
</person-group>. <article-title>Assessing the performance of the mm/pbsa and mm/gbsa methods. 1. the accuracy of binding free energy calculations based on molecular dynamics simulations</article-title>. <source>J Chem Inf Model</source> (<year>2011</year>) <volume>51</volume>(<issue>1</issue>):<fpage>69</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1021/ci100275a</pub-id>
<pub-id pub-id-type="pmid">21117705</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<label>40.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Weng</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>O</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>M</given-names>
</name>
<etal/>
</person-group> <article-title>Hawkdock version 2: an updated web server to predict and analyze the structures of protein&#x2013;protein complexes</article-title>. <source>Nucleic Acids Res</source> (<year>2025</year>) <volume>53</volume>(<issue>W1</issue>):<fpage>W306</fpage>&#x2013;<lpage>W315</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkaf379</pub-id>
<pub-id pub-id-type="pmid">40326522</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<label>41.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dunbar</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Krawczyk</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Leem</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Baker</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Fuchs</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Georges</surname>
<given-names>G</given-names>
</name>
<etal/>
</person-group> <article-title>Sabdab: the structural antibody database</article-title>. <source>Nucleic Acids Research</source> (<year>2014</year>) <volume>42</volume>(<issue>D1</issue>):<fpage>D1140</fpage>&#x2013;<lpage>D1146</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkt1043</pub-id>
<pub-id pub-id-type="pmid">24214988</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<label>42.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fu</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Niu</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W</given-names>
</name>
</person-group>. <article-title>Cd-hit: accelerated for clustering the next-generation sequencing data</article-title>. <source>Bioinformatics</source> (<year>2012</year>) <volume>28</volume>(<issue>23</issue>):<fpage>3150</fpage>&#x2013;<lpage>2</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bts565</pub-id>
<pub-id pub-id-type="pmid">23060610</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<label>43.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Godzik</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Cd-hit: a fast program for clustering and comparing large sets of protein or nucleotide sequences</article-title>. <source>Bioinformatics</source> (<year>2006</year>) <volume>22</volume>(<issue>13</issue>):<fpage>1658</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btl158</pub-id>
<pub-id pub-id-type="pmid">16731699</pub-id>
</mixed-citation>
</ref>
<ref id="B44">
<label>44.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Niu</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W</given-names>
</name>
</person-group>. <article-title>Cd-hit suite: a web server for clustering and comparing biological sequences</article-title>. <source>Bioinformatics</source> (<year>2010</year>) <volume>26</volume>(<issue>5</issue>):<fpage>680</fpage>&#x2013;<lpage>2</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btq003</pub-id>
<pub-id pub-id-type="pmid">20053844</pub-id>
</mixed-citation>
</ref>
<ref id="B45">
<label>45.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bonella</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Raimondo</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Milanetti</surname>
<given-names>E</given-names>
</name>
<name>
<surname>Tramontano</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Ciccotti</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>Mapping the hydropathy of amino acids based on their local solvation structure</article-title>. <source>The J Phys Chem B</source> (<year>2014</year>) <volume>118</volume>(<issue>24</issue>):<fpage>6604</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1021/jp500980x</pub-id>
<pub-id pub-id-type="pmid">24845543</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<label>46.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cock</surname>
<given-names>PJA</given-names>
</name>
<name>
<surname>Antao</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>JT</given-names>
</name>
<name>
<surname>Chapman</surname>
<given-names>BA</given-names>
</name>
<name>
<surname>Cox</surname>
<given-names>CJ</given-names>
</name>
<name>
<surname>Dalke</surname>
<given-names>A</given-names>
</name>
<etal/>
</person-group> <article-title>Biopython: freely available python tools for computational molecular biology and bioinformatics</article-title>. <source>Bioinformatics</source> (<year>2009</year>) <volume>25</volume>(<issue>11</issue>):<fpage>1422</fpage>&#x2013;<lpage>3</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btp163</pub-id>
<pub-id pub-id-type="pmid">19304878</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<label>47.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Hagberg</surname>
<given-names>AA</given-names>
</name>
<name>
<surname>Schult</surname>
<given-names>DA</given-names>
</name>
<name>
<surname>Swart</surname>
<given-names>PJ</given-names>
</name>
</person-group>. <article-title>Exploring network structure, dynamics, and function using networkx</article-title>. In: <person-group person-group-type="editor">
<name>
<surname>Varoquaux</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Vaught</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Millman</surname>
<given-names>J</given-names>
</name>
</person-group>, editors. <source>Proceedings of the 7th python in science conference</source>. <publisher-loc>Pasadena, CA USA</publisher-loc> (<year>2008</year>). p. <fpage>11</fpage>&#x2013;<lpage>5</lpage>.</mixed-citation>
</ref>
<ref id="B48">
<label>48.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mitchell</surname>
<given-names>S</given-names>
</name>
<name>
<surname>O&#x2019;Sullivan</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Dunning</surname>
<given-names>I</given-names>
</name>
</person-group>. <article-title>PuLP: a linear programming toolkit for Python</article-title>. (<year>2011</year>).</mixed-citation>
</ref>
<ref id="B49">
<label>49.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O</given-names>
</name>
<etal/>
</person-group> <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J Machine Learn Res</source> (<year>2011</year>) <volume>12</volume>:<fpage>2825</fpage>&#x2013;<lpage>30</lpage>.</mixed-citation>
</ref>
<ref id="B50">
<label>50.</label>
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Abadi</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Barham</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>A</given-names>
</name>
<name>
<surname>Dean</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>Tensorflow: a system for large-scale machine learning</article-title>. In: <source>Proceedings of the 12th USENIX conference on operating systems design and implementation, OSDI&#x2019;16</source>. <publisher-loc>USA</publisher-loc>: <publisher-name>USENIX Association</publisher-name> (<year>2016</year>). p. <fpage>265</fpage>&#x2013;<lpage>83</lpage>.</mixed-citation>
</ref>
<ref id="B51">
<label>51.</label>
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Virtanen</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Gommers</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Oliphant</surname>
<given-names>TE</given-names>
</name>
<name>
<surname>Haberland</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Reddy</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Cournapeau</surname>
<given-names>D</given-names>
</name>
<etal/>
</person-group> <article-title>SciPy 1.0: fundamental algorithms for scientific computing in Python</article-title>. <source>Nat Methods</source> (<year>2020</year>) <volume>17</volume>:<fpage>261</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1038/s41592-019-0686-2</pub-id>
<pub-id pub-id-type="pmid">32015543</pub-id>
</mixed-citation>
</ref>
</ref-list>
</back>
</article>