<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mol. Biosci.</journal-id>
<journal-title>Frontiers in Molecular Biosciences</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mol. Biosci.</abbrev-journal-title>
<issn pub-type="epub">2296-889X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">756075</article-id>
<article-id pub-id-type="doi">10.3389/fmolb.2021.756075</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Molecular Biosciences</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Accurate Prediction of Hydration Sites of Proteins Using Energy Model With Atom Embedding</article-title>
<alt-title alt-title-type="left-running-head">Huang et&#x20;al.</alt-title>
<alt-title alt-title-type="right-running-head">Protein Hydration Site Prediction</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Huang</surname>
<given-names>Pin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1437224/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Xing</surname>
<given-names>Haoming</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1456090/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zou</surname>
<given-names>Xun</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Han</surname>
<given-names>Qi</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1456088/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Ke</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1462963/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Sun</surname>
<given-names>Xiangyan</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1455964/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wu</surname>
<given-names>Junqiu</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1435462/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Fan</surname>
<given-names>Jie</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<label>
<sup>1</sup>
</label>College of Life Sciences, Beijing Normal University, <addr-line>Beijing</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<label>
<sup>2</sup>
</label>Accutar Biotechnology Inc., <addr-line>Brooklyn</addr-line>, <addr-line>NY</addr-line>, <country>United&#x20;States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/761473/overview">Yong Wang</ext-link>, Zhejiang University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1242685/overview">Wei Chen</ext-link>, Independent researcher, Austin, United&#x20;States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/391804/overview">Ye Mei</ext-link>, East China Normal University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Pin Huang, <email>pinhuang@accutarbio.com</email>
</corresp>
<fn fn-type="other">
<p>This article was submitted to Biological Modeling and Simulation, a section of the journal Frontiers in Molecular Biosciences</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>20</day>
<month>09</month>
<year>2021</year>
</pub-date>
<pub-date pub-type="collection">
<year>2021</year>
</pub-date>
<volume>8</volume>
<elocation-id>756075</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>08</month>
<year>2021</year>
</date>
<date date-type="accepted">
<day>02</day>
<month>09</month>
<year>2021</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2021 Huang, Xing, Zou, Han, Liu, Sun, Wu and Fan.</copyright-statement>
<copyright-year>2021</copyright-year>
<copyright-holder>Huang, Xing, Zou, Han, Liu, Sun, Wu and Fan</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these&#x20;terms.</p>
</license>
</permissions>
<abstract>
<p>We propose a method based on neural networks to accurately predict hydration sites in proteins. In our approach, high-quality data of protein structures are used to parametrize our neural network model, which is a differentiable score function that can evaluate an arbitrary position in 3D structures on proteins and predict the nearest water molecule that is not present. The score function is further integrated into our water placement algorithm to generate explicit hydration sites. In experiments on the OppA protein dataset used in previous studies and our selection of protein structures, our method achieves the highest model quality in terms of F1 score, compared to several previous studies.</p>
</abstract>
<kwd-group>
<kwd>machine learning</kwd>
<kwd>protein</kwd>
<kwd>hydration sites</kwd>
<kwd>atom embedding</kwd>
<kwd>prediction</kwd>
</kwd-group>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<sec id="s1-1">
<title>1.1 Protein Hydration Prediction</title>
<p>Solvation of biomolecules is essential for their functionality, and water molecules are crucial in various biochemical processes, such as bridging secondary structures of proteins, acting as proton donors/acceptors in proton wires, and discriminating ligands at binding sites, all of which require knowledge about positions and orientations of explicit water molecules (<xref ref-type="bibr" rid="B3">Bellissent-Funel et&#x20;al., 2016</xref>). Among these functions, water-mediated protein-ligand interactions are of great interest from the computational side. In an analysis of 392&#x20;high-resolution protein structures, 76% of the protein-ligand complexes had at least one bridging water molecule at the interface (<xref ref-type="bibr" rid="B16">Lu et&#x20;al., 2007</xref>). Accordingly, many docking programs have been developed to incorporate explicit water molecules in the docking process and yield prediction results, such as WScore (<xref ref-type="bibr" rid="B22">Murphy et&#x20;al., 2016</xref>) from Schr&#xf6;dinger, Rosetta (<xref ref-type="bibr" rid="B14">Lemmon and Meiler, 2013</xref>), and AutoDock4 (<xref ref-type="bibr" rid="B6">Forli and Olson, 2012</xref>). Better understanding and modeling of this interaction is utilized in structure-based drug design, where drug candidates are modified to replace water molecules in the binding pocket, primarily for entropic gains (<xref ref-type="bibr" rid="B5">Bucher et&#x20;al., 2018</xref>).</p>
<p>In the laboratory, water positions in protein structures are mainly obtained by X-ray crystallography, and crystallographic data have shown that protein structures of 1&#x00a0;&#x00C5; resolution contain 66<italic>%</italic> more resolved water molecules than structures of 2&#x00a0;&#x00C5; resolution (<xref ref-type="bibr" rid="B18">Maurer and Oostenbrink, 2019</xref>). Despite this, over 50<italic>%</italic> of deposited structures in the Protein Data Bank (PDB) database have a resolution larger than 2.0&#x00a0;&#x00c5; (<xref ref-type="bibr" rid="B1">RCSB, 2020</xref>), which indicates that plenty of crystalline water molecules are not resolved due to the transient dynamics of water molecules and a lack of local information in the density map. Furthermore, protein structures, either obtained by experimental techniques such as nuclear magnetic resonance (NMR) or predicted through computational tools such as AlphaFold (<xref ref-type="bibr" rid="B30">Senior et&#x20;al., 2020</xref>), provide no information about water molecules.</p>
<p>There is an unmet need for a reliable predictive model for protein hydration that can be integrated into and benefit other modeling and experimental systems. However, how to implement such a model by exploiting a limited amount of experimental data is still being explored.</p>
</sec>
<sec id="s1-2">
<title>1.2 Related Works</title>
<p>Many <italic>force field</italic> based methods have been proposed, given an abundance of simulation programs that already incorporate some established physical models, with built-in approaches for simulations such as Molecular Dynamics (MD) and Monte Carlo (MC) available. For the prediction of explicit hydration sites, an extra step is needed to analyze and cluster the trajectory or histogram of simulations performed on an equilibrated system comprising a protein macromolecule solvated by explicit water molecules. Examples of MD-based methods are WaterMap (<xref ref-type="bibr" rid="B24">Schrodinger, 2020</xref>), which is based on the Inhomogeneous Fluid approach to Solvation Thermodynamics (IFST) (<xref ref-type="bibr" rid="B13">Lazaridis, 1998</xref>), and WATSite (<xref ref-type="bibr" rid="B11">Hu and Lill, 2014</xref>; <xref ref-type="bibr" rid="B34">Yang et&#x20;al., 2017</xref>), which integrates over a probability density function of water molecules to estimate the entropic change. Both of these methods claim an effective consideration of entropic terms, which are believed to contribute substantially to the free energy change in cases like solvation of cavities (<xref ref-type="bibr" rid="B36">Young et&#x20;al., 2007</xref>). The main disadvantage is the time cost, as MD simulations sometimes struggle to escape local minima and fail to sample the state space efficiently. One attempt to circumvent this problem is an MC-based method called JAWS (Just Add Water moleculeS) (<xref ref-type="bibr" rid="B19">Michel et&#x20;al., 2009</xref>) that employs a grid-based Metropolis Sampling of water molecules to directly estimate the free energy. Results from JAWS are satisfactory for isolated cavities, but are not ideal for rather exposed grids due to convergence issues.</p>
<p>The reference interaction site model (RISM) (<xref ref-type="bibr" rid="B2">Beglov and Roux, 1997</xref>) with the Kovalenko-Hirata (KH) closure, or the 3D-RISM (<xref ref-type="bibr" rid="B12">Kovalenko and Hirata, 1999</xref>), on the other hand, calculates the 3D solvent distribution function directly via the statistical mechanics-based integral equation of liquids, saving simulation time. The distribution function has been used for hydration-site analysis of biomolecules (<xref ref-type="bibr" rid="B35">Yoshidome et&#x20;al., 2020</xref>), and also has been utilized as an intermediate to yield explicit hydration sites through the combined use of water-placement algorithms such as Placevent (<xref ref-type="bibr" rid="B31">Sindhikara et&#x20;al., 2012</xref>), which iteratively finds maximum points of the distribution function for atom insertion, and GAsol (<xref ref-type="bibr" rid="B7">Fusani et&#x20;al., 2018</xref>), a genetic algorithm that decides the occupancy of selected potential hydration sites. The quality of 3D-RISM results depends on the force field parameters used in its calculations; thus, it requires careful parameter choices before being used for predictive purposes for specific systems (<xref ref-type="bibr" rid="B28">Roy and Kovalenko, 2021</xref>). <xref ref-type="bibr" rid="B17">Masters et&#x20;al. (2018)</xref> studied the combination of WATSite and 3D-RISM with GAsol and claimed a better prediction by the joint&#x20;model.</p>
<p>Methods that utilize empirical, ad hoc functions for energy estimation of water molecules have been widely adopted for their rapidity. For instance, one of the first attempts to predict hydration sites of protein, GRID (<xref ref-type="bibr" rid="B9">Goodford, 1985</xref>), reported over 30&#x20;years ago, evaluates the energy of water molecules at certain grid points by a combination of empirical functions (Lennard-Jones, electrostatic and hydrogen bond). Some individual cases were analyzed in this work using contours of energy isosurfaces as a rough depiction of minima of the energy function in space, but no systematic assessment on the predictive power was performed.</p>
<p>The WarPP (WateR Placement Procedure) (<xref ref-type="bibr" rid="B23">Nittinger et&#x20;al., 2018</xref>) method is built on an empirical score of water molecules based on interaction geometries dedicated to hydrogen bond modeling, which is then parametrized manually through large-scale experimental data. The specially chosen score function is continuously differentiable, and thus gradient-optimizable. Another method called GalaxyWater-wKGB (<xref ref-type="bibr" rid="B10">Heo et&#x20;al., 2021</xref>) uses a generalized Born model that also considers hydrogen bond orientation and distance; more importantly, it also includes the solvent accessibility between a protein atom and a water oxygen atom. This method was shown to have a recovery rate (it recovers about 80% of crystallographic waters at the cost of producing seven to eight times the number of water molecules) similar to that of methods like 3D-RISM while being 180&#xa0;times faster.</p>
<p>Recently, a method named Hydramap (<xref ref-type="bibr" rid="B15">Li et&#x20;al., 2020</xref>) was proposed to estimate the energy in &#x201c;statistical potentials&#x201d;, which quantifies pairwise interactions between water molecules and atoms of protein by counting the occurrence of atoms of certain types near a crystalline water molecule in experimental data. The resulting density map of the statistical potential is then clustered to predict explicit water sites. Although the mean-field strategy significantly reduced the computational cost, this method falls short of the performance of MD-based methods in high-resolution structures, possibly because a coarse grid is used for the placement of water molecules.</p>
<p>In addition to simulation-based methods, docking-based methods like WaterDock (<xref ref-type="bibr" rid="B26">Ross et&#x20;al., 2012</xref>) have been developed. WaterDock directly treats water molecules as ligands and uses the ligand-docking program AutoDock Vina (<xref ref-type="bibr" rid="B33">Trott and Olson, 2010</xref>) to predict the docking position of the water molecule. The updated WaterDock2.0&#x20;(<xref ref-type="bibr" rid="B32">Sridhar et&#x20;al., 2017</xref>) includes explicit water sites summarized from MD simulations for each functional group, reporting a lower false positive rate. Another related work that builds on WaterDock and Dowser (<xref ref-type="bibr" rid="B20">Morozenko et&#x20;al., 2014</xref>) is the Dowser&#x2b;&#x2b; program (<xref ref-type="bibr" rid="B21">Morozenko and Stuchebrukhov, 2016</xref>). Dowser&#x2b;&#x2b; takes Dowser&#x2019;s emphasis on the charge-dipole interactions in energy calculation, fixes issues like crashing water sites, and extends the scope of prediction of WaterDock from near the binding pocket to the whole protein. Although Dowser&#x2b;&#x2b; outperforms its predecessors, there is a constant underestimation of the number of water molecules, the reasons for which are speculated to be the limited number of predictions allowed in WaterDock and the independent insertion of water molecules with no water-water interaction considered.</p>
<p>Several attempts to introduce neural networks (NNs) into this problem have been reported by <xref ref-type="bibr" rid="B8">Ghanbarpour et&#x20;al. (2020)</xref>. However, they were unable to produce prediction results for explicit hydration sites. Instead, a modified U-net architecture has been used to feed an input structure into multiple 3D convolutional layers to generate occupancy values at grid points. The U-net is trained using an input data set derived from the aforementioned WATSite (<xref ref-type="bibr" rid="B11">Hu and Lill, 2014</xref>) analysis of thousands of MD simulations, followed by another fully connected layer that predicts thermodynamic properties from the occupancy values.</p>
</sec>
</sec>
<sec id="s2">
<title>2 Methodology</title>
<p>Inspired by recent efforts (<xref ref-type="bibr" rid="B29">Sch&#xfc;tt et&#x20;al., 2018</xref>) in molecular modeling that utilize NNs as universal approximators to describe physical interactions, our solution to the protein hydration prediction problem is based on explicit NN modeling of the interactions among water molecules and protein atoms, instead of predicting intermediate occupancy values.</p>
<p>Our method comprises two components: scoring and sampling. In the scoring part, we train a neural network-based scoring function <italic>Score</italic>(<bold>
<italic>p</italic>
</bold> &#x2223; <bold>
<italic>prot</italic>
</bold>) from protein structures in the publicly available protein data bank (<xref ref-type="bibr" rid="B4">Berman et&#x20;al., 2000</xref>). The scoring function evaluates the environment of an arbitrary position <bold>
<italic>p</italic>
</bold> in a protein <bold>
<italic>prot</italic>
</bold> and then predicts the shortest distance between <bold>
<italic>p</italic>
</bold> and a potential water molecule. In the sampling part, we tackle the end-to-end hydration prediction problem. Given a protein structure without water molecules or only partially hydrated, our algorithm utilizes the trained scoring function and successively places missing water molecules into the protein structure.</p>
<sec id="s2-1">
<title>2.1 Learned Scoring Function</title>
<p>As the primary component of our solution, the scoring function (<inline-formula id="inf1">
<mml:math id="m1">
<mml:mi>S</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mo stretchy="false">&#x2223;</mml:mo>
<mml:mi mathvariant="bold-italic">p</mml:mi>
<mml:mi mathvariant="bold-italic">r</mml:mi>
<mml:mi mathvariant="bold-italic">o</mml:mi>
<mml:mi mathvariant="bold-italic">t</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2192;</mml:mo>
<mml:mi mathvariant="double-struck">R</mml:mi>
</mml:math>
</inline-formula>, or <italic>the scorer</italic>) probes a given protein structure <bold>
<italic>prot</italic>
</bold> for potential missing water molecules, by predicting the Euclidean distance from a position <bold>
<italic>p</italic>
</bold> to the nearest water molecule that is not in the input protein structure&#x20;<bold>
<italic>prot</italic>
</bold>.</p>
<p>
<xref ref-type="fig" rid="F1">Figure&#x20;1</xref> serves as an illustrative overview of the workflow of our scorer. For a given position <bold>
<italic>p</italic>
</bold>, we calculate an interaction embedding for each atom within 4.0&#x00a0;&#x00C5; of <bold>
<italic>p</italic>
</bold>. As shown in <xref ref-type="fig" rid="F1">Figure&#x20;1A</xref>, the calculation is based on interaction terms consisting of distance terms and angle terms. These terms are analogous to distance and angle potentials in conventional force fields. We use a statistical reduction method to reduce all embeddings represented by these terms into a single interaction embedding of the atom. After the interaction embedding of each atom is computed, we employ another statistical reduction of these embeddings to obtain the final score, as shown in <xref ref-type="fig" rid="F1">Figure&#x20;1B</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Architecture of the scoring function. <bold>(A)</bold>. Generate the interaction embedding for atom <bold>
<italic>q</italic>
</bold> in protein from all interaction terms between <bold>
<italic>q</italic>
</bold> and <italic>position</italic>. <bold>(B)</bold>. Evaluate the score of <italic>position</italic> based on interaction embeddings of all atoms in the receptive field of a 4.0&#x00a0;&#x00C5; radius. PolyNN is our modified version of the multilayer perceptron with three fully connected layers.</p>
</caption>
<graphic xlink:href="fmolb-08-756075-g001.tif"/>
</fig>
<p>After parametrization of the scorer, our objective is to find positions with scores approaching zero. Apart from modeling atom and bond interactions in the protein structure, we implement a scorer neural network which is continuous and differentiable. This allows us to calculate the derivative of the score with respect to the position and use this as the direction for gradient descent optimization.</p>
<sec id="s2-1-1">
<title>2.1.1 Embeddings</title>
<p>
<bold>Atom and bond embeddings.</bold> To obtain the embedding for each atom in the input protein, we categorize all atoms that appeared in protein structures into discrete atom types based on their element types, bonded neighbors, and hybridization configurations. An embedding vector is then assigned to each atom type as learnable parameters that will be updated during the training process. Bond type embeddings are similarly categorized, based on the bond&#x20;types.</p>
<p>
<bold>Interaction embedding.</bold> The interaction embedding of an atom <bold>
<italic>q</italic>
</bold> encapsulates its local information (atom and bond types) and spatial relationship to the position of interest <bold>
<italic>p</italic>
</bold>. Specifically, it is computed over the following interaction terms:<list list-type="simple">
<list-item>
<p>&#x2022; <bold>Distance term:</bold> Information includes <bold>
<italic>q</italic>
</bold>&#x2019;s atom type embedding, appended with <bold>
<italic>q</italic>
</bold>&#x2019;s Euclidean distance to <bold>
<italic>p</italic>
</bold>. Pairwise atom force potentials such as the van der Waals and electrostatic potentials are modeled.</p>
</list-item>
<list-item>
<p>&#x2022; <bold>Angle term:</bold> Computed for each atom <bold>
<italic>r</italic>
</bold> bonded to <bold>
<italic>q</italic>
</bold>. We include atom type embeddings of <bold>
<italic>q</italic>
</bold> and <bold>
<italic>r</italic>
</bold>, concatenated with their bond type embedding, and the angle <inline-formula id="inf2">
<mml:math id="m2">
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>&#x2220;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">q</mml:mi>
<mml:mi mathvariant="bold-italic">p</mml:mi>
</mml:mrow>
<mml:mo>&#x20d7;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi mathvariant="bold-italic">q</mml:mi>
<mml:mi mathvariant="bold-italic">r</mml:mi>
</mml:mrow>
<mml:mo>&#x20d7;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>. This term mainly captures the anisotropy of electron distribution, which is critical to the formation of hydrogen&#x20;bonds.</p>
</list-item>
</list>
</p>
<p>For each interaction term, the input embeddings and other information are concatenated and fed through a differentiable multi-layer perceptron <xref ref-type="sec" rid="s2-1-2">(section 2.1.2)</xref> to obtain an interaction embedding. Interaction embeddings of all atoms in the receptive field are then collected and reduced to a single embedding using the statistical reduction algorithm <xref ref-type="sec" rid="s2-1-3">(section 2.1.3)</xref>. The reduced embedding is connected to another multi-layer perceptron to compute the final&#x20;score.</p>
</sec>
<sec id="s2-1-2">
<title>2.1.2 Continuous and Differentiable Multi-Layer Perceptron</title>
<p>The aggregation and reduction function used in previous sections needs to perform vector-to-vector transformations. This is typically implemented using a multi-layer perceptron in neural networks. Since our trained scoring function needs to be used in the subsequent optimization process, it is desirable for this function to be differentiable.</p>
<p>We use a specifically designed layer function for this purpose. This function is called polynomial neural network function (PolyNN), which is a modified version of the multi-layer perceptron. It has three fully connected layers (from input <bold>
<italic>x</italic>
</bold>
<sub>
<bold>0</bold>
</sub> to output <bold>
<italic>x</italic>
</bold>
<sub>
<bold>3</bold>
</sub>):<disp-formula id="e2_1">
<mml:math id="m3">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>w</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>h</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold-italic">0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold-italic">0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">b</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">0</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
<label>(2.1)</label>
</disp-formula>
<disp-formula id="e2_2">
<mml:math id="m4">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>e</mml:mi>
<mml:mi>x</mml:mi>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:math>
<label>(2.2)</label>
</disp-formula>
<disp-formula id="e2_3">
<mml:math id="m5">
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">3</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">W</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:math>
<label>(2.3)</label>
</disp-formula>
</p>
<p>Here, <bold>
<italic>x</italic>
</bold>
<sub>
<bold>
<italic>1</italic>
</bold>
</sub>
<bold>
<italic>,x</italic>
</bold>
<sub>
<bold>
<italic>2</italic>
</bold>
</sub> are intermediate layers, <bold>
<italic>W</italic>
</bold>
<sub>
<bold>
<italic>0</italic>
</bold>
</sub>
<bold>
<italic>,W</italic>
</bold>
<sub>
<bold>
<italic>1</italic>
</bold>
</sub>
<bold>
<italic>,W</italic>
</bold>
<sub>
<bold>
<italic>2</italic>
</bold>
</sub> are parameter matrices, and <bold>
<italic>b</italic>
</bold>
<sub>
<bold>0</bold>
</sub> is a bias vector. The Swish activation function (<xref ref-type="disp-formula" rid="e2_1">Eq. 2.1</xref>) is described and tested by <xref ref-type="bibr" rid="B25">Ramachandran et&#x20;al. (2017)</xref>.</p>
<p>Besides being differentiable, this definition allows continuous modeling of arbitrary algebraic functions, in contrast to conventional multi-layer perceptrons, which tend to learn step function-based structures.</p>
</sec>
<sec id="s2-1-3">
<title>2.1.3 Statistical Reduction</title>
<p>The PolyNN network is a one-to-one vector function approximator. Hence for a variable-sized set of vectors such as the interaction embedding set to act as the input of PolyNN, a reduction process is needed. In our work, the statistical reduction is chosen to collect several statistical characteristics as descriptors of the input set, including the summation, average, maximum, and standard deviation values of sets of corresponding components taken from each vector in the input&#x20;set.</p>
<p>Let the input <italic>n</italic>-dimensional vector set of size <italic>M</italic> be {<bold>
<italic>x</italic>
</bold>
<sub>1</sub>, <bold>
<italic>x</italic>
</bold>
<sub>2</sub>, &#x2026; ,&#x20;<bold>
<italic>x</italic>
</bold>
<sub>
<italic>M</italic>
</sub>}, and the <italic>j</italic>th element of vector <bold>
<italic>x</italic>
</bold>
<sub>
<italic>i</italic>
</sub> be <bold>
<italic>x</italic>
</bold>
<sub>
<italic>ij</italic>
</sub>. The statistical reduction layer first calculates the following vectors:<disp-formula id="e2_4">
<mml:math id="m6">
<mml:mi mathvariant="bold-italic">s</mml:mi>
<mml:mi mathvariant="bold-italic">u</mml:mi>
<mml:mi mathvariant="bold-italic">m</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
<label>(2.4)</label>
</disp-formula>
<disp-formula id="e2_5">
<mml:math id="m7">
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">v</mml:mi>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
<mml:mi mathvariant="bold-italic">u</mml:mi>
<mml:mi mathvariant="bold-italic">m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(2.5)</label>
</disp-formula>
<disp-formula id="e2_6">
<mml:math id="m8">
<mml:mi mathvariant="bold-italic">m</mml:mi>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">{</mml:mo>
<mml:mrow>
<mml:munder>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mi>max</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">}</mml:mo>
</mml:mrow>
</mml:math>
<label>(2.6)</label>
</disp-formula>
<disp-formula id="e2_7">
<mml:math id="m9">
<mml:mi mathvariant="bold-italic">s</mml:mi>
<mml:mi mathvariant="bold-italic">t</mml:mi>
<mml:mi mathvariant="bold-italic">d</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">v</mml:mi>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn mathvariant="bold">1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>i</mml:mi>
<mml:mo>&#x2264;</mml:mo>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">v</mml:mi>
<mml:mi mathvariant="bold-italic">g</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi mathvariant="bold-italic">n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(2.7)</label>
</disp-formula>
</p>
<p>The calculated statistical vectors are then concatenated together and a PolyNN layer is then applied to obtain the final output vector of reduction layer <bold>
<italic>y</italic>
</bold>:<disp-formula id="e2_8">
<mml:math id="m10">
<mml:mi mathvariant="bold-italic">y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>y</mml:mi>
<mml:mi>N</mml:mi>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi mathvariant="bold-italic">s</mml:mi>
<mml:mi mathvariant="bold-italic">u</mml:mi>
<mml:mi mathvariant="bold-italic">m</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">v</mml:mi>
<mml:mi mathvariant="bold-italic">g</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-italic">m</mml:mi>
<mml:mi mathvariant="bold-italic">a</mml:mi>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi mathvariant="bold-italic">s</mml:mi>
<mml:mi mathvariant="bold-italic">t</mml:mi>
<mml:mi mathvariant="bold-italic">d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
<label>(2.8)</label>
</disp-formula>
</p>
</sec>
</sec>
<sec id="s2-2">
<title>2.2 Model Training</title>
<p>The parametrization of the scoring function is accomplished by standard supervised training with the back-propagation algorithm, as illustrated in <xref ref-type="fig" rid="F2">Figure&#x20;2</xref>. A training instance consists of a pair {<bold>
<italic>water</italic>
</bold>, <bold>
<italic>prot</italic>
</bold>}, where <bold>
<italic>water</italic>
</bold> is the water position to be predicted, <bold>
<italic>prot</italic>
</bold> is the environment of the water to be predicted, i.e.,&#x20;protein structure excluding the water. For each training instance, a <bold>
<italic>label</italic>
</bold> is assigned to denote the distance from the water to its nearest ground truth position. Hence, the training objective is to let <italic>Score</italic>(<bold>
<italic>water</italic>
</bold> &#x2223; <bold>
<italic>prot</italic>
</bold>) approximate <bold>
<italic>label</italic>
</bold>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Training process of the scoring function.</p>
</caption>
<graphic xlink:href="fmolb-08-756075-g002.tif"/>
</fig>
<p>We first extract <italic>static</italic> positive and negative training instances from crystal structures in the following ways:<list list-type="simple">
<list-item>
<p>&#x2022; Ground truth positives: Positive instances are generated from crystal water positions. The <bold>
<italic>label</italic>
</bold> is 0 by definition.</p>
</list-item>
<list-item>
<p>&#x2022; Nearby sampled negatives: For each crystal water, we randomly move its position within 0.8&#x00a0;&#x00c5; and form a negative instance. The <bold>
<italic>label</italic>
</bold> is set accordingly.</p>
</list-item>
<list-item>
<p>&#x2022; Random position negatives: We generate new water molecules and place them randomly in the protein structure. This makes sure the model does not place excess water molecules. We use the full protein <bold>
<italic>prot</italic>
</bold> in this case without removing any waters from it, and define the <bold>
<italic>label</italic>
</bold> to be <italic>&#x221e;</italic>.</p>
</list-item>
</list>
</p>
<p>Such simple extracted negative instances are insufficient for training, because most randomly sampled negatives are trivial to identify by the model. To improve the sampling efficiency, we implement <italic>dynamic</italic> negative sampling procedures by generating negative instances on-the-fly during training:<list list-type="simple">
<list-item>
<p>&#x2022; Leave-one-out negatives: When processing a batch in the training process, we examine each positive instance in the batch. The current model after updating the last batch is used to optimize the position of the ground truth water molecule by gradient descent. The optimized position is appended to the current&#x20;batch.</p>
</list-item>
<list-item>
<p>&#x2022; End-to-end negatives: In the final water placement stage, as in section 2.3, we will encounter proteins with partially or wrongly determined environmental waters. To make the model robust in this scenario, we remove all water molecules in a crystal structure and use the water placement algorithm to predict all water positions from scratch. For each predicted water molecule, we find its nearest crystal water position and generate a training instance accordingly.</p>
</list-item>
</list>
</p>
<p>The loss function for training is defined as:<disp-formula id="e2_9">
<mml:math id="m11">
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:munder>
<mml:mi>w</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>h</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x22c5;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>d</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>b</mml:mi>
<mml:mi>e</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>l</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>&#x3b8;</mml:mi>
<mml:msup>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(2.9)</label>
</disp-formula>where the instance index <italic>i</italic> is iterated through the whole minibatch and &#x7c;<italic>&#x3b8;</italic>&#x7c;<sup>2</sup> represents L2-regularization.</p>
<p>The weights <italic>weight</italic>
<sub>
<italic>i</italic>
</sub> of the training instances are designed to prioritize training on instances having more interactions with atoms in the protein and less exposure to the bulk solvent, because these water molecules are more likely to be stable and correctly determined by crystallography. The weight is calculated as:<disp-formula id="e2_10">
<mml:math id="m12">
<mml:mi>w</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>h</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1.5</mml:mn>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>amino_count</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mtext>water_count</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:msqrt>
</mml:math>
<label>(2.10)</label>
</disp-formula>where amino_count and water_count correspond to the number of amino acids and water molecules in the instance&#x2019;s environment, respectively.</p>
<p>From an optimization perspective, the importance of deviations to the model decreases relatively as the absolute distance between the position to be predicted and the ground truth becomes larger. In order to implement this heuristic, we normalize the labels with a hand-tuned continuous function <italic>Norm</italic>(<italic>d</italic>) that is steep when <italic>d</italic> is relatively small, and becomes almost constant for <italic>d</italic>&#x20;&#x2208; [0.8, &#x2b; <italic>&#x221e;</italic>) (see <xref ref-type="fig" rid="F3">Figure&#x20;3</xref>).<disp-formula id="e2_11">
<mml:math id="m13">
<mml:mi>N</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>5</mml:mn>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>ln</mml:mi>
<mml:mo>&#x2061;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(2.11)</label>
</disp-formula>
</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Normalization function <italic>Norm</italic>(<italic>d</italic>), which converts the Euclidean distance <italic>d</italic> to our label value within the [0, 1)&#x20;range.</p>
</caption>
<graphic xlink:href="fmolb-08-756075-g003.tif"/>
</fig>
</sec>
<sec id="s2-3">
<title>2.3 Water Placement</title>
<p>By design, the property that distinguishes our work from previous studies that handpicked empirical functions or output discrete values like occupancy is the differentiability of our automatically learned function. This differentiability enables subsequent optimization such as gradient descent to be performed. In practice, to attenuate problems such as traps of local minima and suboptimal conformations that arise from the sequential addition of water molecules, algorithms for water placements have been developed as complements to the scorer.</p>
<p>Our water placement algorithm comprises two parts, a placement part and a refinement part. In the placement part, our algorithm probes the current protein structure and finds the location of the potentially missing water molecules; the refinement part combines the water calculated in the previous step with the water molecules already in the protein and optimizes the overall position of all water molecules. Our algorithm runs these two parts alternately until the placement part cannot add any new water molecule.</p>
<sec id="s2-3-1">
<title>2.3.1 Placement</title>
<p>The placement process starts with encompassing the protein with a 3D grid of bounding boxes. The dimension of each bounding box is 0.8&#x00a0;&#x00c5;, a value that is small enough to ensure the existence of at most one water molecule in each box. After placing the water molecule at the center of each box, gradient descent can be applied to optimize the position.</p>
<p>One can directly calculate scores of these optimized water molecules and keep those with scores better than a predefined threshold. However, there are two major problems in this simple water placement procedure:<list list-type="simple">
<list-item>
<p>1. Because each water is placed and optimized independently, it is possible that the best positions calculated for adjacent grids actually correspond to the same potential water molecule.</p>
</list-item>
<list-item>
<p>2. In crystal structures, there are water molecules that require joint interactions of the protein and other water molecules to stabilize. Such water molecules cannot be probed until all other water molecules that participate in the stabilization are revealed in the input protein structure.</p>
</list-item>
</list>
</p>
<p>To address these issues, our algorithm uses an iterative placement strategy. In each iteration, we re-optimize water molecules in each box and recalculate their scores, accommodating water molecules added in previous iterations. We then add the water molecule with the best score to the predicted structure. The iteration ends when the best water score is worse than a predefined threshold.</p>
</sec>
<sec id="s2-3-2">
<title>2.3.2 Refinement</title>
<p>The placement step places and optimizes water molecules individually. Therefore it is desirable to optimize all the added water molecules simultaneously. This is easily doable via gradient descent as our scorer is differentiable.</p>
<p>However, solely relying on gradient descent may lead to water molecules trapped in local minima, similar to the behavior seen in force field simulations. To alleviate this problem, we develop a local resampling strategy. Each time a number of adjacent water molecules are selected, and the water molecules in this region are resampled. The resampling procedure first removes water molecules from the prediction results and then tries to add back a subset of these water molecules. The subset with the best score is kept and iteration continues. When the algorithm cannot discover any subset that can be improved, the optimization process&#x20;ends.</p>
</sec>
</sec>
</sec>
<sec sec-type="results|discussion" id="s3">
<title>3 Experiment Results and Discussions</title>
<sec id="s3-1">
<title>3.1 Evaluation Metric</title>
<p>We evaluate and compare our method with other methods using the typical precision and recall metrics:<disp-formula id="e3_1">
<mml:math id="m14">
<mml:mtext>precision</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>true&#x2009;positive&#x2009;count</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>number&#x2009;of&#x2009;predicted&#x2009;waters</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(3.1)</label>
</disp-formula>
<disp-formula id="e3_2">
<mml:math id="m15">
<mml:mtext>recall</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>true&#x2009;positive&#x2009;count</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>number&#x2009;of&#x2009;crystal&#x2009;waters</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(3.2)</label>
</disp-formula>
<disp-formula id="e3_3">
<mml:math id="m16">
<mml:mtext>F</mml:mtext>
<mml:mn>1</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mtext>precision</mml:mtext>
<mml:mo>&#x2a;</mml:mo>
<mml:mtext>recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>precision</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(3.3)</label>
</disp-formula>
</p>
<p>To count true positives using 3D coordinates of our prediction and crystal water, we set three different cutoffs of the Euclidean distance in our analysis: 0.5 &#x00C5;, 1.0 &#x00C5; and 1.5 &#x00C5;. For each crystal water, at most one predicted water located within the cutoff range is counted as a true positive prediction.</p>
</sec>
<sec id="s3-2">
<title>3.2 Performance Case Study</title>
<p>In this section, we use the 14&#x20;Oligopeptide-binding protein structures (OppA) bound to different KXK tripeptides in the AcquaAlta paper (<xref ref-type="bibr" rid="B27">Rossato et&#x20;al., 2011</xref>) to evaluate the performance of our water placement algorithms. We compare our model with some previous methods: Dowser&#x2b;&#x2b;, wKGB, HydraMap, GAsol, and WATsite.</p>
<p>We first compare the performances of predicting water positions in ligand binding pockets. In this benchmark, we only consider waters within 4.0&#x00C5; of both the protein and the binding ligand. The statistical results are shown in <xref ref-type="table" rid="T1">Table&#x20;1</xref>, with the median running times of every method. Our model has large leads on the F1 measure with a moderate running time. It can be seen that other empirical function-based methods, especially wKGB, tend to predict an excessive number of water molecules. Under the 1.5&#x00a0;&#x00c5; cutoff, wKGB can recall all crystal waters, yet with a much lower precision, compromising the model&#x2019;s predictive power, which is reflected by its F1 score. This surplus of predicted water molecules suggests the algorithm is oversampling the water molecules and the outputs need some clean-up, such as clustering or use of specific water placement algorithms. Among the others, WATsite shows a large lead in terms of performance, which showcases the power of its MD simulation. However, its running time suffers greatly because of the computationally heavy MD process. Our neural network model achieves even better performance than WATsite, while maintaining speed comparable to other fast methods.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Results of predicting binding-site waters on the 14-structure OppA dataset (For wKGB, the default output and its output with different score thresholds (6,8,10) are all included).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th colspan="3" align="center">Recall</th>
<th colspan="3" align="center">Precision</th>
<th colspan="3" align="center">F1 score</th>
<th rowspan="2" align="center">Median running time(s)</th>
</tr>
<tr>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Ours</td>
<td align="char" char=".">
<bold>0.581</bold>
</td>
<td align="char" char=".">0.847</td>
<td align="char" char=".">0.935</td>
<td align="char" char=".">
<bold>0.386</bold>
</td>
<td align="char" char=".">
<bold>0.654</bold>
</td>
<td align="char" char=".">0.789</td>
<td align="char" char=".">
<bold>0.460</bold>
</td>
<td align="char" char=".">
<bold>0.732</bold>
</td>
<td align="char" char=".">
<bold>0.850</bold>
</td>
<td align="center">380.3</td>
</tr>
<tr>
<td align="left">Dowser&#x2b;&#x2b;</td>
<td align="char" char=".">0.434</td>
<td align="char" char=".">0.723</td>
<td align="char" char=".">0.854</td>
<td align="char" char=".">0.267</td>
<td align="char" char=".">0.494</td>
<td align="char" char=".">0.633</td>
<td align="char" char=".">0.329</td>
<td align="char" char=".">0.582</td>
<td align="char" char=".">0.720</td>
<td align="center">1625.4</td>
</tr>
<tr>
<td align="left">HydraMap</td>
<td align="char" char=".">0.179</td>
<td align="char" char=".">0.644</td>
<td align="char" char=".">0.836</td>
<td align="char" char=".">0.069</td>
<td align="char" char=".">0.266</td>
<td align="char" char=".">0.397</td>
<td align="char" char=".">0.099</td>
<td align="char" char=".">0.376</td>
<td align="char" char=".">0.536</td>
<td align="center">
<bold>7.9</bold>
</td>
</tr>
<tr>
<td align="left">wKGB_all</td>
<td align="char" char=".">0.520</td>
<td align="char" char=".">
<bold>0.905</bold>
</td>
<td align="char" char=".">
<bold>1.000</bold>
</td>
<td align="char" char=".">0.122</td>
<td align="char" char=".">0.228</td>
<td align="char" char=".">0.285</td>
<td align="char" char=".">0.196</td>
<td align="char" char=".">0.364</td>
<td align="char" char=".">0.441</td>
<td align="center">265.2</td>
</tr>
<tr>
<td align="left">wKGB_6</td>
<td align="char" char=".">0.520</td>
<td align="char" char=".">
<bold>0.905</bold>
</td>
<td align="char" char=".">
<bold>1.000</bold>
</td>
<td align="char" char=".">0.141</td>
<td align="char" char=".">0.268</td>
<td align="char" char=".">0.335</td>
<td align="char" char=".">0.221</td>
<td align="char" char=".">0.412</td>
<td align="char" char=".">0.499</td>
<td align="center">265.2</td>
</tr>
<tr>
<td align="left">wKGB_8</td>
<td align="char" char=".">0.520</td>
<td align="char" char=".">
<bold>0.905</bold>
</td>
<td align="char" char=".">0.995</td>
<td align="char" char=".">0.151</td>
<td align="char" char=".">0.290</td>
<td align="char" char=".">0.358</td>
<td align="char" char=".">0.233</td>
<td align="char" char=".">0.437</td>
<td align="char" char=".">0.524</td>
<td align="center">265.2</td>
</tr>
<tr>
<td align="left">wKGB_10</td>
<td align="char" char=".">0.520</td>
<td align="char" char=".">0.899</td>
<td align="char" char=".">0.989</td>
<td align="char" char=".">0.162</td>
<td align="char" char=".">0.308</td>
<td align="char" char=".">0.380</td>
<td align="char" char=".">0.246</td>
<td align="char" char=".">0.457</td>
<td align="char" char=".">0.546</td>
<td align="center">265.2</td>
</tr>
<tr>
<td align="left">GAsol</td>
<td align="char" char=".">0.149</td>
<td align="char" char=".">0.465</td>
<td align="char" char=".">0.708</td>
<td align="char" char=".">0.085</td>
<td align="char" char=".">0.307</td>
<td align="char" char=".">0.522</td>
<td align="char" char=".">0.108</td>
<td align="char" char=".">0.367</td>
<td align="char" char=".">0.597</td>
<td align="center">1149</td>
</tr>
<tr>
<td align="left">WATsite</td>
<td align="char" char=".">0.448</td>
<td align="char" char=".">0.747</td>
<td align="char" char=".">0.843</td>
<td align="char" char=".">0.326</td>
<td align="char" char=".">0.645</td>
<td align="char" char=".">
<bold>0.816</bold>
</td>
<td align="char" char=".">0.375</td>
<td align="char" char=".">0.686</td>
<td align="char" char=".">0.823</td>
<td align="center">&#x223c; 15000</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Best result(s) in each column is(are) in bold font</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>We also test the harder task of predicting all water molecules within the protein structure. Only the methods capable of predicting non-binding-site waters are compared. The results are shown in <xref ref-type="table" rid="T2">Table&#x20;2</xref>. Again, our method outperforms prior&#x20;works.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Results of predicting all waters in the 14-structure OppA dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th colspan="3" align="center">Recall</th>
<th colspan="3" align="center">Precision</th>
<th colspan="3" align="center">F1 score</th>
<th rowspan="2" align="center">Median running time(s)</th>
</tr>
<tr>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Ours</td>
<td align="char" char=".">
<bold>0.340</bold>
</td>
<td align="char" char=".">0.550</td>
<td align="char" char=".">0.675</td>
<td align="char" char=".">
<bold>0.218</bold>
</td>
<td align="char" char=".">0.354</td>
<td align="char" char=".">0.437</td>
<td align="char" char=".">
<bold>0.264</bold>
</td>
<td align="char" char=".">
<bold>0.428</bold>
</td>
<td align="char" char=".">
<bold>0.527</bold>
</td>
<td align="char" char=".">380.3</td>
</tr>
<tr>
<td align="left">Dowser&#x2b;&#x2b;</td>
<td align="char" char=".">0.134</td>
<td align="char" char=".">0.261</td>
<td align="char" char=".">0.359</td>
<td align="char" char=".">0.208</td>
<td align="char" char=".">
<bold>0.403</bold>
</td>
<td align="char" char=".">
<bold>0.558</bold>
</td>
<td align="char" char=".">0.162</td>
<td align="char" char=".">0.315</td>
<td align="char" char=".">0.434</td>
<td align="char" char=".">1625.4</td>
</tr>
<tr>
<td align="left">wKGB_all</td>
<td align="char" char=".">0.278</td>
<td align="char" char=".">
<bold>0.738</bold>
</td>
<td align="char" char=".">
<bold>0.964</bold>
</td>
<td align="char" char=".">0.037</td>
<td align="char" char=".">0.098</td>
<td align="char" char=".">0.134</td>
<td align="char" char=".">0.065</td>
<td align="char" char=".">0.173</td>
<td align="char" char=".">0.235</td>
<td align="char" char=".">
<bold>265.2</bold>
</td>
</tr>
<tr>
<td align="left">wKGB_6</td>
<td align="char" char=".">0.253</td>
<td align="char" char=".">0.638</td>
<td align="char" char=".">0.837</td>
<td align="char" char=".">0.081</td>
<td align="char" char=".">0.205</td>
<td align="char" char=".">0.281</td>
<td align="char" char=".">0.122</td>
<td align="char" char=".">0.309</td>
<td align="char" char=".">0.419</td>
<td align="char" char=".">
<bold>265.2</bold>
</td>
</tr>
<tr>
<td align="left">wKGB_8</td>
<td align="char" char=".">0.227</td>
<td align="char" char=".">0.557</td>
<td align="char" char=".">0.732</td>
<td align="char" char=".">0.109</td>
<td align="char" char=".">0.270</td>
<td align="char" char=".">0.369</td>
<td align="char" char=".">0.147</td>
<td align="char" char=".">0.362</td>
<td align="char" char=".">0.489</td>
<td align="char" char=".">
<bold>265.2</bold>
</td>
</tr>
<tr>
<td align="left">wKGB_10</td>
<td align="char" char=".">0.203</td>
<td align="char" char=".">0.479</td>
<td align="char" char=".">0.622</td>
<td align="char" char=".">0.145</td>
<td align="char" char=".">0.343</td>
<td align="char" char=".">0.461</td>
<td align="char" char=".">0.168</td>
<td align="char" char=".">0.398</td>
<td align="char" char=".">
<bold>0.527</bold>
</td>
<td align="char" char=".">
<bold>265.2</bold>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Best result(s) in each column is(are) in bold font</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To better understand the prediction results, we analyze several structural scenarios in the OppA protein dataset. Water molecules interacting with several polar atoms in the protein structure are relatively easy for most models, such as water molecules in <xref ref-type="fig" rid="F4">Figures&#x20;4A,B</xref> with ideal distances to several polar atoms for the formation of hydrogen bonds. Those easy cases are usually buried, single water molecules in a hydrophilic environment inside the protein, and will be reproduced correctly as long as the model has accurate knowledge of hydrogen bonds such as length and angle distribution.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Examples of predicted water molecules of our model in the OppA protein dataset (PDB code: 1B3F), compared to wKGB, HydraMap, and Dowser&#x2b;&#x2b; (Red: Ground truth; Yellow: Our prediction; Blue: Dowser&#x2b;&#x2b;; Purple: wKGB; Brown: HydraMap; Grey: WATsite; White: GAsol.) Prediction results from HydraMap, WATsite, GAsol are binding-site only.</p>
</caption>
<graphic xlink:href="fmolb-08-756075-g004.tif"/>
</fig>
<p>For cases with a mixed environment in terms of hydrophilicity, correct prediction of the mere existence of water molecules can be challenging for many models. For instance, other models predicted the existence of multiple water molecules in <xref ref-type="fig" rid="F5">Figures 5A,B</xref>, while there is zero and one ground truth water within the environment, respectively. Our model, in both cases, outputs the correct number of water molecules, with the position precisely spotted. Water molecules predicted by other models in these two cases seem to be output by the model merely due to their proximity to polar atoms, which suggests the greater difficulty in such environments might arise from the complexity of interactions that necessitate holistic modeling of entropy-enthalpy trade-offs. For example, the addition of a water molecule to a mixed environment may benefit the stability by forming hydrogen bonds with other water molecules or polar atoms yet incur entropic penalties by being too close to hydrophobic moieties.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Examples of better prediction made by our model in the OppA protein dataset. PDB ID: <bold>(A)</bold>. 1B3F; <bold>(B)</bold>. 1B5I.</p>
</caption>
<graphic xlink:href="fmolb-08-756075-g005.tif"/>
</fig>
<p>To reproduce water-water networks in proteins is more complicated, requiring not only an accurate energy model but also an advanced water placement algorithm. The water molecule reproduced by our model only (at the center of <xref ref-type="fig" rid="F6">Figure&#x20;6A</xref>) is an interesting starting example as it binds to three other &#x201c;trivial&#x201d; water molecules that are predicted by all three models, with a rather safe distance from the hydrophobic atoms in sight. This water molecule, being mostly stabilized by water-water interactions, may be hard to predict if the algorithm cannot iteratively update the environment and uses only the input protein structure for predictions. Two more successful predictions of water networks are given in <xref ref-type="fig" rid="F6">Figures 6B,C</xref>. In both cases, while other models can predict part of the network, which are mainly ones that directly interact with the protein, our model bridges the gap and reproduces the water network precisely.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Examples of successfully reproduced water-water interactions in our prediction results, in the OppA protein dataset. PDB ID: <bold>(A)</bold>, <bold>(B)</bold> 1B3F; <bold>(C)</bold> 1B4Z.</p>
</caption>
<graphic xlink:href="fmolb-08-756075-g006.tif"/>
</fig>
</sec>
<sec id="s3-3">
<title>3.3 Large Dataset Benchmark</title>
<p>In this section, we test the algorithms on a large structure dataset comprising 413 high-resolution X-ray structures randomly selected from the RCSB PDB database. Due to time and usage constraints, we only compare our method with Dowser&#x2b;&#x2b; and HydraMap. The results are shown in <xref ref-type="table" rid="T3">Table&#x20;3</xref>. Our method shows performance similar to that on the small dataset, while Dowser&#x2b;&#x2b; suffers greatly. This is possibly due to the manually parameterized docking algorithm it is based on, while our neural network-based method is better generalized on a large variety of structures.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Results of predicting all waters by our model and Dowser&#x2b;&#x2b;, on 413 selected protein structures (The results of Dowser&#x2b;&#x2b; are averaged over 380 successfully processed structures).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th colspan="3" align="center">Recall</th>
<th colspan="3" align="center">Precision</th>
<th colspan="3" align="center">F1 score</th>
</tr>
<tr>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Our_full</td>
<td align="char" char=".">
<bold>0.307</bold>
</td>
<td align="char" char=".">
<bold>0.512</bold>
</td>
<td align="char" char=".">
<bold>0.640</bold>
</td>
<td align="char" char=".">
<bold>0.229</bold>
</td>
<td align="char" char=".">0.384</td>
<td align="char" char=".">0.486</td>
<td align="char" char=".">
<bold>0.256</bold>
</td>
<td align="char" char=".">
<bold>0.427</bold>
</td>
<td align="char" char=".">
<bold>0.537</bold>
</td>
</tr>
<tr>
<td align="left">Dowser&#x2b;&#x2b;</td>
<td align="char" char=".">0.076</td>
<td align="char" char=".">0.155</td>
<td align="char" char=".">0.215</td>
<td align="char" char=".">0.188</td>
<td align="char" char=".">
<bold>0.390</bold>
</td>
<td align="char" char=".">
<bold>0.544</bold>
</td>
<td align="char" char=".">0.104</td>
<td align="char" char=".">0.214</td>
<td align="char" char=".">0.297</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Best result(s) in each column is(are) in bold font</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To test the performance on binding-site waters, we remove protein structures without a proper binding ligand from the previous dataset and obtain 100 protein structures. The results are shown in <xref ref-type="table" rid="T4">Table&#x20;4</xref>. In this scenario, Dowser&#x2b;&#x2b; performed better compared to the previous task, but our method still holds a clear&#x20;edge.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Results of predicting binding-site waters by our model, Dowser&#x2b;&#x2b; and HydraMap, on 100 selected protein structures (The results of Dowser&#x2b;&#x2b; are averaged over 91 successfully processed structures).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="left">Model</th>
<th colspan="3" align="center">Recall</th>
<th colspan="3" align="center">Precision</th>
<th colspan="3" align="center">F1 score</th>
</tr>
<tr>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
<th align="center">0.5&#x00C5;</th>
<th align="center">1.0&#x00C5;</th>
<th align="center">1.5&#x00C5;</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Our</td>
<td align="char" char=".">
<bold>0.389</bold>
</td>
<td align="char" char=".">
<bold>0.621</bold>
</td>
<td align="char" char=".">
<bold>0.755</bold>
</td>
<td align="char" char=".">
<bold>0.240</bold>
</td>
<td align="char" char=".">
<bold>0.390</bold>
</td>
<td align="char" char=".">
<bold>0.490</bold>
</td>
<td align="char" char=".">
<bold>0.283</bold>
</td>
<td align="char" char=".">
<bold>0.455</bold>
</td>
<td align="char" char=".">
<bold>0.559</bold>
</td>
</tr>
<tr>
<td align="left">Dowser&#x2b;&#x2b;</td>
<td align="char" char=".">0.151</td>
<td align="char" char=".">0.294</td>
<td align="char" char=".">0.369</td>
<td align="char" char=".">0.144</td>
<td align="char" char=".">0.313</td>
<td align="char" char=".">0.457</td>
<td align="char" char=".">0.133</td>
<td align="char" char=".">0.275</td>
<td align="char" char=".">0.368</td>
</tr>
<tr>
<td align="left">HydraMap</td>
<td align="char" char=".">0.043</td>
<td align="char" char=".">0.314</td>
<td align="char" char=".">0.753</td>
<td align="char" char=".">0.014</td>
<td align="char" char=".">0.083</td>
<td align="char" char=".">0.185</td>
<td align="char" char=".">0.018</td>
<td align="char" char=".">0.122</td>
<td align="char" char=".">0.277</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Best result(s) in each column is(are) in bold font</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>To further analyze the performance characteristics of the algorithms in terms of different types of water molecules, we categorize water molecules in the benchmark dataset into subsets and compare the recall rate on these sets. <xref ref-type="fig" rid="F7">Figure&#x20;7</xref> shows the comparison chart. In <xref ref-type="fig" rid="F7">Figure&#x20;7A</xref>, we categorize the water molecules by their real-space correlation coefficient (RSCC, a common measure used in crystallography to measure the similarity between the model and the experimental density map) of the oxygen atom, and test the recall rate on different RSCC value ranges. The figure shows that water molecules with lower RSCC tend to be harder to predict, which agrees with the fact that RSCC can measure the certainty of the existence of an atom at its location in the model. Lower RSCC corresponds to higher uncertainty of the atom&#x2019;s position, and may even indicate an incorrectly resolved water molecule at this position, which should not be predicted by a reliable model. In <xref ref-type="fig" rid="F7">Figure&#x20;7B</xref>, we find the number of nearby polar atoms of each water molecule and calculate the recall rate for water molecules grouped by the number of polar atom neighbors. The results indicate that without explicit prior domain knowledge, the model successfully learns that polar atoms are highly related to the distribution of water molecules, hence having a very high success rate when the number of polar atoms surrounding a water molecule is high. <xref ref-type="fig" rid="F7">Figure&#x20;7C</xref> shows the differences in performance when water molecules are categorized by the number of contacting waters, which can be seen as a measure of the solvent exposure ratio of a certain location.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Recall rates of water molecules, grouped by different properties of the water molecules. <bold>(A)</bold>. The Real Space Correlation Coefficient. <bold>(B)</bold>. Number of polar atoms of the protein nearby. <bold>(C)</bold>. Number of water molecules nearby. O: our model, D: Dowser&#x2b;&#x2b;.</p>
</caption>
<graphic xlink:href="fmolb-08-756075-g007.tif"/>
</fig>
</sec>
</sec>
<sec id="s4">
<title>4 Conclusion</title>
<p>Due to the importance of water molecules in protein modeling, many methods for predicting water molecule positions have been developed over the years. One major drawback of previous works is the reliance on domain knowledge and explicit parameterization. In this paper, we discuss a novel water placement algorithm using deep learning. We show that without any manual parameterization, the performance of our model surpasses that of its peers by a large margin. Such progress of hydration site prediction is expected to serve other applications as well, such as ligand docking and protein crystal structure refinement.</p>
</sec>
</body>
<back>
<sec id="s5">
<title>Data Availability Statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://www.rcsb.org/">https://www.rcsb.org/</ext-link>. Free assessment of our model is available on our Accutar Open Access platform (<ext-link ext-link-type="uri" xlink:href="https://oa.accutarbio.com/">https://oa.accutarbio.com/</ext-link>).</p>
</sec>
<sec id="s6">
<title>Author Contributions</title>
<p>PH conceived the idea and conceptualized it with KL and XS. PH, KL, and XS then formalized the methodology and designed the experiments. Experiments were carried out by PH, with help on software from HX and JW, and data curation efforts from XZ and JW. The investigation and analysis of experimental results were performed by PH, HX, XS, XZ, and KL. This manuscript was prepared by PH, along with QH, XS, and HX who proofread and edited the contents. PH and QH also visualized the concepts and results. The whole project was supervised by KL and XS, and the funding/resource acquisition was achieved by JF. All authors have read and agreed to the published version of the manuscript.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>This research was funded by Accutar Biotechnology Inc. The funder had the following involvement with the study: provider of computational resources.</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of Interest</title>
<p>Authors PH, HX, XZ, QH, KL, XS, JW, and JF were employed by the company Accutar Biotechnology Inc.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s Note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s10">
<title>Supplementary Material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fmolb.2021.756075/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fmolb.2021.756075/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.ZIP" id="SM1" mimetype="application/ZIP" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet3.PDF" id="SM2" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet2.XLSX" id="SM3" mimetype="application/XLSX" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Beglov</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Roux</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>1997</year>). <article-title>An Integral Equation to Describe the Solvation of Polar Molecules in Liquid Water</article-title>. <source>J.&#x20;Phys. Chem. B.</source> <volume>101</volume>, <fpage>7821</fpage>&#x2013;<lpage>7826</lpage>. <pub-id pub-id-type="doi">10.1021/jp971083h</pub-id> </citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bellissent-Funel</surname>
<given-names>M.-C.</given-names>
</name>
<name>
<surname>Hassanali</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Havenith</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Henchman</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Pohl</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Sterpone</surname>
<given-names>F.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Water Determines the Structure and Dynamics of Proteins</article-title>. <source>Chem. Rev.</source> <volume>116</volume>, <fpage>7673</fpage>&#x2013;<lpage>7697</lpage>. <pub-id pub-id-type="doi">10.1021/acs.chemrev.5b00664</pub-id> </citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Berman</surname>
<given-names>H. M.</given-names>
</name>
<name>
<surname>Westbrook</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Gilliland</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Bhat</surname>
<given-names>T. N.</given-names>
</name>
<name>
<surname>Weissig</surname>
<given-names>H.</given-names>
</name>
<etal/>
</person-group> (<year>2000</year>). <article-title>The Protein Data Bank</article-title>. <source>Nucleic Acids Res.</source> <volume>28</volume>, <fpage>235</fpage>&#x2013;<lpage>242</lpage>. <pub-id pub-id-type="doi">10.1093/nar/28.1.235</pub-id> </citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bucher</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Stouten</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Triballeau</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Shedding Light on Important Waters for Drug Design: Simulations Versus Grid-Based Methods</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>58</volume>, <fpage>692</fpage>&#x2013;<lpage>699</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.7b00642</pub-id> </citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Forli</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Olson</surname>
<given-names>A. J.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>A Force Field With Discrete Displaceable Waters and Desolvation Entropy for Hydrated Ligand Docking</article-title>. <source>J.&#x20;Med. Chem.</source> <volume>55</volume>, <fpage>623</fpage>&#x2013;<lpage>638</lpage>. <pub-id pub-id-type="doi">10.1021/jm2005145</pub-id> </citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Fusani</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Wall</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Palmer</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Cortes</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Optimal Water Networks in Protein Cavities With Gasol and 3d-Rism</article-title>. <source>Bioinformatics.</source> <volume>34</volume>, <fpage>1947</fpage>&#x2013;<lpage>1948</lpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/bty024</pub-id> </citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ghanbarpour</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mahmoud</surname>
<given-names>A. H.</given-names>
</name>
<name>
<surname>Lill</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>On-the-fly Prediction of Protein Hydration Densities and Free Energies Using Deep Learning</article-title>. <comment>arXiv preprint arXiv:2001.02201</comment> </citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Goodford</surname>
<given-names>P. J.</given-names>
</name>
</person-group> (<year>1985</year>). <article-title>A Computational Procedure for Determining Energetically Favorable Binding Sites on Biologically Important Macromolecules</article-title>. <source>J.&#x20;Med. Chem.</source> <volume>28</volume>, <fpage>849</fpage>&#x2013;<lpage>857</lpage>. <pub-id pub-id-type="doi">10.1021/jm00145a002</pub-id> </citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Heo</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Park</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Seok</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Galaxywater-Wkgb: Prediction of Water Positions on Protein Structure Using Wkgb Statistical Potential</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>61</volume>, <fpage>2283</fpage>&#x2013;<lpage>2293</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.0c01434</pub-id> </citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lill</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Watsite: Hydration Site Prediction Program With Pymol Interface</article-title>. <source>J.&#x20;Comput. Chem.</source> <volume>35</volume>, <fpage>1255</fpage>&#x2013;<lpage>1260</lpage>. <pub-id pub-id-type="doi">10.1002/jcc.23616</pub-id> </citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kovalenko</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hirata</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Potential of Mean Force Between Two Molecular Ions in a Polar Molecular Solvent: A Study by the Three-Dimensional Reference Interaction Site Model</article-title>. <source>J.&#x20;Phys. Chem. B.</source> <volume>103</volume>, <fpage>7942</fpage>&#x2013;<lpage>7957</lpage>. <pub-id pub-id-type="doi">10.1021/jp991300&#x2b;</pub-id> </citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lazaridis</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>1998</year>). <article-title>Inhomogeneous Fluid Approach to Solvation Thermodynamics. 1. Theory</article-title>. <source>J.&#x20;Phys. Chem. B.</source> <volume>102</volume>, <fpage>3531</fpage>&#x2013;<lpage>3541</lpage>. <pub-id pub-id-type="doi">10.1021/jp9723574</pub-id> </citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lemmon</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Meiler</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Towards Ligand Docking Including Explicit Interface Water Molecules</article-title>. <source>PloS one.</source> <volume>8</volume>, <fpage>e67536</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0067536</pub-id> </citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Holloway</surname>
<given-names>M. K.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Prediction of the Favorable Hydration Sites in a Protein Binding Pocket and its Application to Scoring Function Formulation</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>60</volume>, <fpage>4359</fpage>&#x2013;<lpage>4375</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.9b00619</pub-id> </citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>C.-Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Analysis of Ligand-Bound Water Molecules in High-Resolution Crystal Structures of Protein&#x2212;Ligand Complexes</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>47</volume>, <fpage>668</fpage>&#x2013;<lpage>675</lpage>. <pub-id pub-id-type="doi">10.1021/ci6003527</pub-id> </citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Masters</surname>
<given-names>M. R.</given-names>
</name>
<name>
<surname>Mahmoud</surname>
<given-names>A. H.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Lill</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Efficient and Accurate Hydration Site Profiling for Enclosed Binding Sites</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>58</volume>, <fpage>2183</fpage>&#x2013;<lpage>2188</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.8b00544</pub-id> </citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Maurer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Oostenbrink</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Water in Protein Hydration and Ligand Recognition</article-title>. <source>J.&#x20;Mol. Recognit.</source> <volume>32</volume>, <fpage>e2810</fpage>. <pub-id pub-id-type="doi">10.1002/jmr.2810</pub-id> </citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Michel</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Tirado-Rives</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jorgensen</surname>
<given-names>W. L.</given-names>
</name>
</person-group> (<year>2009</year>). <article-title>Prediction of the Water Content in Protein Binding Sites</article-title>. <source>J.&#x20;Phys. Chem. B.</source> <volume>113</volume>, <fpage>13337</fpage>&#x2013;<lpage>13346</lpage>. <pub-id pub-id-type="doi">10.1021/jp9047456</pub-id> </citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Morozenko</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Leontyev</surname>
<given-names>I. V.</given-names>
</name>
<name>
<surname>Stuchebrukhov</surname>
<given-names>A. A.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Dipole Moment and Binding Energy of Water in Proteins From Crystallographic Analysis</article-title>. <source>J.&#x20;Chem. Theor. Comput.</source> <volume>10</volume>, <fpage>4618</fpage>&#x2013;<lpage>4623</lpage>. <pub-id pub-id-type="doi">10.1021/ct500358r</pub-id> </citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Morozenko</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Stuchebrukhov</surname>
<given-names>A. A.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Dowser&#x2b;&#x2b;, a New Method of Hydrating Protein Structures</article-title>. <source>Proteins.</source> <volume>84</volume>, <fpage>1347</fpage>&#x2013;<lpage>1357</lpage>. <pub-id pub-id-type="doi">10.1002/prot.25081</pub-id> </citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Murphy</surname>
<given-names>R. B.</given-names>
</name>
<name>
<surname>Repasky</surname>
<given-names>M. P.</given-names>
</name>
<name>
<surname>Greenwood</surname>
<given-names>J.&#x20;R.</given-names>
</name>
<name>
<surname>Tubert-Brohman</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Jerome</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Annabhimoju</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>WScore: A Flexible and Accurate Treatment of Explicit Water Molecules in Ligand-Receptor Docking</article-title>. <source>J.&#x20;Med. Chem.</source> <volume>59</volume>, <fpage>4364</fpage>&#x2013;<lpage>4384</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jmedchem.6b00131</pub-id> </citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Nittinger</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Flachsenberg</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Bietz</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Lange</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Klein</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Rarey</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Placement of Water Molecules in Protein Structures: From Large-Scale Evaluations to Single-Case Examples</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>58</volume>, <fpage>1625</fpage>&#x2013;<lpage>1637</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jcim.8b00271</pub-id> </citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ramachandran</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Zoph</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q. V.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Swish: a Self-Gated Activation Function</article-title>. <comment>arXiv preprint arXiv:1710.05941 7</comment> </citation>
</ref>
<ref id="B1">
<citation citation-type="web">
<collab>RCSB Protein Data Bank</collab> (<year>2020</year>). <article-title>PDB statistics: PDB Data Distribution by Resolution</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="http://web.archive.org/web/20080207010024/">http://web.archive.org/web/20080207010024/</ext-link> (Accessed May 12, 2020)</comment>. </citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ross</surname>
<given-names>G. A.</given-names>
</name>
<name>
<surname>Morris</surname>
<given-names>G. M.</given-names>
</name>
<name>
<surname>Biggin</surname>
<given-names>P. C.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Rapid and Accurate Prediction and Scoring of Water Molecules in Protein Binding Sites</article-title>. <source>PloS one.</source> <volume>7</volume>, <fpage>e32036</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0032036</pub-id> </citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rossato</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Ernst</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Vedani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Smie&#x161;ko</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>AcquaAlta: A Directional Approach to the Solvation of Ligand-Protein Complexes</article-title>. <source>J.&#x20;Chem. Inf. Model.</source> <volume>51</volume>, <fpage>1867</fpage>&#x2013;<lpage>1881</lpage>. <pub-id pub-id-type="doi">10.1021/ci200150p</pub-id> </citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Roy</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kovalenko</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Biomolecular Simulations with the Three-Dimensional Reference Interaction Site Model With the Kovalenko-Hirata Closure Molecular Solvation Theory</article-title>. <source>Int. J.&#x20;Mol. Sci.</source> <volume>22</volume>, <fpage>5061</fpage>. <pub-id pub-id-type="doi">10.3390/ijms22105061</pub-id> </citation>
</ref>
<ref id="B24">
<citation citation-type="book">
<collab>Schr&#x00f6;dinger, LLC, New York, NY</collab> (<year>2020</year>). <article-title>Schr&#x00f6;dinger Release 2020-4: WaterMap</article-title>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.schrodinger.com/watermap">https://www.schrodinger.com/watermap</ext-link> (Accessed May 12, 2020)</comment>. </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sch&#xfc;tt</surname>
<given-names>K. T.</given-names>
</name>
<name>
<surname>Sauceda</surname>
<given-names>H. E.</given-names>
</name>
<name>
<surname>Kindermans</surname>
<given-names>P.-J.</given-names>
</name>
<name>
<surname>Tkatchenko</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>M&#xfc;ller</surname>
<given-names>K.-R.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>SchNet - A Deep Learning Architecture for Molecules and Materials</article-title>. <source>J.&#x20;Chem. Phys.</source> <volume>148</volume>, <fpage>241722</fpage>. <pub-id pub-id-type="doi">10.1063/1.5019779</pub-id> </citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Senior</surname>
<given-names>A. W.</given-names>
</name>
<name>
<surname>Evans</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Jumper</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kirkpatrick</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sifre</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Green</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Improved Protein Structure Prediction Using Potentials From Deep Learning</article-title>. <source>Nature.</source> <volume>577</volume>, <fpage>706</fpage>&#x2013;<lpage>710</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-019-1923-7</pub-id> </citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sindhikara</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Yoshida</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Hirata</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2012</year>). <article-title>Placevent: An Algorithm for Prediction of Explicit Solvent Atom Distribution-Application to HIV-1 Protease and F-ATP Synthase</article-title>. <source>J.&#x20;Comput. Chem.</source> <volume>33</volume>, <fpage>1536</fpage>&#x2013;<lpage>1543</lpage>. <pub-id pub-id-type="doi">10.1002/jcc.22984</pub-id> </citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sridhar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ross</surname>
<given-names>G. A.</given-names>
</name>
<name>
<surname>Biggin</surname>
<given-names>P. C.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Waterdock 2.0: Water Placement Prediction for Holo-Structures With a Pymol Plugin</article-title>. <source>Plos One.</source> <volume>12</volume>, <fpage>e0172743</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0172743</pub-id> </citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Trott</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Olson</surname>
<given-names>A. J.</given-names>
</name>
</person-group> (<year>2010</year>). <article-title>AutoDock Vina: Improving the Speed and Accuracy of Docking With a New Scoring Function, Efficient Optimization, and Multithreading</article-title>. <source>J.&#x20;Comput. Chem.</source> <volume>31</volume>, <fpage>455</fpage>&#x2013;<lpage>461</lpage>. <pub-id pub-id-type="doi">10.1002/jcc.21334</pub-id> </citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lill</surname>
<given-names>M. A.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>WATsite2.0 With PyMOL Plugin: Hydration Site Prediction and Visualization</article-title>. <source>Protein Funct. Prediction</source>, <fpage>123</fpage>&#x2013;<lpage>134</lpage>. <pub-id pub-id-type="doi">10.1007/978-1-4939-7015-5_10</pub-id> </citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yoshidome</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ikeguchi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ohta</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Comprehensive 3D&#x2010;RISM Analysis of the Hydration of Small Molecule Binding Sites in Ligand&#x2010;Free Protein Structures</article-title>. <source>J.&#x20;Comput. Chem.</source> <volume>41</volume>, <fpage>2406</fpage>&#x2013;<lpage>2419</lpage>. <pub-id pub-id-type="doi">10.1002/jcc.26406</pub-id> </citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Young</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Abel</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Berne</surname>
<given-names>B. J.</given-names>
</name>
<name>
<surname>Friesner</surname>
<given-names>R. A.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Motifs for Molecular Recognition Exploiting Hydrophobic Enclosure in Protein-Ligand Binding</article-title>. <source>Proc. Natl. Acad. Sci.</source> <volume>104</volume>, <fpage>808</fpage>&#x2013;<lpage>813</lpage>. <pub-id pub-id-type="doi">10.1073/pnas.0610202104</pub-id> </citation>
</ref>
</ref-list>
</back>
</article>