<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article article-type="methods-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Chem. Eng.</journal-id>
<journal-title>Frontiers in Chemical Engineering</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Chem. Eng.</abbrev-journal-title>
<issn pub-type="epub">2673-2718</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1227620</article-id>
<article-id pub-id-type="doi">10.3389/fceng.2023.1227620</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Chemical Engineering</subject>
<subj-group>
<subject>Methods</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Solubilization of inclusion bodies: insights from explainable machine learning approaches</article-title>
<alt-title alt-title-type="left-running-head">Walther et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fceng.2023.1227620">10.3389/fceng.2023.1227620</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Walther</surname>
<given-names>Cornelia</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Martinetz</surname>
<given-names>Michael C.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2312923/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Friedrich</surname>
<given-names>Anja</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2103352/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Tschelie&#xdf;nig</surname>
<given-names>Anne-Luise</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Voigtmann</surname>
<given-names>Martin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2034860/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jung</surname>
<given-names>Alexander</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Brocard</surname>
<given-names>C&#x00E9;cile</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Bluhmki</surname>
<given-names>Erich</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Smiatek</surname>
<given-names>Jens</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff6">
<sup>6</sup>
</xref>
<xref ref-type="aff" rid="aff7">
<sup>7</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Boehringer Ingelheim RCV GmbH &#x26; Co KG</institution>, <institution>Biopharma Austria</institution>, <institution>Process Science Downstream Development</institution>, <addr-line>Vienna</addr-line>, <country>Austria</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Pharmaceutical Sciences</institution>, <institution>Baxalta Innovations GmbH</institution>, <institution>Takeda Group</institution>, <addr-line>Vienna</addr-line>, <country>Austria</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Boehringer Ingelheim Pharma GmbH &#x26; Co. KG</institution>, <institution>Global Innovation and Alliance Management</institution>, <addr-line>Biberach</addr-line>, <country>Germany</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Boehringer Ingelheim Pharma GmbH &#x26; Co. KG</institution>, <institution>Analytical Development Biologicals</institution>, <addr-line>Biberach</addr-line>, <country>Germany</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Biberach University of Applied Sciences</institution>, <addr-line>Biberach</addr-line>, <country>Germany</country>
</aff>
<aff id="aff6">
<sup>6</sup>
<institution>Boehringer Ingelheim Pharma GmbH &#x26; Co. KG</institution>, <institution>Development NCE</institution>, <addr-line>Biberach</addr-line>, <country>Germany</country>
</aff>
<aff id="aff7">
<sup>7</sup>
<institution>Institute for Computational Physics</institution>, <institution>University of Stuttgart</institution>, <addr-line>Stuttgart</addr-line>, <country>Germany</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/578509/overview">Jin Wang</ext-link>, Auburn University, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1516269/overview">Joel Paulson</ext-link>, The Ohio State University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1193304/overview">Gefei Chen</ext-link>, Karolinska Institutet (KI), Sweden</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Cornelia Walther, <email>cornelia.walther@boehringer-ingelheim.com</email>; Jens Smiatek, <email>jens.smiatek@boehringer-ingelheim.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>07</day>
<month>08</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>5</volume>
<elocation-id>1227620</elocation-id>
<history>
<date date-type="received">
<day>23</day>
<month>05</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>20</day>
<month>07</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Walther, Martinetz, Friedrich, Tschelie&#xdf;nig, Voigtmann, Jung, Brocard, Bluhmki and Smiatek.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Walther, Martinetz, Friedrich, Tschelie&#xdf;nig, Voigtmann, Jung, Brocard, Bluhmki and Smiatek</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>We present explainable machine learning approaches for gaining deeper insights into the solubilization processes of inclusion bodies. The machine learning model with the highest prediction accuracy for the protein yield is further evaluated with regard to Shapley additive explanation (SHAP) values in terms of feature importance studies. Our results highlight an inverse fractional relationship between the protein yield and total protein concentration. Further correlations can also be observed for the dominant influences of the urea concentration and the underlying pH values. All findings are used to develop an analytical expression that is in reasonable agreement with experimental data. The resulting master curve highlights the benefits of explainable machine learning approaches for the detailed understanding of certain biopharmaceutical manufacturing steps.</p>
</abstract>
<kwd-group>
<kwd>inclusion bodies</kwd>
<kwd>refolding</kwd>
<kwd>solubilization</kwd>
<kwd>SHAP (shapley additive explanation)</kwd>
<kwd>explainable machine learning</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computational Methods in Chemical Engineering</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>High-level expression of recombinant proteins differs substantially between mammalian and microbial cells. In addition to missing post-translational modifications like glycosylation, cells from <italic>Escherichia coli</italic> (<italic>E. coli</italic>) also accumulate proteins at high concentration in aggregated form (<xref ref-type="bibr" rid="B37">Singh and Panda, 2005</xref>; <xref ref-type="bibr" rid="B38">Singhvi et al., 2020</xref>). Such inclusion bodies are dense particles of amorphous or para-crystalline protein arrangements (<xref ref-type="bibr" rid="B11">Freydell et al., 2007</xref>) that accumulate either in the cytoplasm or periplasm. The size of inclusion bodies varies between 0.5&#xa0;<italic>&#x3bc;</italic>m and 1.3&#xa0;<italic>&#x3bc;</italic>m with an average density of 1.3&#xa0;mg/mL (<xref ref-type="bibr" rid="B11">Freydell et al., 2007</xref>). Despite further processing steps that are required, impurities such as host cell proteins or DNA/RNA fragments are significantly reduced in inclusion bodies. In consequence, nearly 70%&#x2013;90% of the mass is represented by the recombinant protein, such that inclusion bodies can be considered as an interesting high-level expression system with certain advantages (<xref ref-type="bibr" rid="B46">Valax and Georgiou, 1993</xref>; <xref ref-type="bibr" rid="B37">Singh and Panda, 2005</xref>; <xref ref-type="bibr" rid="B33">Ram&#xf3;n et al., 2014</xref>). However, proteins from inclusion bodies often show missing biological activity due to denatured states, so further bioprocessing steps such as solubilization, refolding, and purification are required for efficient recovery.</p>
<p>Recent articles already studied the microporous structure of inclusion bodies (<xref ref-type="bibr" rid="B4">Bowden et al., 1991</xref>; <xref ref-type="bibr" rid="B49">Walther et al., 2013</xref>) and proposed a potential solubilization mechanism. Solubilization steps are usually performed in stirred reactors which facilitate the individual solvation of the proteins (<xref ref-type="bibr" rid="B50">Walther et al., 2014</xref>). By default, chemical denaturants are often used for mild solubilization conditions. In more detail, standard chemical denaturants like urea or guanidinium hydrochloride usually dissolve the protein from inclusion bodies in combination with reducing agents like mercaptoethanol or dithiothreitol (DTT) (<xref ref-type="bibr" rid="B8">Clark, 1998</xref>; <xref ref-type="bibr" rid="B37">Singh and Panda, 2005</xref>). The reducing agents are mainly used for the reduction of disulfide bonds in terms of protein refolding aspects (<xref ref-type="bibr" rid="B8">Clark, 1998</xref>). Notably, the presence of strongly denaturing agents at high concentrations results in a further loss of the native structure (<xref ref-type="bibr" rid="B39">Smiatek, 2017</xref>; <xref ref-type="bibr" rid="B27">Oprzeska-Zingrebe and Smiatek, 2018</xref>). The subsequent refolding step usually aims to improve the low yields of bioactive proteins, such that optimal conditions from solubilization and refolding were in the center of research in recent years (<xref ref-type="bibr" rid="B11">Freydell et al., 2007</xref>; <xref ref-type="bibr" rid="B51">Walther et al., 2022</xref>). Despite rational design of experiments (DoE) approaches in combination with machine learning techniques, the underlying mechanisms and the importance of individual features on the solubilization process are still only poorly understood (<xref ref-type="bibr" rid="B51">Walther et al., 2022</xref>).</p>
<p>In recent years, the use of machine learning has shifted slightly from pure prediction to understanding. Hence, more effort was put into the understanding of feature importances with regard to &#x201c;explainable machine learning&#x201d; approaches (<xref ref-type="bibr" rid="B16">Holzinger et al., 2018</xref>; <xref ref-type="bibr" rid="B14">Gunning et al., 2019</xref>; <xref ref-type="bibr" rid="B17">Kailkhura et al., 2019</xref>; <xref ref-type="bibr" rid="B21">Linardatos et al., 2020</xref>; <xref ref-type="bibr" rid="B35">Roscher et al., 2020</xref>; <xref ref-type="bibr" rid="B2">Belle and Papantonis, 2021</xref>; <xref ref-type="bibr" rid="B7">Burkart and Huber, 2021</xref>; <xref ref-type="bibr" rid="B31">Pilania, 2021</xref>; <xref ref-type="bibr" rid="B29">Oviedo et al., 2022</xref>). In agreement with global sensitivity analysis for parametric models (<xref ref-type="bibr" rid="B45">Sudret, 2008</xref>), recent methods like Shapley additive explanations (SHAP) (<xref ref-type="bibr" rid="B22">Lundberg and Lee, 2017</xref>), local interpretable model-agnostic explanations (LIME) (<xref ref-type="bibr" rid="B34">Ribeiro et al., 2016</xref>; <xref ref-type="bibr" rid="B22">Lundberg and Lee, 2017</xref>) or the &#x201c;explain like I am 5&#x201d; (ELI5) approach (<xref ref-type="bibr" rid="B1">Agarwal and Das, 2020</xref>) provide model-agnostic evaluations of feature importances with regard to the model outcomes. Although the purpose and general goal of these methods is similar, slight differences can be observed in their concepts. Global sensitivity analysis is mostly used for parametric models, whereby the weights of individual parameters are calculated by evaluating Monte Carlo simulations (<xref ref-type="bibr" rid="B45">Sudret, 2008</xref>). An extension of the sensitivity analysis is also the application of Polynomial Chaos Expansions (<xref ref-type="bibr" rid="B45">Sudret, 2008</xref>). 
However, these approaches are very computationally intensive, so that these methods are mostly calculated for parametric models with fewer than ten influencing variables. In contrast, SHAP analysis is rooted in game theory, such that it connects optimal credit allocation with local explanations using the classic Shapley values from game theory and their related extensions (<xref ref-type="bibr" rid="B22">Lundberg and Lee, 2017</xref>). In more detail, Shapley values are introduced to quantify the contribution of individual players in a cooperative game. The Shapley values are determined by a weighted average calculation over all possible player orders. For each order of players, the marginal contribution of each player is calculated and multiplied by a weight that depends on the probability of this order. The Shapley values are then the sum of the weighted marginal contributions over all possible orders. In terms of explainable machine learning, the Shapley values are used to quantify the importance of individual features in terms of model predictions. In contrast to the SHAP approach, LIME attempts to explain model predictions at an instance level from the data set. An approximation in terms of a simplified model is developed for each selected instance (<xref ref-type="bibr" rid="B34">Ribeiro et al., 2016</xref>; <xref ref-type="bibr" rid="B22">Lundberg and Lee, 2017</xref>). After that, the model approximations are weighted based on their similarity to the original instance. All features that are relevant for the prediction of the model are taken into account. Thus, the weighted model approximations are used to generate an explanation for the prediction of the original instance. It thus enables a local interpretation of the predictions and helps to understand the decisions of a model on an instance level. 
In addition, ELI5 uses various techniques to determine the importance of each feature for the model predictions (<xref ref-type="bibr" rid="B1">Agarwal and Das, 2020</xref>). This can be done, for example, by calculating feature weights or by analyzing the feature contributions. Based on the identified feature importances, a comprehensible explanation for the model predictions is then evaluated. By avoiding Monte Carlo methods as used in sensitivity analysis, SHAP analysis, LIME and ELI5 can evaluate significantly more input features and are more flexible in terms of their usage for non-parametric machine learning models.</p>
<p>In general, all explainable machine learning approaches can be applied for data-driven and non-parametric approaches which are systematically evaluated in order to understand the feature-target value correlations. Although it has to be noted that biopharmaceutical modelling is still dominated by standard parametric models (<xref ref-type="bibr" rid="B42">Smiatek et al., 2020</xref>), recent machine learning approaches already revealed the benefits of non-parametric evaluations for certain process steps (<xref ref-type="bibr" rid="B54">Yang et al., 2020</xref>; <xref ref-type="bibr" rid="B40">Smiatek et al., 2021a</xref>; <xref ref-type="bibr" rid="B25">Montano Herrera et al., 2022</xref>; <xref ref-type="bibr" rid="B51">Walther et al., 2022</xref>). Hence, it can be expected that the model-agnostic interpretation of correlations between feature and target values may provide some further insights into the molecular mechanisms of the solubilization process.</p>
<p>In this article, we present an explainable machine learning approach to study the solubilization of inclusion bodies in terms of molecular mechanisms. A series of experiments with systematic parameter variations for the total protein concentration, certain co-solute concentrations and pH values were conducted in order to evaluate their impact on the final yield values. The corresponding data are used for the training of different machine learning models. We show that the best model with the highest predictive accuracy can be further used for feature importance analysis in terms of SHAP values. The corresponding results provide meaningful insights for the development of an analytic theory which is in reasonable agreement with the experimental outcomes.</p>
</sec>
<sec id="s2">
<title>2 Experimental and computational details</title>
<p>The experimental data set included 188 values for the protein yield after solubilization with systematically varied feature values. A detailed description of the data set and the experimental protocols was already presented in <xref ref-type="bibr" rid="B51">Walther et al. (2022)</xref>. In contrast to the previous publication, we explicitly focus on one unit operation. Hence, the consideration and optimization of coupled unit operations as studied in <xref ref-type="bibr" rid="B51">Walther et al. (2022)</xref> is not the purpose of this work. Moreover, we aim to provide a reliable description and further understanding of the molecular mechanisms underlying the solubilization process due to explainable machine learning approaches. More details on the experimental procedures can be found in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>.</p>
<p>The protein yield <italic>y</italic> is defined as<disp-formula id="e1">
<mml:math id="m1">
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(1)</label>
</disp-formula>where <italic>c</italic>
<sub>
<italic>t</italic>
</sub> denotes the total protein concentration and <italic>c</italic>
<sub>
<italic>s</italic>
</sub> the concentration of solubilized proteins. As varying parameters in a DoE approach (<xref ref-type="bibr" rid="B32">Politis et al., 2017</xref>), we chose the pH value, the urea concentration <italic>c</italic>
<sub>
<italic>U</italic>
</sub>, the total protein concentration <italic>c</italic>
<sub>
<italic>t</italic>
</sub>, the DTT concentration <italic>c</italic>
<sub>
<italic>D</italic>
</sub> and the guanidinium hydrochloride concentration <italic>c</italic>
<sub>
<italic>G</italic>
</sub>. The corresponding parameters were independently varied from <italic>c</italic>
<sub>
<italic>t</italic>
</sub> &#x3d; (2&#x2013;6) mol/L, pH &#x3d; 6&#x2013;12, <italic>c</italic>
<sub>
<italic>D</italic>
</sub> &#x3d; (0.00&#x2013;0.01) mol/L, <italic>c</italic>
<sub>
<italic>G</italic>
</sub> &#x3d; 0&#x2013;1&#xa0;mol/L and <italic>c</italic>
<sub>
<italic>U</italic>
</sub> &#x3d; (4.0&#x2013;8.5) mol/L. The resulting yield values showed a range of <italic>y</italic> &#x3d; 0.143&#x2013;0.996. The pH values were transformed according to the relation (<xref ref-type="bibr" rid="B20">Landsgesell et al., 2017</xref>)<disp-formula id="e2">
<mml:math id="m2">
<mml:mi>q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mi mathvariant="normal">H</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mi mathvariant="normal">I</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mn>10</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mtext>pI</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mi mathvariant="normal">H</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(2)</label>
</disp-formula>which denotes the ratio of protonated titratable groups over the total number of titratable groups. The considered protein was an antibody fragment with an isoelectric point of pI &#x3d; 8.4.</p>
<p>For the choice of the best model, we used different regression approaches. More detailed information can be found in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>. We then performed hyperparameter optimization for the histogram gradient boosting model (HGB) (<xref ref-type="bibr" rid="B3">Blaser and Fryzlewicz, 2016</xref>) which showed the highest prediction accuracy. The corresponding hyperparameter optimized settings were a learning rate of 0.1, a maximum number of iterations of 90 (i.e., the number of individual decision trees), and a minimum number of leaves of 20 in accordance with the nomenclature of scikit-learn 1.0.1 (<xref ref-type="bibr" rid="B30">Pedregosa et al., 2011</xref>). The corresponding results for the hyperparameter optimization procedure are presented in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>.</p>
<p>The source code was written in Python 3.9.1 (<xref ref-type="bibr" rid="B47">Van Rossum and Drake, 2009</xref>) in combination with the modules NumPy 1.19.5 (<xref ref-type="bibr" rid="B15">Harris et al., 2020</xref>), scikit-learn 1.0.1 (<xref ref-type="bibr" rid="B30">Pedregosa et al., 2011</xref>), XGBoost 1.6.0 (<xref ref-type="bibr" rid="B6">Brownlee, 2016</xref>), Pandas 1.2.1 (<xref ref-type="bibr" rid="B52">Wes McKinney, 2010</xref>) and SHAP 0.40.0 (<xref ref-type="bibr" rid="B22">Lundberg and Lee, 2017</xref>). If not noted otherwise, all methods were used with default values.</p>
</sec>
<sec id="s3">
<title>3 Theoretical background: machine learning and feature importance analysis</title>
<sec id="s3-1">
<title>3.1 Machine learning and regression algorithms</title>
<p>The considered machine learning approaches can be divided into individual classes. An important model class includes the decision tree based models like Decision Trees (DT), Extra Trees (ET), Random Forests (RF), Gradient Boosting (GB), AdaBoost (ADA), Histogram-Based Gradient Boosting (HGB), Bagging (BAG) and Extreme Gradient Boosting (XGB). In general, decision tree-based models can be seen as non-parametric supervised learning methods which are often used for classification and regression. The value of a target variable is approximated by introducing simple decision rules based on arithmetic mean values for regression approaches as inferred from the data that represent the independent variables. The hierarchy of decision criteria forms different branches in terms of a tree-like structure. The various methods differ in their assumption on their underlying models (<xref ref-type="bibr" rid="B48">Wakjira et al., 2022</xref>; <xref ref-type="bibr" rid="B10">Feng et al., 2021</xref>). In contrast to a single weak learning model like DT, ensemble methods like ET, RF, GB, XGB, ADA, BAG and HGB consider an ensemble of different weak learning models. The main purpose of ensemble models is the combination of multiple decision trees to improve the overall performance. In more detail, ET and RF are both composed of a large number of decision trees, where the final decision is obtained taking into account the prediction of every tree. In contrast to ET, RF uses bootstrap replicas and optimal split points for decision criteria whereas ET considers the whole original data sample and randomly drawn split points. Overfitting is decreased by randomized feature selection for split selection which reduces the correlation between the individual trees in the ensemble. In addition, in other tree-based ensemble methods, a distinction can also be made between boosting and bagging approaches. 
Boosting approaches such as GB, XGB and ADA generate a weak prediction model at each step which is added sequentially to the full ensemble model. Such a weighted approach reduces variance and bias which improves the model performance. In contrast, bagging methods such as BAG and HGB generate weak single DT models in parallel. Accordingly, bagging, which stands for bootstrap aggregating, trains multiple weak learners in parallel and independently of each other. The final individual models are added to the ensemble by a deterministic averaging process which depends on the weights of accurate or inaccurate predictions. The individual models also differ in their definition and consideration of loss or objective functions for predictions in the training phase.</p>
<p>Statistical estimates for the predictive accuracy of the models are usually computed by the root-mean-squared error (RMSE) or the normalized root-mean-squared error (nRMSE) of predictions. The corresponding predicted values <inline-formula id="inf1">
<mml:math id="m3">
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula> are compared with the actual experimental values <italic>Y</italic>
<sub>
<italic>n</italic>
</sub> where <italic>n</italic> denotes the running index in terms of the associated RMSE value <inline-formula id="inf2">
<mml:math id="m4">
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> as defined by<disp-formula id="e3">
<mml:math id="m5">
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:msqrt>
</mml:math>
<label>(3)</label>
</disp-formula>with the number of samples <italic>P</italic> &#x3d; 188 in our data set. For estimating the model accuracy in comparison with the standard deviation of the target values, one can compute the normalized RMSE values <inline-formula id="inf3">
<mml:math id="m6">
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> in accordance with<disp-formula id="e4">
<mml:math id="m7">
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="normal">R</mml:mi>
<mml:mi mathvariant="normal">M</mml:mi>
<mml:mi mathvariant="normal">S</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>Y</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:mi>&#x3c3;</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(4)</label>
</disp-formula>with the experimental standard deviation <inline-formula id="inf4">
<mml:math id="m8">
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mo>,</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mi>P</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:msqrt>
</mml:math>
</inline-formula> and the mean experimental target value <inline-formula id="inf5">
<mml:math id="m9">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mo>&#x304;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mi>P</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:msub>
<mml:mrow>
<mml:mi>Y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
</inline-formula>.</p>
</sec>
<sec id="s3-2">
<title>3.2 Feature importance analysis</title>
<p>SHAP value analysis as developed by Lundberg and Lee (<xref ref-type="bibr" rid="B22">Lundberg and Lee, 2017</xref>) is closely related to Shapley values as introduced in game theory (<xref ref-type="bibr" rid="B36">Shapley, 1953</xref>). In short, Shapley values provide estimates for the distribution of gains or pay-outs equally among the players (<xref ref-type="bibr" rid="B24">Molnar et al., 2020</xref>). Thus, SHAP analysis aims to rationalize a prediction of a specific value by consideration of the feature contributions. The individual values of features in the data set can be interpreted as players in a coalition game (<xref ref-type="bibr" rid="B22">Lundberg and Lee, 2017</xref>; <xref ref-type="bibr" rid="B44">&#x160;trumbelj and Kononenko, 2014</xref>). In more detail, the algorithm works as follows (<xref ref-type="bibr" rid="B22">Lundberg and Lee, 2017</xref>). First, a subset <italic>S</italic> is randomly selected from all features <italic>F</italic>. The selected model is then trained on all feature subsets of <italic>S</italic> &#x2286; <italic>F</italic>. To estimate the feature effects, one model <italic>f</italic>
<sub>
<italic>S</italic>&#x222a;{<italic>i</italic>}</sub> is trained with and another model <italic>f</italic>
<sub>
<italic>S</italic>
</sub> is trained without the feature. The predictions of the two models are compared using the current input <italic>f</italic>
<sub>
<italic>S</italic>&#x222a;{<italic>i</italic>}</sub>(<italic>x</italic>
<sub>
<italic>S</italic>&#x222a;{<italic>i</italic>}</sub>) &#x2212; <italic>f</italic>
<sub>
<italic>S</italic>
</sub>(<italic>x</italic>
<sub>
<italic>S</italic>
</sub>), where <italic>x</italic>
<sub>
<italic>S</italic>
</sub> are the values of the input features in <italic>S</italic>. Since the effect of withholding a feature depends on other features in the model, the above differences are calculated for all possible subsets <italic>S</italic> &#x2286; <italic>F</italic> &#x2216; {<italic>i</italic>}. The Shapley values are then calculated and assigned to the individual features. In more detail, this can be considered as a weighted average of all possible differences with reference to<disp-formula id="e5">
<mml:math id="m10">
<mml:msub>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munder>
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x2286;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo>&#x2216;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:munder>
<mml:mfrac>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>!</mml:mo>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
<mml:mo>!</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mo stretchy="false">&#x7c;</mml:mo>
<mml:mo>!</mml:mo>
</mml:mrow>
</mml:mfrac>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x222a;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x222a;</mml:mo>
<mml:mfenced open="{" close="}">
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>x</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>S</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(5)</label>
</disp-formula>which shows that this approach assigns each feature an importance value that represents the impact of including that feature on the model prediction.</p>
<p>In addition, the Gini feature importance analysis focuses on the mean decrease in impurity of decision-tree based models. For this, one calculates the total decrease in node impurity for a decision tree based model weighted by the probability of reaching that node. The probability of reaching a node can be approximated by the proportion of samples averaged over all trees of the ensemble (<xref ref-type="bibr" rid="B5">Breiman et al., 1984</xref>). The Gini index then estimates the probability for a random instance being misclassified when chosen randomly. The higher the value of this coefficient, the higher is the confidence that the particular feature splits the data into distinct groups.</p>
</sec>
</sec>
<sec sec-type="results" id="s4">
<title>4 Results</title>
<p>In the first subsection, we study the correlations between the target and feature values for the experimental data and present the outcomes of the machine learning approaches. A detailed analysis of the feature importances in terms of explainable machine learning approaches is presented in the second subsection. The corresponding insights allow us to rationalize a molecular mechanism in combination with an analytic expression in good agreement with the experimental results.</p>
<sec id="s4-1">
<title>4.1 Correlation analysis and machine learning</title>
<p>A heatmap of all experimental correlations in terms of the Pearson correlation coefficient <italic>r</italic> between the target value ln&#x2009;<italic>y</italic> and all feature values for 188 data points is presented in <xref ref-type="fig" rid="F1">Figure 1</xref>. In addition to diagonal elements, notable correlations in terms of &#x7c;<italic>r</italic>&#x7c; &#x3e; 0.5 can only be identified for the logarithm of the total protein concentration ln&#x2009;<italic>c</italic>
<sub>
<italic>t</italic>
</sub>. All other values are smaller than &#x7c;<italic>r</italic>&#x7c; &#x3c; 0.5 in accordance with negligible correlations. Thus, it can be concluded that cross-correlations between the feature values are of minor importance. The corresponding correlation coefficients for the individual feature correlations with ln&#x2009;<italic>y</italic> are <italic>r</italic> &#x3d; &#x2212;0.69 (ln&#x2009;<italic>c</italic>
<sub>
<italic>t</italic>
</sub>), <italic>r</italic> &#x3d; 0.02 (<italic>c</italic>
<sub>
<italic>D</italic>
</sub>), <italic>r</italic> &#x3d; 0.09 (<italic>c</italic>
<sub>
<italic>G</italic>
</sub>), <italic>r</italic> &#x3d; 0.18 (<italic>q</italic>(pH &#x2212; pI)) and <italic>r</italic> &#x3d; 0.28 (<italic>c</italic>
<sub>
<italic>U</italic>
</sub>). All values are also visualized in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>. In consequence, non-vanishing positive correlation coefficients for ln&#x2009;<italic>y</italic> can only be identified for the actual urea concentration and the fraction of protonated titrable groups. In contrast, the total protein concentration <italic>c</italic>
<sub>
<italic>t</italic>
</sub> shows a strong negative correlation and the correlations for guanidinium hydrochloride and DTT are negligible. Thus, it can be concluded that the total protein concentration dominates the final yield values. The corresponding <italic>p</italic>-values from a Spearman rank-order correlation coefficient analysis are listed in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>. As can be seen, all values regarding the correlation between ln&#x2009;<italic>y</italic> and all features are less than <italic>p</italic> &#x3c; 0.05 for <italic>q</italic>(pH-pI), <italic>c</italic>
<sub>
<italic>U</italic>
</sub> and ln&#x2009;<italic>c</italic>
<sub>
<italic>t</italic>
</sub>. The corresponding values for <italic>c</italic>
<sub>
<italic>G</italic>
</sub> and <italic>c</italic>
<sub>
<italic>D</italic>
</sub> are quite high, which can be explained by the low concentrations, so not many settings can be chosen independently. However, one can conclude that <italic>c</italic>
<sub>
<italic>G</italic>
</sub> and <italic>c</italic>
<sub>
<italic>D</italic>
</sub> have a minor effect on the values of ln&#x2009;<italic>y</italic>. In addition, higher <italic>p</italic> values between the individual feature correlations are noticeable. This can be understood in terms of the preparation of the dataset obtained from a design of experiment study, which focuses solely on the individual effects of the characteristics on the target values.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Pearson correlation coefficients <italic>r</italic> between feature values and ln&#x2009;<italic>y</italic> from the experimental data. The colors of the individual entries highlight the corresponding value of the Pearson correlation coefficient from <italic>r</italic> &#x3d;1 (blue) via <italic>r</italic> &#x3d;0 (white) to <italic>r</italic> &#x3d;&#x2212;1 (red).</p>
</caption>
<graphic xlink:href="fceng-05-1227620-g001.tif"/>
</fig>
<p>As a next step, the corresponding supervised non-optimized machine learning and standard regression methods are assessed in order to predict the corresponding ln&#x2009;<italic>y</italic> values with regard to a k-fold cross-validation scheme including successive permutations of the training set (<xref ref-type="bibr" rid="B12">Gareth et al., 2013</xref>; <xref ref-type="bibr" rid="B53">Wong, 2015</xref>). As can be seen in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>, the highest predictive accuracy for a qualitative assessment of the non-optimized models is achieved for gradient boosting and decision tree-based methods. In more detail, histogram gradient boosting (HGB), extra trees (ET), gradient boosting (GB) and random forests (RF) show low nRMSE values between 0.19 and 0.21. The corresponding coefficients of determination between predicted and actual values vary between <italic>R</italic>
<sup>2</sup> &#x3d; 0.72&#x2013;0.75. Noteworthy, ensemble boosting methods are ideally suited for non-linear regression problems like solubilization processes. Potential reasons for the high accuracy of decision tree-based models were recently published (<xref ref-type="bibr" rid="B13">Grinsztajn et al., 2022</xref>). In more detail, decision-tree based models do not overly smooth the solution in terms of predicted target values. Moreover, it was shown that uninformative features do not affect the performance metrics of decision-tree based models as much as for other machine learning approaches. In terms of such findings, it becomes clear that decision-tree based models often outperform kernel-based approaches in terms of predictive accuracies. It has to be noted that ensemble-based methods are also often robust against overfitting issues (<xref ref-type="bibr" rid="B9">Dietterich, 2000</xref>). Hence, time-consuming hyperoptimization tunings and validation checks are not of utmost importance.</p>
<p>In contrast to the good predictions of boosting models, standard linear regression methods like least-angle regression (LARS) or Lasso regression (LAS) show a rather poor performance (<xref ref-type="sec" rid="s12">Supplementary Material</xref>). Interestingly, standard artificial neural networks (ANNs) also show a low predictive accuracy. It can be argued that further optimization of hyperparameters may improve the results. All other approaches show a reasonable or even good performance with nRMSE values between 0.22 and 0.26. As already discussed, the best performance can be observed for advanced decision tree-based ensemble models, which reveals the underlying influence of non-linear contributions.</p>
<p>The corresponding predicted and experimental values for ln&#x2009;<italic>y</italic> from the HGB method are presented in <xref ref-type="fig" rid="F2">Figure 2</xref>. We used a k-fold cross validation approach, where the training data consists of each N-1 data samples with one test data point from the total data set including N samples (<xref ref-type="bibr" rid="B12">Gareth et al., 2013</xref>; <xref ref-type="bibr" rid="B53">Wong, 2015</xref>). In more detail, each model <italic>M</italic>
<sub>
<italic>j</italic>
</sub> is trained with the feature data including the samples <italic>X</italic> &#x3d; [<italic>x</italic>
<sub>1</sub>, <italic>x</italic>
<sub>2</sub>, &#x2026;, <italic>x</italic>
<sub>
<italic>j</italic>&#x2212;1</sub>, <italic>x</italic>
<sub>
<italic>j</italic>&#x2b;1</sub>, &#x2026;, <italic>x</italic>
<sub>
<italic>N</italic>
</sub>] and the associated target data <italic>Y</italic> &#x3d; [<italic>y</italic>
<sub>1</sub>, <italic>y</italic>
<sub>2</sub>, &#x2026;, <italic>y</italic>
<sub>
<italic>j</italic>&#x2212;1</sub>, <italic>y</italic>
<sub>
<italic>j</italic>&#x2b;1</sub>, &#x2026;, <italic>y</italic>
<sub>
<italic>N</italic>
</sub>] and predicts the corresponding test target sample <italic>Y</italic>
<sub>
<italic>j</italic>
</sub> from <italic>X</italic>
<sub>
<italic>j</italic>
</sub>. As can be seen, <italic>X</italic>
<sub>
<italic>j</italic>
</sub> and <italic>Y</italic>
<sub>
<italic>j</italic>
</sub> are not part of the corresponding training data for model <italic>M</italic>
<sub>
<italic>j</italic>
</sub>. This procedure is repeated for all <italic>N</italic> models <italic>M</italic>
<sub>1</sub>, <italic>M</italic>
<sub>2</sub>, &#x2026;, <italic>M</italic>
<sub>
<italic>N</italic>
</sub> and the corresponding predictions. Notably, a good agreement between the predicted and experimental values for all 0 &#x3e; ln&#x2009;<italic>y</italic> &#x3e; &#x2212; 1.6 becomes obvious. All values are located within the experimental standard deviation <italic>&#x3c3;</italic>(ln&#x2009;<italic>y</italic>) as denoted by the dashed blue lines. The corresponding good agreement can be rationalized by the large amount of training data in this range. Notable deviations in terms of one outlier with nRMSE <inline-formula id="inf6">
<mml:math id="m11">
<mml:mo>&#x3e;</mml:mo>
<mml:mn>1</mml:mn>
</mml:math>
</inline-formula> can only be observed for ln&#x2009;<italic>y</italic> &#x2264; &#x2212;1.9 due to missing reference training data. Moreover, the good predictive accuracy is also highlighted by the reasonable value <italic>R</italic>
<sup>2</sup> &#x3d; 0.75 for the coefficient of determination. As shown in the supplementary material, similar conclusions can also be drawn for the prediction of a test data set of 38 samples using an 80/20 ratio split between training and test data. The RMSE values are of comparable quality. In consequence, it can be concluded that solubilization mechanisms and final yield values can be predicted with acceptable accuracy using machine learning approaches.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Predicted (HGB) and experimental values (Exp) for the logarithm of the yield ln&#x2009;<italic>y</italic> from the optimized HGB method. The straight line has a slope of one and the corresponding blue dotted lines reveal the experimental standard deviation <italic>&#x3c3;</italic>(ln&#x2009;<italic>y</italic>). The lighter blue shaded regions demark a standard deviation of 2<italic>&#x3c3;</italic>(ln&#x2009;<italic>y</italic>).</p>
</caption>
<graphic xlink:href="fceng-05-1227620-g002.tif"/>
</fig>
</sec>
<sec id="s4-2">
<title>4.2 Feature importance analysis and explainable machine learning</title>
<p>Due to the reasonable predictive accuracy of certain machine learning models, it can be concluded that the evaluation of feature importances might provide some further insights into the underlying mechanisms of solubilization. In terms of such considerations, we evaluated all training data with the HGB and the ET method in accordance with the SHAP values. The results for the HGB model are shown in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>. As expected, the accuracies of the HGB and the ET model for training data with <italic>R</italic>
<sup>2</sup> &#x3d; 0.91 (HGB), <italic>R</italic>
<sup>2</sup> &#x3d; 0.99 (ET), nRMSE &#x3d; 0.30 (HGB) and nRMSE &#x3d; 0.10 (ET) are higher when compared to the predictions from the k-fold cross-validation approach which rationalizes the validity of the following feature analysis.</p>
<p>As a first step, the corresponding Gini feature importance values as calculated from the ET method are presented in <xref ref-type="fig" rid="F3">Figure 3</xref>. As can be seen, the results confirm the dominant influence of the total protein concentration <italic>c</italic>
<sub>
<italic>t</italic>
</sub> in accordance with the correlation coefficients shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, followed by the urea concentration <italic>c</italic>
<sub>
<italic>U</italic>
</sub> and the fraction of protonated titrable groups <italic>q</italic>(pH-pI). With regard to rather low values, the DTT and guanidinium hydrochloride concentrations have negligible influences. It should be noted that the protein&#x2019;s native structure is characterized by only a very small number of disulfide bonds. This property explains the vanishing influence as well as the relatively low concentration of DTT as a reducing agent in this context. Furthermore, it is well-known that guanidinium hydrochloride is a very potent destabilizing agent. However, in combination with urea it shows a rather complex aggregation behavior around the protein (<xref ref-type="bibr" rid="B28">Oprzeska-Zingrebe and Smiatek, 2021</xref>, <xref ref-type="bibr" rid="B26">2022</xref>; <xref ref-type="bibr" rid="B23">Miranda-Quintana and Smiatek, 2021</xref>). Accordingly, higher concentrations of guanidinium hydrochloride could probably exert a stronger influence in terms of feature importance. However, it should be noted that this effect is represented here by the somewhat milder denaturation conditions in the presence of urea. In consequence, it can be concluded that the low feature importance of the guanidinium hydrochloride concentration can be rationalized by its corresponding low concentration. These results are also confirmed by the values from the ELI5 analysis which are shown in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Gini feature importance values for predictions of the extra trees (ET) model for the protein yield ln&#x2009;<italic>y</italic>.</p>
</caption>
<graphic xlink:href="fceng-05-1227620-g003.tif"/>
</fig>
<p>Similar conclusions can also be drawn in terms of example decision pathways for an extra trees model with a maximum tree depth of 4 as shown in the <xref ref-type="sec" rid="s12">Supplementary Material</xref>. It becomes obvious that the first decision criterion defines a total protein concentration of ln&#x2009;<italic>c</italic>
<sub>
<italic>t</italic>
</sub> &#x3d; 1.39. This value separates between high and low yield branches whereas further criteria based on individual values of <italic>q</italic>(pH-pI) and <italic>c</italic>
<sub>
<italic>U</italic>
</sub> are of minor importance and thus lead to different subclassifications.</p>
<p>The corresponding results for the SHAP value analysis with regard to the final yield values ln&#x2009;<italic>y</italic> from a HGB model are shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. The color-coded beeswarm plot with the associated contributions of SHAP values are depicted in the top panel. The beeswarm plot aims to provide an information-dense summary for the most important features in terms of model predictions for each data instance. Each data instance is represented by a single dot on each feature row. The horizontal position of the dot is determined by the SHAP value of the corresponding feature. In addition, the color illustrates the original value of a feature. Thus, the beeswarm plot highlights the importance or contribution of the features for the whole dataset. As already discussed, it becomes obvious that the largest influence is represented by the total protein concentration, followed by the urea concentration and the amount of protonated titrable groups. Moreover, it can be seen that the total protein concentration has a negative influence on the final yield value. Thus, low values of <italic>c</italic>
<sub>
<italic>t</italic>
</sub> lead to positive SHAP values and <italic>vice versa</italic>. Such findings are unique for the total protein concentration while all other factors show a positive correlation. In terms of a molecular understanding, it has to be noted that the solubilization process itself is a rather complex process due to contributions from interfacial phenomena, the composition of the solution as well as further intermolecular mechanisms. Moreover, the individual features and their contributions on the model predictions are ordered in terms of their importance from top to bottom. This ordering is calculated from the mean absolute SHAP value for each feature. In general, such an approach is strongly determined by the broad average impact of the feature while rare maximum or minimum values do not contribute significantly. The corresponding mean absolute SHAP values are presented in the bottom of <xref ref-type="fig" rid="F4">Figure 4</xref> for reasons of consistency. It clearly can be seen that the total protein concentration dominates the feature importance, followed by the urea concentration and <italic>q</italic>(pH-pI). The contributions from <italic>c</italic>
<sub>
<italic>G</italic>
</sub> and <italic>c</italic>
<sub>
<italic>D</italic>
</sub> are of minor importance in agreement with the <italic>p</italic> values from the previous correlation analysis.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Color-coded heatmap for a beeswarm plot of individual SHAP values for the optimized HGB model (top panel) and mean absolute SHAP values as calculated for the target value ln&#x2009;<italic>y</italic> (bottom panel).</p>
</caption>
<graphic xlink:href="fceng-05-1227620-g004.tif"/>
</fig>
<p>For a more detailed analysis, we present the corresponding SHAP value dependency plots for the total protein concentration in <xref ref-type="fig" rid="F5">Figure 5</xref>. In general, SHAP dependency plots as shown in <xref ref-type="fig" rid="F5">Figures 5</xref>, <xref ref-type="fig" rid="F6">6</xref> highlight the effect of a single feature on the model predictions. Each dot corresponds to a single prediction from the dataset and the position on the horizontal axis denotes the corresponding actual value of the feature. In contrast, the vertical axis shows the SHAP value for that feature and its impact on the prediction. In general, a slope along the points in the dependency plots makes it possible to identify positive, negative or no correlations with the corresponding target parameter. It clearly can be seen in <xref ref-type="fig" rid="F5">Figure 5</xref> that the aforementioned negative correlation (<xref ref-type="fig" rid="F4">Figure 4</xref>) can be interpreted as an inverse relation between the total protein concentration and the final yield value ln&#x2009;<italic>y</italic>. Hence, for increasing total protein concentrations, one can observe a linear decrease of the SHAP values. The corresponding mechanism can be explained as follows. In general, the yield is defined by<disp-formula id="e6">
<mml:math id="m12">
<mml:mi>y</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(6)</label>
</disp-formula>which corresponds to the ratio between the number of free (solubilized) protein chains <italic>N</italic>
<sub>
<italic>f</italic>
</sub> and the total number of chains <italic>N</italic>
<sub>
<italic>t</italic>
</sub>. With regard to the fact that inclusion bodies show a rather poor solubility, one can assume that individual inclusion bodies aggregate in order to form larger moieties. Hence, the dissolution of free chains mainly occurs at the solvent accessible surface area of these compounds in agreement with previous assumptions (<xref ref-type="bibr" rid="B49">Walther et al., 2013</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>SHAP value dependency plot between the total protein concentration ln&#x2009;<italic>c</italic>
<sub>
<italic>t</italic>
</sub> and the final yield value ln&#x2009;<italic>y</italic> as calculated from the optimized HGB method.</p>
</caption>
<graphic xlink:href="fceng-05-1227620-g005.tif"/>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>SHAP value dependency plot between the urea concentration <italic>c</italic>
<sub>
<italic>U</italic>
</sub> and the final yield value ln&#x2009;<italic>y</italic> from the optimized HGB method.</p>
</caption>
<graphic xlink:href="fceng-05-1227620-g006.tif"/>
</fig>
<p>The number of free protein chains can be written as <inline-formula id="inf7">
<mml:math id="m13">
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi>d</mml:mi>
</mml:math>
</inline-formula> with the local concentration of proteins in the aggregated inclusion body <italic>c</italic>
<sub>
<italic>p</italic>
</sub>, the corresponding spherical radius <italic>R</italic>
<sub>
<italic>I</italic>
</sub> of the aggregate and the penetration or dissolution depth <italic>d</italic>. Moreover, it is assumed that the inner region of the compound remains unaffected by dissolution such that <inline-formula id="inf8">
<mml:math id="m14">
<mml:msub>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2248;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> with <italic>R</italic>
<sub>
<italic>I</italic>
</sub> &#x226B; <italic>d</italic>. In consequence, we can rewrite Eq. <xref ref-type="disp-formula" rid="e6">6</xref> according to<disp-formula id="e7">
<mml:math id="m15">
<mml:mi>y</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x223c;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(7)</label>
</disp-formula>which reveals that aggregated inclusion bodies with larger radii result in lower yield values. Furthermore, it is assumed that <italic>d</italic> is constant after a fixed time interval. With regard to the fact that the radius scales as <inline-formula id="inf9">
<mml:math id="m16">
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>/</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>, one obtains after insertion into Eq. <xref ref-type="disp-formula" rid="e7">7</xref> the following relation<disp-formula id="e8">
<mml:math id="m17">
<mml:mi>y</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>d</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x223c;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:msubsup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(8)</label>
</disp-formula>which clearly shows that the final yield value depends inversely on the total protein concentration <italic>N</italic>
<sub>
<italic>t</italic>
</sub> &#x223c; <italic>c</italic>
<sub>
<italic>t</italic>
</sub> as represented by ln&#x2009;<italic>y</italic> &#x223c; &#x2212; ln(<italic>c</italic>
<sub>
<italic>t</italic>
</sub>) in agreement with <xref ref-type="fig" rid="F5">Figure 5</xref>. Here, we assume that <italic>d</italic> is constant after a fixed time interval and that aggregates show a highest packing fraction leading to fixed values of <italic>c</italic>
<sub>
<italic>p</italic>
</sub>.</p>
<p>Moreover, it is well known that the presence of urea induces a structural destabilization of proteins. Recent articles rationalized this phenomenon with preferential binding and exclusion mechanisms (<xref ref-type="bibr" rid="B39">Smiatek, 2017</xref>; <xref ref-type="bibr" rid="B27">Oprzeska-Zingrebe and Smiatek, 2018</xref>; <xref ref-type="bibr" rid="B23">Miranda-Quintana and Smiatek, 2021</xref>). As can be seen in <xref ref-type="fig" rid="F6">Figure 6</xref>, one observes increasing SHAP values and thus a positive trend of ln&#x2009;<italic>y</italic> for increasing urea concentrations. Moreover, it can be seen that for urea concentrations larger than 8&#xa0;mol/L, a saturation behavior becomes evident. As an explanation, we refer to co-solute induced destabilization effects as discussed in <xref ref-type="bibr" rid="B27">Oprzeska-Zingrebe and Smiatek (2018)</xref>; <xref ref-type="bibr" rid="B39">Smiatek (2017)</xref>. In more detail, it is assumed that the ratio of destabilized and stable proteins <italic>K</italic>
<sub>
<italic>cs</italic>
</sub> can be written as<disp-formula id="e9">
<mml:math id="m18">
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>33</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>&#x3bd;</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>23</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(9)</label>
</disp-formula>with the derivative of the thermodynamic activity <italic>a</italic>
<sub>33</sub>, the difference in the preferential binding coefficients &#x394;<italic>&#x3bd;</italic>
<sub>23</sub> and the ratio of destabilized and stable proteins <italic>K</italic>
<sub>0</sub> in absence of any co-solutes (<xref ref-type="bibr" rid="B39">Smiatek, 2017</xref>; <xref ref-type="bibr" rid="B27">Oprzeska-Zingrebe and Smiatek, 2018</xref>; <xref ref-type="bibr" rid="B41">Smiatek et al., 2018</xref>). For certain proteins, it was discussed that the partial molar volumes and the solvent-accessible surface area upon unfolding do not change significantly according to &#x394;<italic>&#x3bd;</italic>
<sub>23</sub> &#x3d; <italic>c</italic>
<sub>
<italic>U</italic>
</sub>&#x394;<italic>G</italic>
<sub>23</sub> (<xref ref-type="bibr" rid="B41">Smiatek et al., 2018</xref>; <xref ref-type="bibr" rid="B18">Krishnamoorthy et al., 2018a</xref>), such that the previous relation can be approximated by<disp-formula id="e10">
<mml:math id="m19">
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2248;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>K</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>33</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>23</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(10)</label>
</disp-formula>with the difference in the Kirkwood-Buff integrals &#x394;<italic>G</italic>
<sub>23</sub> (<xref ref-type="bibr" rid="B27">Oprzeska-Zingrebe and Smiatek, 2018</xref>). In addition to stable and destabilized proteins, one can apply the same relation for the fraction of dissolved and bound proteins (<xref ref-type="bibr" rid="B41">Smiatek et al., 2018</xref>). Under the assumption that <italic>N</italic>
<sub>
<italic>f</italic>
</sub> &#x226A; <italic>N</italic>
<sub>
<italic>t</italic>
</sub>, it thus follows that <italic>K</italic>
<sub>
<italic>cs</italic>
</sub> &#x2248; <italic>y</italic> and <italic>K</italic>
<sub>0</sub> &#x2248; <italic>y</italic>
<sub>0</sub>. For high co-solute concentrations, it was further discussed that <italic>a</italic>
<sub>33</sub> &#x2192; 0 due to stability conditions (<xref ref-type="bibr" rid="B19">Krishnamoorthy et al., 2018b</xref>). Hence, the exponential factor in Eq. <xref ref-type="disp-formula" rid="e10">10</xref> can be linearized according to<disp-formula id="e11">
<mml:math id="m20">
<mml:mi>y</mml:mi>
<mml:mo>&#x2248;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>33</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>23</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:math>
<label>(11)</label>
</disp-formula>which highlights the linear contribution <italic>y</italic> &#x223c; <italic>c</italic>
<sub>
<italic>U</italic>
</sub> between the urea concentration and the final yield value in good agreement with <xref ref-type="fig" rid="F6">Figure 6</xref>.</p>
<p>In our previous discussion, it was also highlighted that the fraction of protonated titrable groups has a positive influence on the final yield values. In accordance with Eq. <xref ref-type="disp-formula" rid="e2">2</xref>, it becomes clear that <italic>q</italic>(pH &#x2212; pI) decreases with increasing pH values. Moreover, it is known that the isoelectric point with pI &#x3d; 8.4 corresponds to a net-uncharged protein. The clear distinction between pH values below and above the pI value in terms of the SHAP dependency plot can be seen in <xref ref-type="fig" rid="F7">Figure 7</xref>. In more detail, <italic>q</italic>(pH &#x2212; pI) &#x3c; 0.5 as represented by pH <inline-formula id="inf10">
<mml:math id="m21">
<mml:mo>&#x3e;</mml:mo>
<mml:mn>8.4</mml:mn>
</mml:math>
</inline-formula> results in negative SHAP values and thus lower yields and <italic>vice versa</italic>. The reason for this observation might be related to electrostatic repulsion between the charged groups of the protein (<xref ref-type="bibr" rid="B42">Smiatek et al., 2020</xref>). Thus, we assume that for high or low pH values, depending on the net amount of basic or acidic groups in the protein, the electrostatic repulsion fosters the dissolution of the protein in order to minimize non-favorable interactions. As also illustrated in <xref ref-type="fig" rid="F7">Figure 7</xref>, significant contributions for a net-uncharged protein at pH &#x3d; 8.4 leading to <italic>q</italic>(pH &#x2212; pI) &#x3d; 0.5 are absent.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Dependency plot of SHAP values for the fraction of titrable groups <italic>q</italic>(pH &#x2212; pI) and the final yield value ln&#x2009;<italic>y</italic> for the optimized HGB model.</p>
</caption>
<graphic xlink:href="fceng-05-1227620-g007.tif"/>
</fig>
<p>The combination of the previous considerations in terms of Eqs <xref ref-type="disp-formula" rid="e2">2</xref>, <xref ref-type="disp-formula" rid="e8">8</xref>, <xref ref-type="disp-formula" rid="e11">11</xref> results in<disp-formula id="e12">
<mml:math id="m22">
<mml:mi>y</mml:mi>
<mml:mo>&#x223c;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>y</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>a</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>33</mml:mn>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>U</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi mathvariant="normal">&#x394;</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>G</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>23</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
<mml:mfrac>
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mi mathvariant="normal">H</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="normal">p</mml:mi>
<mml:mi mathvariant="normal">I</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mi>c</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>t</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>/</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(12)</label>
</disp-formula>which condenses all previous results in one analytic expression. In order to assess the validity of Eq. <xref ref-type="disp-formula" rid="e12">12</xref>, we plotted all experimental data points onto a master curve as shown in <xref ref-type="fig" rid="F8">Figure 8</xref>. In order to minimize fluctuating electrostatic repulsions, we chose nearly constant and high pH values with pH <inline-formula id="inf11">
<mml:math id="m23">
<mml:mo>&#x2265;</mml:mo>
<mml:mn>10</mml:mn>
</mml:math>
</inline-formula> as well as high urea concentrations with <italic>c</italic>
<sub>
<italic>U</italic>
</sub> &#x2265; 7.55&#xa0;mol/L. The corresponding 27 data points are then scaled in terms of a normal distribution and compared to Eq. <xref ref-type="disp-formula" rid="e12">12</xref>. As can be seen in <xref ref-type="fig" rid="F8">Figure 8</xref>, the theoretically predicted values nicely follow the proposed scaling relation. It has to be noted that the corresponding influences for different proteins and modalities upon solubilization may vary, such that the obtained results are not generally transferable without proper assessment. Nevertheless, we have provided strong evidence that explainable machine learning approaches provide deeper insights into the molecular mechanisms and correlations of solubilization processes.</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Theoretical estimates from Eq. <xref ref-type="disp-formula" rid="e12">12</xref> and actual values: The corresponding data points were selected for high urea concentrations and high pH values. All values are scaled in terms of normal distributions.</p>
</caption>
<graphic xlink:href="fceng-05-1227620-g008.tif"/>
</fig>
</sec>
</sec>
<sec id="s5">
<title>5 Discussion of results</title>
<p>In the previous sections, we developed a machine learning model to predict yield values based on some input features. We were able to show that an optimized Histogram Gradient Boosting (HGB) model enables the most accurate predictions. The underlying data for training and testing of the model were obtained from a Design of Experiments study. Overall, the model predictions show sufficient accuracy. The general trends are reproduced despite some minor inaccuracies for certain outliers. Previous statistical analysis of the experimental data already showed that the correlation coefficients between the feature and the target value do not reveal a particularly significant correlation. Accordingly, we could already assume in advance that the models would only allow meaningful predictions to a certain extent. To compensate for this drawback, we applied some methods of explainable machine learning. Although such approaches cannot increase the accuracy, the results provide fundamental insights into the feature importance for the model predictions. We observed that in particular the total protein concentration as well as the urea concentration and the degree of charge of the protein due to the adjusted pH value are of decisive importance for the yield value predictions. The influence of other co-solutes is negligible due to their low concentration. As part of the explainable machine learning approach, we were therefore able to examine data-driven models on a scientific basis. Accordingly, we were able to set up scientific hypotheses for the underlying mechanisms (<xref ref-type="bibr" rid="B43">Smiatek et al., 2021b</xref>), which provided a rationale for the observed feature importance values. These scientific hypotheses were then merged into Eq. <xref ref-type="disp-formula" rid="e12">12</xref>, which allows for a formal mathematical description of the influence of various parameters on yield values.</p>
<p>In general, it cannot be assumed that a single machine learning model is suitable to predict different yields for different proteins in different solubilization procedures. The charges and the interactions with co-solutes sometimes differ so significantly that the trends can even reverse. Accordingly, the general transferability of machine learning models is usually not given, so that the gain in knowledge is often very small. Consequently, experimental data must be recorded again for new proteins and their inclusion bodies, so that a reduction in laboratory activities is usually not achieved. However, basic principles can be recognized by means of the analytical equation and the corresponding scientific hypotheses. These principles may differ slightly for individual proteins, but can now be estimated in advance using the analytical description. Hence, important influencing factors can be postulated, especially when planning the experiments. We were also able to show that the purely data-driven models can be subjected to scientific hypothesis formation, which makes the results more robust and more straightforward to understand.</p>
</sec>
<sec id="s6">
<title>6 Summary and conclusion</title>
<p>We studied the potential influences of certain process parameters on the solubilization of inclusion bodies in terms of explainable machine learning approaches. The corresponding final yield values after solubilization are crucially affected by the total protein concentration, the urea concentration and the amount of protonated titrable groups as affected by the actual pH value. The models with highest predictive accuracies are boosting ensemble-based approaches with nRMSE values around 0.19. The corresponding SHAP values show that the total protein concentration, the actual urea concentration as well as the fraction of protonated titrable groups dominate the final yield value. All other contributions like the DTT and guanidinium hydrochloride concentration are of minor importance. A more detailed analysis of SHAP dependencies also highlights an inverse relation between the total protein concentration and the yield values in contrast to the urea concentration and the amount of protonated titrable groups.</p>
<p>Based on these explainable machine learning observations, we proposed an analytic expression to rationalize these findings. The inverse relation for the total protein concentration can be understood with regard to surface solvation effects which inversely scale with the total protein concentration. The growing SHAP values for the urea concentration can be understood by the preferential binding and exclusion mechanisms for co-solutes. The direct interaction of urea molecules with the inclusion body thus favors dissolution mechanisms and hence larger yield values. Finally, larger fractions of protonated titrable groups result in stronger electrostatic repulsion effects between the proteins which facilitate the dissolution of the inclusion body. The corresponding assumptions can be summarized in terms of an analytic expression which shows a reasonable agreement with the experimental data. Although it has to be noted that the corresponding dependencies crucially rely on the nature of the protein and the inclusion bodies, our approach demonstrates a meaningful pathway towards a deeper understanding and optimization of solubilization conditions. It can be assumed that the underlying mechanisms vary through the individual contributions of the influencing factors for different proteins. Typical examples would be the influence of the pH value on different pI values of proteins as well as the importance of reducing reagents such as DTT at different amounts of disulfide bonds. Nevertheless, it can be expected that the presented machine learning models in combination with feature analysis can make these slightly varying relationships interpretable with similar accuracy as in this study. Accordingly, our work highlights an exemplary and generic approach to understand in detail the phenomena of solubilization for the individual proteins and solutions. 
The use of explainable machine learning approaches thus allows us not only to develop models with high predictive accuracy but also to gain deeper insights into the underlying correlations of the mechanisms. It has to be mentioned that the use of explainable machine learning does not increase the prediction accuracy of the model. However, there is the possibility that the results of non-parametric models can be assessed and evaluated in their justification with regard to individual feature correlations. This procedure corresponds to the scientific method, so that the results of purely data-driven models can be translated into scientific hypotheses and made correspondingly falsifiable (<xref ref-type="bibr" rid="B43">Smiatek et al., 2021b</xref>). In this context, the use of explainable machine learning has allowed us to derive an analytical equation (Eq. <xref ref-type="disp-formula" rid="e12">12</xref>). As we have shown, this analytical equation can also be derived separately from fundamental principles, with the SHAP analysis being able to contribute profitably to the identification of these mechanisms. The advantage of such an equation lies in its falsifiability and its potential for transfer to other projects. From this it can be assumed that new projects can be started with prior knowledge, so that material can be saved and the work can be reduced. In consequence, explainable machine learning provides a deeper process understanding and knowledge which is beneficial for different unit operations in biopharmaceutical manufacturing.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The data analyzed in this study is subject to the following licenses/restrictions: <email>jens.smiatek@boehringer-ingelheim.com</email>. Requests to access these datasets should be directed to <email>cornelia.walther@boehringer-ingelheim.com</email>.</p>
</sec>
<sec id="s8">
<title>Author contributions</title>
<p>JS, CW, and MM conducted the study and analyzed the results. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s9">
<title>Funding</title>
<p>This research was funded by Boehringer Ingelheim Pharma GmbH &#x26; Co. KG and Boehringer Ingelheim RCV GmbH &#x26; Co. KG.</p>
</sec>
<ack>
<p>The authors thank Christina Yassouridis, Hermann Schuchnigg, I-Ting Ho, Milena Matysik, Liliana Montano Herrera, Ralph Guderlei, Michael Laussegger, and Bernhard Schrantz for valuable discussions.</p>
</ack>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>Authors CW, MM, AF, A-LT, MV, and CB were employed by Boehringer Ingelheim RCV GmbH &#x26; Co. KG. Author A-LT was employed by Baxalta Innovations GmbH, Takeda Group. Authors AJ, EB, and JS were employed by Boehringer Ingelheim Pharma GmbH &#x26; Co. KG.</p>
<p>The remaining authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s12">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fceng.2023.1227620/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fceng.2023.1227620/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet7.PDF" id="SM1" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet2.PDF" id="SM2" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet4.PDF" id="SM3" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet6.PDF" id="SM4" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet9.PDF" id="SM5" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet3.PDF" id="SM6" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet5.PDF" id="SM7" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="Image1.PNG" id="SM8" mimetype="image/PNG" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet8.PDF" id="SM9" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet10.PDF" id="SM10" mimetype="application/PDF" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Agarwal</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Das</surname>
<given-names>S.</given-names>
</name>
</person-group> &#x201c;<article-title>Interpretable machine learning tools: A survey</article-title>,&#x201d; in <conf-name>Proceedings of the 2020 IEEE Symposium Series on Computational Intelligence (SSCI)</conf-name>, <conf-loc>Canberra, ACT, Australia</conf-loc>, <conf-date>December 2020</conf-date>, <fpage>1528</fpage>&#x2013;<lpage>1534</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Belle</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Papantonis</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Principles and practice of explainable machine learning</article-title>. <source>Front. Big Data</source> <volume>4</volume>, <fpage>688969</fpage>. <pub-id pub-id-type="doi">10.3389/fdata.2021.688969</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Blaser</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Fryzlewicz</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Random rotation ensembles</article-title>. <source>J. Mach. Learn. Res.</source> <volume>17</volume>, <fpage>126</fpage>&#x2013;<lpage>151</lpage>. <pub-id pub-id-type="doi">10.5555/2946645.2946649</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bowden</surname>
<given-names>G. A.</given-names>
</name>
<name>
<surname>Paredes</surname>
<given-names>A. M.</given-names>
</name>
<name>
<surname>Georgiou</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>1991</year>). <article-title>Structure and morphology of protein inclusion bodies in Escherichia coli</article-title>. <source>Bio/Technology</source> <volume>9</volume>, <fpage>725</fpage>&#x2013;<lpage>730</lpage>. <pub-id pub-id-type="doi">10.1038/nbt0891-725</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Breiman</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Friedman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Stone</surname>
<given-names>C. J.</given-names>
</name>
<name>
<surname>Olshen</surname>
<given-names>R. A.</given-names>
</name>
</person-group> (<year>1984</year>). <source>Classification and regression trees</source>. <publisher-loc>Boca Raton, Florida, United States</publisher-loc>: <publisher-name>CRC Press</publisher-name>.</citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Brownlee</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2016</year>). <source>XGBoost with python: Gradient boosted trees with XGBoost and scikit-learn</source>. <publisher-loc>Vermont, Victoria, Australia</publisher-loc>: <publisher-name>Machine Learning Mastery</publisher-name>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Burkart</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Huber</surname>
<given-names>M. F.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A survey on the explainability of supervised machine learning</article-title>. <source>J. Art. Intell. Res.</source> <volume>70</volume>, <fpage>245</fpage>&#x2013;<lpage>317</lpage>. <pub-id pub-id-type="doi">10.1613/jair.1.12228</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Clark</surname>
<given-names>E. D. B.</given-names>
</name>
</person-group> (<year>1998</year>). <article-title>Refolding of recombinant proteins</article-title>. <source>Curr. Opin. Biotechnol.</source> <volume>9</volume>, <fpage>157</fpage>&#x2013;<lpage>163</lpage>. <pub-id pub-id-type="doi">10.1016/s0958-1669(98)80109-2</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dietterich</surname>
<given-names>T. G.</given-names>
</name>
</person-group> (<year>2000</year>). &#x201c;<article-title>Ensemble methods in machine learning</article-title>,&#x201d; in <source>International workshop on multiple classifier systems</source> (<publisher-loc>Berlin, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>15</lpage>.</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname>
<given-names>D.-C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.-J.</given-names>
</name>
<name>
<surname>Mangalathu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Implementing ensemble learning methods to predict the shear strength of RC deep beams with/without web reinforcements</article-title>. <source>Eng. Struct.</source> <volume>235</volume>, <fpage>111979</fpage>. <pub-id pub-id-type="doi">10.1016/j.engstruct.2021.111979</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Freydell</surname>
<given-names>E. J.</given-names>
</name>
<name>
<surname>Ottens</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Eppink</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>van Dedem</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>van der Wielen</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>Efficient solubilization of inclusion bodies</article-title>. <source>Biotechnol. J.</source> <volume>2</volume>, <fpage>678</fpage>&#x2013;<lpage>684</lpage>. <pub-id pub-id-type="doi">10.1002/biot.200700046</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gareth</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Daniela</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Trevor</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Robert</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2013</year>). <source>An introduction to statistical learning: With applications in R</source>. <publisher-loc>Berlin, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>.</citation>
</ref>
<ref id="B13">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Grinsztajn</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Oyallon</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Why do tree-based models still outperform deep learning on tabular data?</article-title> <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2207.08815">https://arxiv.org/abs/2207.08815</ext-link>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gunning</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Stefik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Miller</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Stumpf</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>G.-Z.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>XAI&#x2014;Explainable artificial intelligence</article-title>. <source>Sci. Robot.</source> <volume>4</volume>, <fpage>7120</fpage>. <pub-id pub-id-type="doi">10.1126/scirobotics.aay7120</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Harris</surname>
<given-names>C. R.</given-names>
</name>
<name>
<surname>Millman</surname>
<given-names>K. J.</given-names>
</name>
<name>
<surname>van der Walt</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Gommers</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Virtanen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Cournapeau</surname>
<given-names>D.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Array programming with NumPy</article-title>. <source>Nature</source> <volume>585</volume>, <fpage>357</fpage>&#x2013;<lpage>362</lpage>. <pub-id pub-id-type="doi">10.1038/s41586-020-2649-2</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Holzinger</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kieseberg</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Weippl</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Tjoa</surname>
<given-names>A. M.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Current advances, trends and challenges of machine learning and knowledge extraction: From machine learning to explainable AI</article-title>,&#x201d; in <source>International cross-domain conference for machine learning and knowledge extraction</source> (<publisher-loc>Berlin, Germany</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kailkhura</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Gallagher</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Hiszpanski</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Reliable and explainable machine-learning methods for accelerated material discovery</article-title>. <source>NPJ Comput. Mat.</source> <volume>5</volume>, <fpage>108</fpage>&#x2013;<lpage>109</lpage>. <pub-id pub-id-type="doi">10.1038/s41524-019-0248-2</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krishnamoorthy</surname>
<given-names>A. N.</given-names>
</name>
<name>
<surname>Holm</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018a</year>). <article-title>Influence of cosolutes on chemical equilibrium: A Kirkwood&#x2013;Buff theory for ion pair association&#x2013;dissociation processes in ternary electrolyte solutions</article-title>. <source>J. Phys. Chem. C</source> <volume>122</volume>, <fpage>10293</fpage>&#x2013;<lpage>10302</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jpcc.7b12255</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Krishnamoorthy</surname>
<given-names>A. N.</given-names>
</name>
<name>
<surname>Oldiges</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Winter</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Heuer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Cekic-Laskovic</surname>
<given-names>I.</given-names>
</name>
<name>
<surname>Holm</surname>
<given-names>C.</given-names>
</name>
<etal/>
</person-group> (<year>2018b</year>). <article-title>Electrolyte solvents for high voltage lithium ion batteries: Ion correlation and specific anion effects in adiponitrile</article-title>. <source>Phys. Chem. Chem. Phys.</source> <volume>20</volume>, <fpage>25701</fpage>&#x2013;<lpage>25715</lpage>. <pub-id pub-id-type="doi">10.1039/c8cp04102d</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Landsgesell</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Holm</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Wang&#x2013;Landau reaction ensemble method: Simulation of weak polyelectrolytes and general acid&#x2013;base reactions</article-title>. <source>J. Chem. Theo. Comput.</source> <volume>13</volume>, <fpage>852</fpage>&#x2013;<lpage>862</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jctc.6b00791</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Linardatos</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Papastefanopoulos</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Kotsiantis</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Explainable AI: A review of machine learning interpretability methods</article-title>. <source>Entropy</source> <volume>23</volume>, <fpage>18</fpage>. <pub-id pub-id-type="doi">10.3390/e23010018</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lundberg</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>S.-I.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>A unified approach to interpreting model predictions</article-title>. <source>Adv. Neural. Inf. Proc. Sys.</source> <volume>30</volume>. <pub-id pub-id-type="doi">10.48550/arXiv.1705.07874</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Miranda-Quintana</surname>
<given-names>R. A.</given-names>
</name>
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Electronic properties of protein destabilizers and stabilizers: Implications for preferential binding and exclusion mechanisms</article-title>. <source>J. Phys. Chem. B</source> <volume>125</volume>, <fpage>11857</fpage>&#x2013;<lpage>11868</lpage>. <pub-id pub-id-type="doi">10.1021/acs.jpcb.1c06295</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Molnar</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Casalicchio</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Bischl</surname>
<given-names>B.</given-names>
</name>
</person-group> &#x201c;<article-title>Interpretable machine learning&#x2013;a brief history, state-of-the-art and challenges</article-title>,&#x201d; in <conf-name>Proceedings of the Joint European Conference on Machine Learning and Knowledge Discovery in Databases</conf-name>, <conf-loc>Ghent, Belgium</conf-loc>, <conf-date>September 2020</conf-date>, <fpage>417</fpage>&#x2013;<lpage>431</lpage>.</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Montano Herrera</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Eilert</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>I.-T.</given-names>
</name>
<name>
<surname>Matysik</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Laussegger</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Guderlei</surname>
<given-names>R.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Holistic process models: A Bayesian predictive ensemble method for single and coupled unit operation models</article-title>. <source>Processes</source> <volume>10</volume>, <fpage>662</fpage>. <pub-id pub-id-type="doi">10.3390/pr10040662</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oprzeska-Zingrebe</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Basket-type G-quadruplex with two tetrads in the presence of TMAO and urea: A molecular dynamics study</article-title>. <source>J. Mol. Struct.</source> <volume>1274</volume>, <fpage>134375</fpage>. <pub-id pub-id-type="doi">10.1016/j.molstruc.2022.134375</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oprzeska-Zingrebe</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Aqueous ionic liquids in comparison with standard co-solutes</article-title>. <source>Biophys. Rev.</source> <volume>10</volume>, <fpage>809</fpage>&#x2013;<lpage>824</lpage>. <pub-id pub-id-type="doi">10.1007/s12551-018-0414-7</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oprzeska-Zingrebe</surname>
<given-names>E. A.</given-names>
</name>
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Interactions of a DNA G-quadruplex with TMAO and urea: A molecular dynamics study on co-solute compensation mechanisms</article-title>. <source>Phys. Chem. Chem. Phys.</source> <volume>23</volume>, <fpage>1254</fpage>&#x2013;<lpage>1264</lpage>. <pub-id pub-id-type="doi">10.1039/d0cp05356b</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Oviedo</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Ferres</surname>
<given-names>J. L.</given-names>
</name>
<name>
<surname>Buonassisi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Butler</surname>
<given-names>K. T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Interpretable and explainable machine learning for materials science and chemistry</article-title>. <source>Acc. Mat. Res.</source> <volume>3</volume>, <fpage>597</fpage>&#x2013;<lpage>607</lpage>. <pub-id pub-id-type="doi">10.1021/accountsmr.1c00244</pub-id>
</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: Machine learning in Python</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. <pub-id pub-id-type="doi">10.5555/1953048.2078195</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pilania</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Machine learning in materials science: From explainable predictions to autonomous design</article-title>. <source>Comput. Mat. Sci.</source> <volume>193</volume>, <fpage>110360</fpage>. <pub-id pub-id-type="doi">10.1016/j.commatsci.2021.110360</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Politis</surname>
<given-names>S. N.</given-names>
</name>
<name>
<surname>Colombo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Colombo</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Rekkas</surname>
<given-names>M. D.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Design of experiments (DoE) in pharmaceutical development</article-title>. <source>Drug Dev. Ind. Pharm.</source> <volume>43</volume>, <fpage>889</fpage>&#x2013;<lpage>901</lpage>. <pub-id pub-id-type="doi">10.1080/03639045.2017.1291672</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ram&#xf3;n</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Se&#xf1;orale</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mar&#xed;n</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Inclusion bodies: Not that bad</article-title>. <source>Front. Microbiol.</source> <volume>5</volume>, <fpage>56</fpage>. <pub-id pub-id-type="doi">10.3389/fmicb.2014.00056</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="web">
<person-group person-group-type="author">
<name>
<surname>Ribeiro</surname>
<given-names>M. T.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Guestrin</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Model-agnostic interpretability of machine learning</article-title>. <ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/1606.05386">https://arxiv.org/abs/1606.05386</ext-link>.</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Roscher</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Bohn</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Duarte</surname>
<given-names>M. F.</given-names>
</name>
<name>
<surname>Garcke</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Explainable machine learning for scientific insights and discoveries</article-title>. <source>IEEE Access</source> <volume>8</volume>, <fpage>42200</fpage>&#x2013;<lpage>42216</lpage>. <pub-id pub-id-type="doi">10.1109/access.2020.2976199</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Shapley</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>1953</year>). &#x201c;<article-title>Quota solutions of n-person games</article-title>,&#x201d; in <source>Contributions to the theory of games</source> (<publisher-loc>Santa Monica, CA, USA</publisher-loc>: <publisher-name>The Rand Corporation</publisher-name>).</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Singh</surname>
<given-names>S. M.</given-names>
</name>
<name>
<surname>Panda</surname>
<given-names>A. K.</given-names>
</name>
</person-group> (<year>2005</year>). <article-title>Solubilization and refolding of bacterial inclusion body proteins</article-title>. <source>J. Biosci. Bioeng.</source> <volume>99</volume>, <fpage>303</fpage>&#x2013;<lpage>310</lpage>. <pub-id pub-id-type="doi">10.1263/jbb.99.303</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Singhvi</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Saneja</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Srichandan</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Panda</surname>
<given-names>A. K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Bacterial inclusion bodies: A treasure trove of bioactive proteins</article-title>. <source>Trends Biotechnol.</source> <volume>38</volume>, <fpage>474</fpage>&#x2013;<lpage>486</lpage>. <pub-id pub-id-type="doi">10.1016/j.tibtech.2019.12.011</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2017</year>). <article-title>Aqueous ionic liquids and their effects on protein structures: An overview on recent theoretical and experimental results</article-title>. <source>J. Phys. Condens. Matter</source> <volume>29</volume>, <fpage>233001</fpage>. <pub-id pub-id-type="doi">10.1088/1361-648x/aa6c9d</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Clemens</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Herrera</surname>
<given-names>L. M.</given-names>
</name>
<name>
<surname>Arnold</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Knapp</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Presser</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2021a</year>). <article-title>Generic and specific recurrent neural network models: Applications for large and small scale biopharmaceutical upstream processes</article-title>. <source>Biotechnol. Rep.</source> <volume>31</volume>, <fpage>e00640</fpage>. <pub-id pub-id-type="doi">10.1016/j.btre.2021.e00640</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Heuer</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Winter</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Properties of ion complexes and their impact on charge transport in organic solvent-based electrolyte solutions for lithium batteries: Insights from a theoretical perspective</article-title>. <source>Batteries</source> <volume>4</volume>, <fpage>62</fpage>. <pub-id pub-id-type="doi">10.3390/batteries4040062</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bluhmki</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Towards a digital bioprocess replica: Computational approaches in biopharmaceutical development and manufacturing</article-title>. <source>Trends Biotechnol.</source> <volume>38</volume>, <fpage>1141</fpage>&#x2013;<lpage>1153</lpage>. <pub-id pub-id-type="doi">10.1016/j.tibtech.2020.05.008</pub-id>
</citation>
</ref>
<ref id="B43">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smiatek</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bluhmki</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2021b</year>). <article-title>Validation is not verification: Precise terminology and scientific methods in bioprocess modeling</article-title>. <source>Trends Biotechnol.</source> <volume>39</volume>, <fpage>1117</fpage>&#x2013;<lpage>1119</lpage>. <pub-id pub-id-type="doi">10.1016/j.tibtech.2021.04.003</pub-id>
</citation>
</ref>
<ref id="B44">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>&#x160;trumbelj</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Kononenko</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Explaining prediction models and individual predictions with feature contributions</article-title>. <source>Knowl. Info. Sys.</source> <volume>41</volume>, <fpage>647</fpage>&#x2013;<lpage>665</lpage>. <pub-id pub-id-type="doi">10.1007/s10115-013-0679-x</pub-id>
</citation>
</ref>
<ref id="B45">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sudret</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2008</year>). <article-title>Global sensitivity analysis using polynomial chaos expansions</article-title>. <source>Reliab. Eng. Sys. Saf.</source> <volume>93</volume>, <fpage>964</fpage>&#x2013;<lpage>979</lpage>. <pub-id pub-id-type="doi">10.1016/j.ress.2007.04.002</pub-id>
</citation>
</ref>
<ref id="B46">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Valax</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Georgiou</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>1993</year>). <article-title>Molecular characterization of <italic>&#x3b2;</italic>-lactamase inclusion bodies produced in Escherichia coli. 1. Composition</article-title>. <source>Biotechnol. Prog.</source> <volume>9</volume>, <fpage>539</fpage>&#x2013;<lpage>547</lpage>. <pub-id pub-id-type="doi">10.1021/bp00023a014</pub-id>
</citation>
</ref>
<ref id="B47">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Van Rossum</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Drake</surname>
<given-names>F. L.</given-names>
</name>
</person-group> (<year>2009</year>). <source>Python 3 reference manual</source>. <publisher-loc>Scotts Valley, CA, USA</publisher-loc>: <publisher-name>CreateSpace</publisher-name>.</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wakjira</surname>
<given-names>T. G.</given-names>
</name>
<name>
<surname>Al-Hamrani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Ebead</surname>
<given-names>U.</given-names>
</name>
<name>
<surname>Alnahhal</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Shear capacity prediction of FRP-RC beams using single and ensenble explainable machine learning models</article-title>. <source>Comp. Struct.</source> <volume>287</volume>, <fpage>115381</fpage>. <pub-id pub-id-type="doi">10.1016/j.compstruct.2022.115381</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Walther</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Mayer</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Sekot</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Antos</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Hahn</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Jungbauer</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Mechanism and model for solubilization of inclusion bodies</article-title>. <source>Chem. Eng. Sci.</source> <volume>101</volume>, <fpage>631</fpage>&#x2013;<lpage>641</lpage>. <pub-id pub-id-type="doi">10.1016/j.ces.2013.07.026</pub-id>
</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Walther</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Mayer</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Trefilov</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sekot</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hahn</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Jungbauer</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Prediction of inclusion body solubilization from shaken to stirred reactors</article-title>. <source>Biotechnol. Bioeng.</source> <volume>111</volume>, <fpage>84</fpage>&#x2013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1002/bit.24998</pub-id>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Walther</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Voigtmann</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Bruna</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Abusnina</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Tschelie&#xdf;nig</surname>
<given-names>A.-L.</given-names>
</name>
<name>
<surname>Allmer</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Smart process development: Application of machine-learning and integrated process modeling for inclusion body purification processes</article-title>. <source>Biotechnol. Prog.</source> <volume>38</volume>, <fpage>e3249</fpage>. <pub-id pub-id-type="doi">10.1002/btpr.3249</pub-id>
</citation>
</ref>
<ref id="B52">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>McKinney</surname>
<given-names>W.</given-names>
</name>
</person-group> &#x201c;<article-title>Data structures for statistical computing in Python</article-title>,&#x201d; in <conf-name>Proceedings of the 9th Python in Science Conference</conf-name>, <conf-loc>Austin, Texas</conf-loc>, <conf-date>June 2010</conf-date>. <pub-id pub-id-type="doi">10.25080/Majora-92bf1922-00a</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wong</surname>
<given-names>T.-T.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Performance evaluation of classification algorithms by k-fold and leave-one-out cross validation</article-title>. <source>Pattern Recogn.</source> <volume>48</volume>, <fpage>2839</fpage>&#x2013;<lpage>2846</lpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2015.03.009</pub-id>
</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Knape</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Burkert</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Mazzini</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Jung</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Craig</surname>
<given-names>V. S.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Artificial neural networks for the prediction of solvation energies based on experimental and computational data</article-title>. <source>Phys. Chem. Chem. Phys.</source> <volume>22</volume>, <fpage>24359</fpage>&#x2013;<lpage>24364</lpage>. <pub-id pub-id-type="doi">10.1039/d0cp03701j</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>