<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Bioinform.</journal-id>
<journal-title>Frontiers in Bioinformatics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Bioinform.</abbrev-journal-title>
<issn pub-type="epub">2673-7647</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1603133</article-id>
<article-id pub-id-type="doi">10.3389/fbinf.2025.1603133</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Bioinformatics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>NeSyDPP-4: discovering DPP-4 inhibitors for diabetes treatment with a neuro-symbolic AI approach</article-title>
<alt-title alt-title-type="left-running-head">Hossain et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fbinf.2025.1603133">10.3389/fbinf.2025.1603133</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Hossain</surname>
<given-names>Delower</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3020446/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Saghapour</surname>
<given-names>Ehsan</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2082823/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Chen</surname>
<given-names>Jake Y.</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/531449/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Department of Computer Science</institution>, <institution>The University of Alabama at Birmingham</institution>, <addr-line>Birmingham</addr-line>, <addr-line>AL</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>System Pharmacology and AI Research Center (SPARC)</institution>, <institution>The University of Alabama at Birmingham</institution>, <addr-line>Birmingham</addr-line>, <addr-line>AL</addr-line>, <country>United States</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Biomedical Informatics and Data Science</institution>, <institution>School of Medicine</institution>, <institution>The University of Alabama at Birmingham</institution>, <addr-line>Birmingham</addr-line>, <addr-line>AL</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/472005/overview">Huixiao Hong</ext-link>, United States Food and Drug Administration, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1586849/overview">Antonina L. Nazarova</ext-link>, University of Southern California, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3023838/overview">Heba Askr</ext-link>, University of Sadat City, Egypt</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Jake Y. Chen, <email>jakechen@uab.edu</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>21</day>
<month>07</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>5</volume>
<elocation-id>1603133</elocation-id>
<history>
<date date-type="received">
<day>31</day>
<month>03</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>05</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Hossain, Saghapour and Chen.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Hossain, Saghapour and Chen</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Diabetes Mellitus (DM) constitutes a global epidemic and is one of the top ten leading causes of mortality (WHO, 2019), projected to rank seventh by 2030. The US National Diabetes Statistics Report (2021) states that 38.4 million Americans have diabetes. Dipeptidyl Peptidase-4 (DPP-4) is an FDA-approved target for the treatment of type 2 diabetes mellitus (T2DM). However, current DPP-4 inhibitors may cause adverse effects, including gastrointestinal issues, severe joint pain (FDA safety warning), nasopharyngitis, hypersensitivity, and nausea. Moreover, the development of novel drugs and the <italic>in vivo</italic> assessment of DPP-4 inhibition are both costly and often impractical. These challenges highlight the urgent need for efficient <italic>in silico</italic> approaches to facilitate the discovery and optimization of safer and more effective DPP-4 inhibitors.</p>
</sec>
<sec>
<title>Methodology</title>
<p>Quantitative Structure-Activity Relationship (QSAR) modeling is a widely used computational approach for evaluating the properties of chemical substances. In this study, we employed a Neuro-symbolic (NeSy) approach, specifically the Logic Tensor Network (LTN), to develop a DPP-4 QSAR model capable of identifying potential small-molecule inhibitors and predicting bioactivity classification. For comparison, we also implemented baseline models using Deep Neural Networks (DNNs) and Transformers. A total of 6,563 bioactivity records (SMILES-based compounds with IC<sub>50</sub> values) were collected from ChEMBL, PubChem, BindingDB, and GTP. Feature sets used for model training included descriptors (CDK Extended&#x2013;PaDEL), fingerprints (Morgan), chemical language model embeddings (ChemBERTa-2), LLaMA3.2 embedding features, and physicochemical properties.</p>
</sec>
<sec>
<title>Results</title>
<p>Among all tested configurations, the Neuro-symbolic QSAR model (NeSyDPP-4) performed best using a combination of CDK extended and Morgan fingerprints. The model achieved an accuracy of 0.9725, an F1-score of 0.9723, an ROC AUC of 0.9719, and a Matthews correlation coefficient (MCC) of 0.9446. These results outperformed the baseline DNN and Transformer models, as well as existing state-of-the-art (SOTA) methods. To further validate the robustness of the model, we conducted an external evaluation using the Drug Target Commons (DTC) dataset, where NeSyDPP-4 also demonstrated strong performance, with an accuracy of 0.9579, an AUC-ROC of 0.9565, a Matthews Correlation Coefficient (MCC) of 0.9171, and an F1-score of 0.9577.</p>
</sec>
<sec>
<title>Discussion</title>
<p>These findings suggest that the NeSyDPP-4 model not only delivered high predictive performance but also demonstrated generalizability to external datasets. This approach presents a cost-effective and reliable alternative to traditional <italic>in vivo</italic> screening, offering valuable support for the identification and classification of biologically active DPP-4 inhibitors in the treatment of type 2 diabetes mellitus (T2DM).</p>
</sec>
</abstract>
<kwd-group>
<kwd>neuro-symbolic artificial intelligence</kwd>
<kwd>deep learning</kwd>
<kwd>DPP-4</kwd>
<kwd>drug discovery</kwd>
<kwd>machine learning</kwd>
<kwd>QSAR</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Integrative Bioinformatics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Diabetes mellitus (DM) is a chronic metabolic disorder characterized by elevated blood glucose levels, posing a significant global health burden. According to the World Health Organization (WHO) 2019 report, diabetes ranks among the top 10 leading causes of mortality, with an estimated 1.6 million deaths worldwide (<xref ref-type="bibr" rid="B44">World Health Organization: WHO, 2024</xref>; <xref ref-type="bibr" rid="B43">World Health Organization, 2020</xref>). In the United States, diabetes is a significant public health challenge, affecting approximately 38 million people (11.3% of the population) and leading to $327 billion in medical expenses and lost wages annually (<xref ref-type="bibr" rid="B7">CDC, 2024</xref>). Beyond economic costs, diabetes is associated with severe complications, including blindness, kidney failure, stroke, heart disease, and neuropathy. DM is broadly classified into type 1 diabetes mellitus (T1DM) and type 2 diabetes mellitus (T2DM), with T2DM accounting for over 90% of all cases. One decisive therapeutic target for managing Type 2 Diabetes Mellitus (T2DM) is the Dipeptidyl Peptidase-4 (DPP-4) enzyme, which plays a key role in regulating glucose metabolism. DPP-4 inhibitors, a class of FDA-approved medications, help control blood sugar levels by inhibiting this enzyme. However, current DPP-4 inhibitors have been linked to adverse effects such as gastrointestinal issues, severe joint pain, nasopharyngitis, hypersensitivity, and nausea (<xref ref-type="bibr" rid="B22">Huang et al., 2020</xref>). As a result, discovering safer and more effective DPP-4 inhibitors remains a critical research challenge.</p>
<p>Artificial intelligence (AI) has revolutionized diabetes management and drug discovery over the past two decades. Early AI models focused on predicting glucose levels, providing insulin dosage recommendations, and monitoring patients. In recent years, AI has rapidly advanced in the field of <italic>de novo</italic> drug design by leveraging large-scale molecular datasets. These models can not only generate novel drug candidates but also identify repurposable inhibitors and uncover complex relationships among genes, proteins, and disease mechanisms. In the field of DPP-4 inhibitor prediction, quantitative structure&#x2013;activity relationship (QSAR) models have been widely developed using machine learning techniques such as random forest, support vector machines (SVMs), XGBoost, gradient boosting machines, and deep neural networks (DNNs) (<xref ref-type="bibr" rid="B48">Gong et al., 2021</xref>; <xref ref-type="bibr" rid="B17">Hermansyah et al., 2021</xref>; <xref ref-type="bibr" rid="B29">Ojo et al., 2021</xref>; <xref ref-type="bibr" rid="B33">Septiawan et al., 2022</xref>; <xref ref-type="bibr" rid="B5">Bustamam et al., 2021</xref>; <xref ref-type="bibr" rid="B1">Ajiboye et al., 2021</xref>). Although these models have demonstrated high predictive performance, they have limitations, including poor interpretability, data inefficiency, and a lack of reasoning capabilities. The black-box nature of deep learning models further complicates their use in critical healthcare applications, where transparency, logical reasoning, and explainability are vital (<xref ref-type="bibr" rid="B15">Hassan et al., 2022</xref>).</p>
<p>To address these challenges, neuro-symbolic (NeSy) AI (<xref ref-type="bibr" rid="B20">Hossain and Chen, 2025</xref>) has emerged as a promising paradigm that combines neural networks with symbolic reasoning for more interpretable and data-efficient learning. In contrast to traditional AI approaches that rely solely on data, Neuro-symbolic AI (NeSy AI) integrates domain knowledge with data-driven learning, enabling logical reasoning and making it especially well-suited for healthcare and drug discovery applications (<xref ref-type="bibr" rid="B21">Hossain et al., 2025</xref>; <xref ref-type="bibr" rid="B15">Hassan et al., 2022</xref>). Studies identified numerous NeSy models that have demonstrated immense success in biomedical applications (<xref ref-type="bibr" rid="B20">Hossain and Chen, 2025</xref>; <xref ref-type="bibr" rid="B50">Yu et al., 2023</xref>; <xref ref-type="bibr" rid="B41">Wang W. et al., 2024</xref>), such as protein function prediction [MultiPredGO (<xref ref-type="bibr" rid="B13">Giri et al., 2020</xref>)], gene sequence analysis [KBANN (<xref ref-type="bibr" rid="B37">Towell and Shavlik, 1994</xref>)], diabetic retinopathy diagnosis [ExplainDR (<xref ref-type="bibr" rid="B24">Jang et al., 2021</xref>)], predicting the structure of proteins [extended KBANN (<xref ref-type="bibr" rid="B27">Maclin and Shavlik, 1994</xref>)], cardiotoxicity assessment [hERG-LTN (<xref ref-type="bibr" rid="B21">Hossain et al., 2025</xref>)], (Ontology) RRN (<xref ref-type="bibr" rid="B46">Yang et al., 2017</xref>), NSRL (<xref ref-type="bibr" rid="B19">Hohenecker and Lukasiewicz, 2020</xref>), Neuro-Fuzzy (<xref ref-type="bibr" rid="B47">Yang et al., 2020</xref>), FSKBANN (<xref ref-type="bibr" rid="B25">Kora et al., 2019</xref>), DeepMiRGO (<xref ref-type="bibr" rid="B40">Wang et al., 2019</xref>), NS-VQA (<xref ref-type="bibr" rid="B49">Yi et al., 2018</xref>), DFOL-VQA (<xref ref-type="bibr" rid="B2">Amizadeh et al., 2020</xref>), LNN (<xref ref-type="bibr" 
rid="B32">Riegel et al., 2020</xref>), NofM (<xref ref-type="bibr" rid="B36">Towell and Shavlik, 1991</xref>), PP-DKL (<xref ref-type="bibr" rid="B26">Lavin, 2022</xref>), FSD (<xref ref-type="bibr" rid="B8">Dobosz and Duch, 2008</xref>), CORGI (<xref ref-type="bibr" rid="B3">Arabshahi et al., 2021</xref>), NeurASP (<xref ref-type="bibr" rid="B34">Shi et al., 2019</xref>), XNMs (<xref ref-type="bibr" rid="B35">Teru et al., 2020</xref>), Semantic loss (<xref ref-type="bibr" rid="B45">Xu et al., 2018</xref>), NS-CL (<xref ref-type="bibr" rid="B28">Mao et al., 2019</xref>), and Logic Tensor Networks (LTNs) (<xref ref-type="bibr" rid="B4">Badreddine et al., 2021</xref>). In this study, we explore a hybrid neuro-symbolic approach integrating LTNs for DPP-4 bioactivity prediction, aiming to enhance predictive accuracy while enabling logical reasoning for novel drug discovery in T2DM treatment.</p>
<p>The paper&#x2019;s main contributions are summarized as follows: 1) We developed a scalable and robust AI predictive model that demonstrates significant improvements in accuracy for predicting the potency of T2DM inhibitors. 2) We introduced a novel integration of data and rules (knowledge) for DPP-4 inhibitor bioactivity classification. 3) We acquired and utilized a more diverse set of compound datasets, including chemical embeddings, descriptors, fingerprints, and physicochemical properties, which previous studies have not explored. The developed NeSyDPP-4 model can be used to discover novel DPP-4 active drugs by scanning large molecular datasets, such as ZINC, and identifying new candidate compounds, thereby accelerating the <italic>de novo</italic> design of drugs. Additionally, it facilitates QSAR model downstream applications, including virtual screening, contraindications, bioactivity indications, and other key elements of DPP-4 inhibitor therapy in clinical settings. These applications encompass docking, affinity prediction, ADMET analysis, and molecular dynamics (MD) studies for DPP-4 clinical applications.</p>
<p>The remainder of this manuscript is organized as follows: <xref ref-type="sec" rid="s2">Section 2</xref> provides the background, offering essential insights into the problem domain and its significance. <xref ref-type="sec" rid="s3">Section 3</xref> briefly reviews related work, highlighting existing approaches and their contributions to DPP-4 bioactivity classification. <xref ref-type="sec" rid="s4">Section 4</xref> describes the methodology, detailing the proposed approach, datasets, and algorithms used. <xref ref-type="sec" rid="s5">Section 5</xref> reports the results obtained from the experimental evaluation. <xref ref-type="sec" rid="s6">Section 6</xref> offers a discussion, interpreting and comparing the findings with existing studies. Finally, <xref ref-type="sec" rid="s7">Section 7</xref> concludes the paper with key findings and guidance for future research.</p>
</sec>
<sec id="s2">
<title>2 Background</title>
<sec id="s2-1">
<title>2.1 Dipeptidyl peptidase-4 inhibitors</title>
<p>DPP-4 is an enzyme primarily involved in glucose metabolism, particularly regulating blood glucose levels in T2DM. DPP-4 is an FDA-approved target for T2DM treatment. The primary aim of the DPP-4 inhibitor is to prevent the degradation of incretin hormones and improve blood glucose control. Several FDA-approved DPP-4 inhibitors include sitagliptin, saxagliptin, linagliptin, and alogliptin (<xref ref-type="bibr" rid="B10">FDA, 2023</xref>; <xref ref-type="sec" rid="s14">Supplementary Appendix A</xref>). In addition, <xref ref-type="fig" rid="F1">Figure 1</xref> depicts the 2D structures of FDA-approved DPP-4 inhibitors collected from ChEMBL.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>2D structure of FDA-approved dipeptidyl peptidase-4 (DPP-4) inhibitors collected from ChEMBL.</p>
</caption>
<graphic xlink:href="fbinf-05-1603133-g001.tif"/>
</fig>
</sec>
<sec id="s2-2">
<title>2.2 Quantitative structure&#x2013;activity relationship</title>
<p>QSAR modeling (<xref ref-type="bibr" rid="B31">Perkins et al., 2003</xref>) is a computational technique that uncovers patterns between molecular features and experimental outcomes, which helps predict the biological activity of compounds based on their chemical composition. Machine learning algorithms like artificial neural networks (ANNs), SVMs, and random forests are commonly applied to build accurate QSAR models. This approach accelerates drug discovery by enabling <italic>in silico</italic> assessment, reducing the reliance on extensive laboratory testing (wet-lab).</p>
</sec>
<sec id="s2-3">
<title>2.3 Symbolic AI</title>
<p>Good Old-Fashioned Artificial Intelligence (GOFAI), commonly referred to as Symbolic AI, is a classical AI approach emphasizing knowledge representation and reasoning. It was the dominant paradigm from the 1950s to the 1980s (<xref ref-type="bibr" rid="B20">Hossain and Chen, 2025</xref>), using methods like logic, rules, ontologies, decision trees, and knowledge graphs. Although symbolic AI excels in explanation, interpretability, and structured decision-making, it struggles to perform effectively at scale and with noisy data.</p>
</sec>
<sec id="s2-4">
<title>2.4 Sub-symbolic AI</title>
<p>Sub-symbolic AI, often called a &#x201c;black box&#x201d; approach, relies on large-scale data and statistical learning rather than explicit rules. Sub-symbolic (connectionist) AI has driven significant advancements in the modern AI era, such as autonomous driving systems and healthcare. For instance, a recent breakthrough contribution is AlphaFold, a highly accurate protein structure prediction model developed by Google DeepMind. ANNs are at the heart of this connectionist paradigm. Although sub-symbolic AI excels in pattern recognition and handling unstructured, noisy, and big data, it lacks transparency, explainability, and reasoning.</p>
</sec>
<sec id="s2-5">
<title>2.5 Neuro-symbolic AI</title>
<p>Neuro-symbolic AI (<xref ref-type="bibr" rid="B20">Hossain and Chen, 2025</xref>) is an emerging branch of artificial intelligence constructed to bridge the gap between symbolic and connectionist approaches by integrating their strengths and eliminating their flaws. This hybrid integration aims to create AI systems that are both interpretable and capable of reasoning. Interpretable and reasoning-based AI enhances trust, transparency, and decision-making in healthcare (<xref ref-type="bibr" rid="B13">Giri et al., 2020</xref>; <xref ref-type="bibr" rid="B37">Towell and Shavlik, 1994</xref>; <xref ref-type="bibr" rid="B24">Jang et al., 2021</xref>; <xref ref-type="bibr" rid="B21">Hossain et al., 2025</xref>; <xref ref-type="bibr" rid="B15">Hassan et al., 2022</xref>), enabling accurate diagnoses and personalized treatments. It also improves safety, accountability, and regulatory compliance.</p>
</sec>
<sec id="s2-6">
<title>2.6 Logic Tensor Network</title>
<p>The LTN is a neuro-symbolic framework developed at Sony Computer Science Laboratories (Sony CSL) that enables querying, learning, and reasoning with complex data and abstract knowledge. It uses a differentiable first-order language called Real Logic to integrate logical reasoning with data-driven learning (<xref ref-type="bibr" rid="B4">Badreddine et al., 2021</xref>). The core advantage of this paradigm is its ability to perform reasoning, which is expressed through logical components, and is highly scalable. Additionally, it offers a comprehensive framework capable of handling supervised and unsupervised tasks, including regression, classification, and clustering. In this study, we employed the LTN for DPP-4 bioactivity classification, and more details are discussed in the <italic>Methodology</italic> section.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Related work</title>
<p>DPP-4 inhibitors are a significant class of compounds used in treating type 2 diabetes; predicting their bioactivity is essential in early drug discovery. Several machine learning-based studies have been explored for classifying DPP-4 inhibitors. For instance, QSAR models have been widely applied with descriptors such as molecular fingerprints and physicochemical properties to predict inhibitory activity. Techniques like random forests, support vector machines, and deep neural networks have shown promising results in improving classification accuracy and guiding virtual screening processes. Specifically, <xref ref-type="bibr" rid="B16">Hermansyah et al. (2020)</xref> employed Random Forest and DNNs, achieving an accuracy of 0.9221 using CDK fingerprint and molecular properties. <xref ref-type="bibr" rid="B5">Bustamam et al. (2021)</xref> developed a QSAR&#x2013;DNN model, yielding an accuracy of 0.904. In addition, <xref ref-type="bibr" rid="B6">Cai et al. (2017)</xref> applied a Na&#xef;ve Bayesian (NB) approach using various fingerprint extractions, such as ECFP_4, ECFP_6, FCFP_4, and FCFP_6, reaching an accuracy of 0.872. <xref ref-type="bibr" rid="B38">Ulfa et al. (2021)</xref> combined Conv1D and LSTM layers for bioactivity classification, using CatBoost-selected fingerprint features, and achieved an accuracy of 0.8618. Finally, <xref ref-type="bibr" rid="B18">Hermansyah et al. (2023)</xref> used XGBoost with CDK and ECFP-6 features, reporting an accuracy of 0.8164. These studies highlight the progression from traditional machine learning models to advanced deep learning approaches, which have steadily improved predictive performance in DPP-4 bioactivity classification. However, to date, neuro-symbolic approaches have not been effectively integrated into DPP-4 research, even though such integration is essential for the discovery of novel chemical compounds. 
Moreover, previous research lacked experimentation with diverse chemical features, such as chemical language model embeddings and physicochemical properties. To address these gaps, we propose the NeSyDPP-4 strategy, which leverages Logic Tensor Networks and incorporates a wide range of feature representations, including LLaMA3.2 embeddings, PaDELPy CDKExtended fingerprints, Morgan fingerprints, chemical language model embeddings, and physicochemical properties.</p>
</sec>
<sec sec-type="methods" id="s4">
<title>4 Methodology</title>
<p>This section describes a set of procedures to determine the performance of LTNs (<xref ref-type="bibr" rid="B4">Badreddine et al., 2021</xref>), DNNs, and an advanced language model known as Transformer, using the ChEMBL, BindingDB, PubChem, and GTP datasets related to DPP-4 inhibitors. This section covers the entire pipeline, including the material procurement, data preprocessing, feature extraction, simulation environment, network architecture, LTN knowledge-based setting, the training and inferencing phases, and the evaluation metrics used to measure the trained model&#x2019;s performance.</p>
<sec id="s4-1">
<title>4.1 Data acquisition</title>
<p>The study constructed a new DPP-4 cohort using four publicly available chemical compound databases, namely, ChEMBL (<xref ref-type="bibr" rid="B11">Gaulton et al., 2011</xref>), BindingDB (<xref ref-type="bibr" rid="B12">Gilson et al., 2015</xref>), PubChem, and GTP. The ChEMBL database contains more than 2 million compounds. We retrieved canonical SMILES related to the DPP-4 inhibitor with the target organism <italic>Homo sapiens</italic> using ID: CHEMBL284 and standard type IC<sub>50</sub>. The data were extracted using the ChEMBL Python API (chembl_webresource_client). The BindingDB data were retrieved manually from the official site using the DPP-4 keyword string (dipeptidyl peptidase-4). Subsequently, data from PubChem in CSV format were retrieved using the following <ext-link ext-link-type="uri" xlink:href="https://pubchem.ncbi.nlm.nih.gov/gene/DPP4/human">link</ext-link>, and GTP data were accessed via the corresponding <ext-link ext-link-type="uri" xlink:href="https://www.guidetopharmacology.org/GRAC/ObjectDisplayForward?objectId=1612">link</ext-link>. In addition, to assess the model&#x2019;s robustness and generalizability, we collected additional DPP-4-related data from Drug Target Commons (DTC) via the provided <ext-link ext-link-type="uri" xlink:href="https://drugtargetcommons.fimm.fi/bioactivities?id=DTCT0024079&#x26;category=Target&#x26;name=DPP4">link</ext-link> for external validation. Curated data can be found in the Data Availability section.</p>
</sec>
<sec id="s4-2">
<title>4.2 Data preprocessing and feature extraction</title>
<p>The initial bioactivity datasets comprised various irrelevant features. The curated subsets focused explicitly on the IC<sub>50</sub> biological activity values, inhibitor identifiers (such as ChEMBL_ID and BindingDB_ID), and canonical SMILES representations and constrained the target organism to <italic>Homo sapiens</italic>. Notably, numerical IC<sub>50</sub> values were reported in nanomolar (nM) units for ChEMBL, BindingDB, and GTP, while PubChem provided values in micromolar (&#x3bc;M) units. All IC<sub>50</sub> measurements were harmonized to nanomolar units to standardize the data using the conversion formula nM &#x3d; &#x3bc;M&#xd7;1000. Subsequently, pIC<sub>50</sub> values were computed from the standardized IC<sub>50</sub> values through a logarithmic transformation using log10 (<xref ref-type="disp-formula" rid="e1">Equation 1</xref>) to normalize the distribution. Based on these pIC<sub>50</sub> values, compounds were labeled as active or inactive according to established thresholds from prior DPP-4 chemical research (<xref ref-type="bibr" rid="B38">Ulfa et al., 2021</xref>). After merging all curated datasets, duplicates were removed based on the canonical SMILES column, and entries with missing values in the ID, SMILES, or IC<sub>50</sub> fields were discarded. The final dataset was then split using a stratified sampling strategy with the <italic>scikit-learn</italic> package, followed by feature scaling using a standard scaler.</p>
<p>Afterward, a diverse array of features was extracted from SMILES representations, encompassing Morgan fingerprints (512, 1024, and 2048 bits), CDKExtended descriptors using PaDELPy (<xref ref-type="bibr" rid="B51">Yap, 2011</xref>), chemical embeddings generated via ChemBERTa-2 (<xref ref-type="bibr" rid="B52">Ahmad et al., 2022</xref>) and LLaMA3.2 (<xref ref-type="bibr" rid="B53">Ettaleb et al., 2024</xref>) from the Hugging Face model, and a comprehensive set of physicochemical properties [molecular weight, hydrophobicity-LogP, topological polar surface area (TPSA), hydrogen bond donors, hydrogen bond acceptors, and rotatable bonds] extracted using RDKit (<xref ref-type="bibr" rid="B23">Installation, 2024</xref>).<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mtext>pIC</mml:mtext>
<mml:mn>50</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>log</mml:mi>
<mml:mn>10</mml:mn>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mtext>IC</mml:mtext>
<mml:mn>50</mml:mn>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msup>
<mml:mn>10</mml:mn>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>9</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
</p>
</sec>
<sec id="s4-3">
<title>4.3 LTN classification model</title>
<p>We used LTNs (<xref ref-type="bibr" rid="B4">Badreddine et al., 2021</xref>) to build the NeSyDPP-4 classifier. LTNs combine neural networks with first-order logic, which allows us to perform reasoning over structured knowledge while learning from data. The architecture has two key components: logic and neural networks. The visual architecture of the classification model was adopted from the LTN (shown in <xref ref-type="fig" rid="F2">Figure 2</xref>). The logical mechanism contains a set of axioms or rules (explained in detail in the knowledge-based setting), and reasoning is revealed through those rules/axioms. In our context, <xref ref-type="table" rid="T1">Table 1</xref> represents the axioms and the relevant knowledge-based component. Notably, other network configuration parameters can be found in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Logic Tensor Network (LTN)-based architecture of the NeSyDPP-4 model.</p>
</caption>
<graphic xlink:href="fbinf-05-1603133-g002.tif"/>
</fig>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>LTN knowledge-based setting for DPP-4 classification.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Content</th>
<th align="left">Block</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Definition of axioms</td>
<td align="left">&#x2022; <inline-formula id="inf1">
<mml:math id="m2">
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">x</mml:mi>
<mml:mi mathvariant="normal">A</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">l</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>: all the examples of class <inline-formula id="inf2">
<mml:math id="m3">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> should have a label <inline-formula id="inf3">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
<break/>&#x2022; <inline-formula id="inf4">
<mml:math id="m5">
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>: all the examples of class <inline-formula id="inf5">
<mml:math id="m6">
<mml:mrow>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> (Inactive &#x3d; 1) should have a label <inline-formula id="inf6">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Axioms (rules and knowledge base)</td>
<td align="center">
<inline-formula id="inf7">
<mml:math id="m8">
<mml:mrow>
<mml:mspace width="3em"/>
<mml:mi mathvariant="script">K</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>A</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mo>&#x2200;</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
<mml:mi>p</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>l</mml:mi>
<mml:mi>B</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">SatAgg is given by</td>
<td align="center">
<inline-formula id="inf8">
<mml:math id="m9">
<mml:mrow>
<mml:mspace width="5em"/>
<mml:msub>
<mml:mtext>SatAgg</mml:mtext>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">K</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi mathvariant="script">G</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi mathvariant="bold-italic">D</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
<tr>
<td align="left">Learning and loss</td>
<td align="center">
<inline-formula id="inf9">
<mml:math id="m10">
<mml:mrow>
<mml:mspace width="3em"/>
<mml:mi mathvariant="bold-italic">L</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:munder>
<mml:mtext>SatAgg</mml:mtext>
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi mathvariant="script">K</mml:mi>
</mml:mrow>
</mml:munder>
<mml:msub>
<mml:mi mathvariant="script">G</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold-italic">&#x3b8;</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo>&#x2190;</mml:mo>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>&#x3d5;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Note: This table was inspired by the official LTN model. For more detail, see <xref ref-type="sec" rid="s14">Supplementary Appendix C</xref>.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>List of hyperparameters used for NeSyDPP-4 (LTN) and DNN models for DPP-4 inhibitor classification.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Parameter</th>
<th align="left">LTN</th>
<th align="left">DNN</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Units (input sample/hidden units)</td>
<td align="left">(768,384,192,2)</td>
<td align="left">(768,384,192,2)</td>
</tr>
<tr>
<td align="left">Activation</td>
<td align="left">ReLU</td>
<td align="left">ReLU</td>
</tr>
<tr>
<td align="left">No. of dense layers</td>
<td align="left">3</td>
<td align="left">3</td>
</tr>
<tr>
<td align="left">Dropout</td>
<td align="left">0.25</td>
<td align="left">0.25</td>
</tr>
<tr>
<td align="left">Seed</td>
<td align="left">42</td>
<td align="left">42</td>
</tr>
<tr>
<td align="left">Batch size</td>
<td align="left">128</td>
<td align="left">128</td>
</tr>
<tr>
<td align="left">Training epochs</td>
<td align="left">100</td>
<td align="left">100</td>
</tr>
<tr>
<td align="left">Learning rate</td>
<td align="left">0.00001</td>
<td align="left">0.00001</td>
</tr>
<tr>
<td align="left">Loss function</td>
<td align="left">LTN pMeanError</td>
<td align="left">Sparse Categorical Crossentropy</td>
</tr>
<tr>
<td align="left">Optimizer</td>
<td align="left">Adam</td>
<td align="left">Adam</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Note: LTN and DNN models were configured identically in terms of MLP architecture, activation function, and training setup to ensure a fair comparison.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>The pMeanError aggregator is defined in <xref ref-type="disp-formula" rid="e2">Equation 2</xref>:<disp-formula id="e2">
<mml:math id="m11">
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>M</mml:mi>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>u</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>n</mml:mi>
</mml:munderover>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>u</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi>p</mml:mi>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:msup>
<mml:mo>,</mml:mo>
<mml:mspace width="1em"/>
<mml:mi>p</mml:mi>
<mml:mo>&#x2265;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>Here, pMeanError is computed through universal quantification (&#x201c;for all&#x201d; or <inline-formula id="inf999">
<mml:math id="m999">
<mml:mrow>
<mml:msub>
<mml:mo>&#x2200;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>A</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:msub>
<mml:mo>&#x2200;</mml:mo>
<mml:mrow>
<mml:mi>x</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:mi>B</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>), which refers to the generalized mean of the deviations with respect to the truth (for more detail, see the <ext-link ext-link-type="uri" xlink:href="https://github.com/tommasocarraro/LTNtorch/tree/main/tutorials">LTNtorch tutorials</ext-link>). Further, all of the SatAgg attributes are defined below (<xref ref-type="table" rid="T1">Table 1</xref>).<list list-type="simple">
<list-item>
<p>&#x2022; SatAgg: This stands for &#x201c;Satisfaction Aggregator,&#x201d; an operator that aggregates the truth values of the formulas in K (if there is more than one rule).</p>
</list-item>
<list-item>
<p>&#x2022; &#x3d5;&#x2208;K: This part indicates that &#x3d5; (phi) belongs to the set K. &#x3d5; is often used to represent a predicate.</p>
</list-item>
<list-item>
<p>&#x2022; <inline-formula id="inf10">
<mml:math id="m12">
<mml:mrow>
<mml:mi mathvariant="script">G</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi mathvariant="normal">&#x3b8;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>: This denotes the grounding (<inline-formula id="inf11">
<mml:math id="m13">
<mml:mrow>
<mml:mi mathvariant="script">G</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>) with parameters &#x3b8;. &#x3b8; represents a set of parameters or weights in a model.</p>
</list-item>
<list-item>
<p>&#x2022; x&#x2190;D: <inline-formula id="inf12">
<mml:math id="m14">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> indicates the dataset of all examples (domain).</p>
</list-item>
<list-item>
<p>&#x2022; B is a mini-batch sampled from D.</p>
</list-item>
</list>
</p>
<p>However, <xref ref-type="fig" rid="F2">Figure 2</xref> depicts an architecture composed of three segments. Segment A represents several features used to train the model, while Segment B illustrates the LTN-based classification architecture model, which was conceived from the LTN paper. Specifically, feature&#x2013;label pairs <inline-formula id="inf13">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf14">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">l</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (for classes A and B, respectively) are used as input into the MLP. The output is evaluated using a universal <inline-formula id="inf15">
<mml:math id="m17">
<mml:mrow>
<mml:mo>&#x2200;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula> quantification equation that influences the loss function and updates model weights via backpropagation, the same as for <inline-formula id="inf16">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> (all the features related to the B class). The logical formulation follows <inline-formula id="inf17">
<mml:math id="m19">
<mml:mrow>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">l</mml:mi>
<mml:mi mathvariant="bold-italic">A</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mtext>&#x2009;and&#x2009;</mml:mtext>
<mml:mi mathvariant="bold-italic">P</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold-italic">l</mml:mi>
<mml:mi mathvariant="bold-italic">B</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, where <italic>p</italic> represents the model/MLP and K denotes the complete knowledge expressed in real/first-order logic. For more details, refer to the official LTN GitHub repository.</p>
</sec>
<sec id="s4-4">
<title>4.4 Model training and validation phase</title>
<p>LTN, DNN, and Transformer models were trained and tested using TensorFlow 2.15.1 with Python 3.10.16 on the UAB server with an NVIDIA A100 80 GB PCIe GPU; other dependency packages can be found in the project&#x2019;s GitHub repository under environment.yml. We partitioned the data in a 70:10:20 ratio and trained the models for 100 epochs. To optimize model performance, we conducted hyperparameter tuning via Grid Search (<xref ref-type="sec" rid="s14">Supplementary Appendix B</xref>) with multiple training trials. The best configurations, trial 01 and trial 07, achieved the highest accuracy of 0.9726, using ReLU activation and a learning rate of 0.0001, with three layers. In addition to experimenting with the LTN, we conducted the simulation with DNN and Transformer models implemented with Keras to compare LTN performance fairly. <xref ref-type="table" rid="T2">Table 2</xref> lists the network configuration parameters, and the simulation notebook (project GitHub) describes the details.</p>
<p>Furthermore, we conducted external validation by applying the best-performing model weights to an additional 1,045 data samples extracted from the DTC dataset. Notably, there was no duplication or overlap between the external validation data and the training dataset. The following metrics were used to assess the trained model&#x2019;s performance: accuracy, F1-score (F1), ROC AUC score, and Matthews correlation coefficient (MCC). Additionally, the confusion matrix (CM) provides a visual representation of misclassified classes (<xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Confusion matrix of DNN, Transformer, and LTN using CDKExtended &#x2b; Morgan features based on DTC dataset evaluation.</p>
</caption>
<graphic xlink:href="fbinf-05-1603133-g003.tif"/>
</fig>
<p>
<xref ref-type="disp-formula" rid="e3">Equation 3</xref> represents the accuracy:<disp-formula id="e3">
<mml:math id="m20">
<mml:mrow>
<mml:mtext>Accuracy</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>
<xref ref-type="disp-formula" rid="e4">Equation 4</xref> represents the F1-score:<disp-formula id="e4">
<mml:math id="m21">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#xd7;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#x2b;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
<p>
<xref ref-type="disp-formula" rid="e5">Equation 5</xref> represents the ROC AUC score:<disp-formula id="e5">
<mml:math id="m22">
<mml:mrow>
<mml:mtext>ROC&#x2009;AUC</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x222b;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mn>1</mml:mn>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>
<xref ref-type="disp-formula" rid="e6">Equation 6</xref> represents the MCC:<disp-formula id="e6">
<mml:math id="m23">
<mml:mrow>
<mml:mtext>MCC</mml:mtext>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mfrac>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
</sec>
</sec>
<sec sec-type="results" id="s5">
<title>5 Results</title>
<p>This section outlines the performance of the proposed NeSyDPP-4 model in identifying potential DPP-4 inhibitors by integrating logical rules into a neural network via the LTN architecture. To achieve this, we extracted a diverse set of molecular features from each SMILES/drug representation. These include the Morgan fingerprint, CDK extended descriptors, and embeddings generated from chemical language models such as ChemBERTa-2 and LLaMA3.2 (via the Hugging Face platform). Additionally, physicochemical properties were computed using RDKit to enrich the feature space with interpretable molecular characteristics. This section presents four result grids: <xref ref-type="table" rid="T3">Table 3</xref> shows all the features separated and combined as input results for an ablation study; <xref ref-type="table" rid="T4">Table 4</xref> presents the fair comparison with baseline DNN and transformer architecture performance; <xref ref-type="table" rid="T5">Table 5</xref> summarizes the model&#x2019;s performance on the external evaluation; and finally, <xref ref-type="table" rid="T6">Table 6</xref> presents the benchmarking evaluation. To illustrate, <xref ref-type="table" rid="T3">Table 3</xref> depicts the performance of NeSyDPP-4 (LTN) with different inputs. The best-performing feature set is the combination of CDKExtended &#x2b; ECFP, which yielded the highest accuracy (97.25%), F1-score (97.23%), AUC ROC (97.19%), and MCC (94.46%), while physicochemical features alone yield the lowest performance of accuracy (73.49%), F1-score (73.16%), AUC ROC (73.09%), and MCC (46.38%). ChemBERTa-2 and LLaMA3.2 performed comparably but achieved lower performance than the fingerprint-based methods. Overall, physicochemical properties alone are insufficient for effective bioactivity classification.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>NeSyDPP-4 (LTN) model&#x2019;s performance comparison using various feature representations and input dimensions for DPP-4 inhibitor classification.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Model</th>
<th align="center">Feature</th>
<th align="center">Input</th>
<th align="center">Acc</th>
<th align="center">F1</th>
<th align="center">AUC ROC</th>
<th align="center">MCC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="9" align="center">NeSyDPP-4 (LTN)</td>
<td align="left">CDKExtended &#x2b; ECFP</td>
<td align="left">1024&#x2b;(512 &#x2b; 1024&#x2b;2048)</td>
<td align="center">
<bold>0.9725</bold>
</td>
<td align="center">
<bold>0.9723</bold>
</td>
<td align="center">
<bold>0.9719</bold>
</td>
<td align="center">
<bold>0.9446</bold>
</td>
</tr>
<tr>
<td align="left">ECFP</td>
<td align="left">1024</td>
<td align="center">0.9687</td>
<td align="center">0.9684</td>
<td align="center">0.9680</td>
<td align="center">0.9370</td>
</tr>
<tr>
<td align="left">ECFP</td>
<td align="left">2048</td>
<td align="center">0.9657</td>
<td align="center">0.9654</td>
<td align="center">0.9650</td>
<td align="center">0.9308</td>
</tr>
<tr>
<td align="left">ECFP</td>
<td align="left">512</td>
<td align="center">0.9649</td>
<td align="center">0.9646</td>
<td align="center">0.9643</td>
<td align="center">0.9293</td>
</tr>
<tr>
<td align="left">Combined All</td>
<td align="left">7430</td>
<td align="center">0.9634</td>
<td align="center">0.9631</td>
<td align="center">0.9632</td>
<td align="center">0.9262</td>
</tr>
<tr>
<td align="left">CDKExtended</td>
<td align="left">1024</td>
<td align="center">0.9504</td>
<td align="center">0.9499</td>
<td align="center">0.9492</td>
<td align="center">0.9001</td>
</tr>
<tr>
<td align="left">ChemBERTa-2</td>
<td align="left">768</td>
<td align="center">0.8956</td>
<td align="center">0.8944</td>
<td align="center">0.8935</td>
<td align="center">0.7892</td>
</tr>
<tr>
<td align="left">LLaMA3.2</td>
<td align="left">2048</td>
<td align="center">0.8933</td>
<td align="center">0.8926</td>
<td align="center">0.8933</td>
<td align="center">0.7854</td>
</tr>
<tr>
<td align="left">Physicochemical</td>
<td align="left">6</td>
<td align="center">0.7349</td>
<td align="center">0.7316</td>
<td align="center">0.7309</td>
<td align="center">0.4638</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Note: The table reports Accuracy (Acc), F1-score (F1), AUC-ROC, and Matthews Correlation Coefficient (MCC). The model achieved the highest performance (bold) using the combined CDKExtended and ECFP descriptors.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Comparison of the NeSyDPP-4 (LTN) model with baseline deep learning architecture and the Transformer.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Model</th>
<th align="center">Feature</th>
<th align="center">Input dimension</th>
<th align="center">Acc</th>
<th align="center">F1</th>
<th align="center">AUC ROC</th>
<th align="center">MCC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">NeSyDPP-4 (LTN)</td>
<td align="left">CDKExtended &#x2b; ECFP</td>
<td align="left">1024&#x2b;(512 &#x2b; 1024&#x2b;2048)</td>
<td align="center">
<bold>0.9725</bold>
</td>
<td align="center">
<bold>0.9723</bold>
</td>
<td align="center">
<bold>0.9719</bold>
</td>
<td align="center">
<bold>0.9446</bold>
</td>
</tr>
<tr>
<td align="center">DNN</td>
<td align="left">CDKExtended &#x2b; ECFP</td>
<td align="left">1024&#x2b;(512 &#x2b; 1024&#x2b;2048)</td>
<td align="center">0.9695</td>
<td align="center">0.9692</td>
<td align="center">0.9691</td>
<td align="center">0.9385</td>
</tr>
<tr>
<td align="center">Transformer</td>
<td align="left">SMILES/Emb</td>
<td align="left">212</td>
<td align="center">0.7821</td>
<td align="center">0.7306</td>
<td align="center">0.8549</td>
<td align="center">0.5641</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Note: Models were trained and evaluated on identical feature sets, and results are based on internal data.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>External evaluation (DTC dataset) comparing NeSyDPP-4 with baseline DNN and Transformer model performances.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Model</th>
<th align="center">Feature</th>
<th align="center">Input dimension</th>
<th align="center">Acc</th>
<th align="center">F1</th>
<th align="center">AUC ROC</th>
<th align="center">MCC</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">NeSyDPP-4 (LTN)</td>
<td align="left">CDKExtended &#x2b; ECFP</td>
<td align="left">1024&#x2b;(512 &#x2b; 1024&#x2b;2048)</td>
<td align="center">
<bold>0.9578</bold>
</td>
<td align="center">
<bold>0.9576</bold>
</td>
<td align="center">
<bold>0.9564</bold>
</td>
<td align="center">
<bold>0.9171</bold>
</td>
</tr>
<tr>
<td align="left">DNN</td>
<td align="left">CDKExtended &#x2b; ECFP</td>
<td align="left">1024&#x2b;(512 &#x2b; 1024&#x2b;2048)</td>
<td align="center">0.9521</td>
<td align="center">0.9518</td>
<td align="center">0.9506</td>
<td align="center">0.9057</td>
</tr>
<tr>
<td align="left">Transformer</td>
<td align="left">SMILES/Emb</td>
<td align="left">212</td>
<td align="center">0.6478</td>
<td align="center">0.6251</td>
<td align="center">0.6384</td>
<td align="center">0.3095</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T6" position="float">
<label>TABLE 6</label>
<caption>
<p>Comparative performance of typical machine learning and deep learning models and NeSyDPP-4 for DPP-4 inhibitor classification.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="left">Author</th>
<th align="left">Metrics</th>
<th align="left">Result</th>
<th align="left">Reference</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">NeSyDPP-4</td>
<td align="left">Hossain <italic>et al</italic>
</td>
<td align="left">Accuracy</td>
<td align="left">
<bold>0.9578</bold>
</td>
<td align="left"/>
</tr>
<tr>
<td align="left">Random Forest</td>
<td align="left">Oky Hermansyah <italic>et al</italic>
</td>
<td align="left">Accuracy</td>
<td align="left">0.9221</td>
<td align="left">
<xref ref-type="bibr" rid="B16">Hermansyah et al. (2020)</xref>
</td>
</tr>
<tr>
<td align="left">DNN</td>
<td align="left">Haris Hamzah <italic>et al</italic>
</td>
<td align="left">Accuracy</td>
<td align="left">0.9060</td>
<td align="left">
<xref ref-type="bibr" rid="B14">Hamzah et al. (2020)</xref>
</td>
</tr>
<tr>
<td align="left">QSAR-DNN</td>
<td align="left">Alhadi Bustamam <italic>et al</italic>
</td>
<td align="left">Accuracy</td>
<td align="left">0.9040</td>
<td align="left">
<xref ref-type="bibr" rid="B5">Bustamam et al. (2021)</xref>
</td>
</tr>
<tr>
<td align="left">NB</td>
<td align="left">Jie Cai <italic>et al</italic>
</td>
<td align="left">Accuracy</td>
<td align="left">0.8720</td>
<td align="left">
<xref ref-type="bibr" rid="B6">Cai et al. (2017)</xref>
</td>
</tr>
<tr>
<td align="left">Conv1D&#x2013;LSTM</td>
<td align="left">Adawiyah Ulfa <italic>et al</italic>
</td>
<td align="left">Accuracy</td>
<td align="left">0.8618</td>
<td align="left">
<xref ref-type="bibr" rid="B38">Ulfa et al. (2021)</xref>
</td>
</tr>
<tr>
<td align="left">XGBoost</td>
<td align="left">Oky Hermansyah <italic>et al</italic>
</td>
<td align="left">Accuracy</td>
<td align="left">0.8164</td>
<td align="left">
<xref ref-type="bibr" rid="B18">Hermansyah et al. (2023)</xref>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In addition, <xref ref-type="table" rid="T4">Table 4</xref> exhibits the internal validation results comparing the performance of NeSyDPP-4 (LTN), DNN, and Transformer models. The NeSyDPP-4 model, using CDKExtended and ECFP features, achieved the highest performance with 97.25% accuracy and an MCC of 94.46%, highlighting the strength of neuro-symbolic reasoning. The DNN model, using the same input features but without reasoning capabilities, yielded slightly lower results (96.95% accuracy and 93.85% MCC). In contrast, the Transformer model, which relied on SMILES embeddings, exhibited the weakest performance (78.21% accuracy and 56.41% MCC). These findings accentuate the effectiveness of fingerprint-based features over SMILES-based language models for bioactivity classification tasks. Furthermore, in the external evaluation, using the DTC dataset (<xref ref-type="table" rid="T5">Table 5</xref>), NeSyDPP-4 continued to outperform baseline models, confirming its strong generalization capability.</p>
<p>Finally, <xref ref-type="table" rid="T6">Table 6</xref> presents a state-of-the-art (SOTA) performance comparison. The proposed NeSyDPP-4 model outperforms all baseline models in terms of accuracy. Traditional models such as Random Forest and XGBoost achieved 92.21% and 81.64% accuracy, respectively, while deep learning models like DNN and QSAR-DNN reached approximately 90%, as reported in prior benchmarks. These results underscore the superior performance of the neuro-symbolic NeSyDPP-4 model. In addition, <xref ref-type="fig" rid="F4">Figure 4</xref> illustrates the training and validation accuracy curves over 100 epochs for both the DNN and NeSyDPP-4 simulations.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>The graphs represent epoch and accuracy during the training and validation phases.</p>
</caption>
<graphic xlink:href="fbinf-05-1603133-g004.tif"/>
</fig>
</sec>
<sec sec-type="discussion" id="s6">
<title>6 Discussion</title>
<p>This article aimed to develop a neuro-symbolic model leveraging LTN, an integration of data and a logic-driven approach, for predicting DPP-4 inhibition in diabetes mellitus. One of the key challenges in DPP-4 inhibitor research and AI integration is the absence of unified, data- and knowledge-driven experimental frameworks that support logical reasoning. Moreover, current approaches fail to effectively utilize diverse features and integrate logical rules, such as extracting embeddings from large language models (LLMs) and physicochemical properties, to predict DPP-4 bioactivity accurately. Furthermore, prior studies have focused on data-driven approaches such as DNN and Transformer. In this context, our study introduces LTN-based NeSyDPP-4, a neuro-symbolic classifier trained on the curated diverse bioactivity data and logical rules (<xref ref-type="table" rid="T1">Table 1</xref>), which demonstrates superior performance in predicting DPP-4 inhibitory activity compared to existing models. Notably, some studies suggest that it can be semi-interpretable since rules (<xref ref-type="table" rid="T1">Table 1</xref>) are apparently understandable by humans regarding how models should make decisions. However, the study&#x2019;s findings provide valuable insights into the applicability and robustness of the LTN model in discovering bioactivity behavior. To illustrate, the utilization of this advanced machine learning technique (LTN) surpassed the state-of-the-art performance compared to other models with classification tasks; the proposed model demonstrates superior accuracy of 0.9725 and an MCC score of 0.9446 in the internal dataset for DPP-4 inhibitor bioactivity prediction. In contrast, several other studies have reported lower accuracies: the QSAR-DNN model by <xref ref-type="bibr" rid="B5">Bustamam et al. (2021)</xref> achieved an accuracy of 0.9040; <xref ref-type="bibr" rid="B38">Ulfa et al. (2021)</xref> reported an accuracy of 0.8618 using Conv1D&#x2013;LSTM; random forest by <xref ref-type="bibr" rid="B16">Hermansyah et al. (2020)</xref> yielded an accuracy of 0.9221; and DNN by <xref ref-type="bibr" rid="B14">Hamzah et al. (2020)</xref> obtained an accuracy of 0.9060. Furthermore, the NB model by <xref ref-type="bibr" rid="B6">Cai et al. (2017)</xref> gained an accuracy of 0.8720, while ML-based XGBoost by <xref ref-type="bibr" rid="B18">Hermansyah et al. (2023)</xref> reported an accuracy of 0.8164.</p>
<p>Overall, this study emphasizes the value of integrating neuro-symbolic modeling, which combines data-driven learning with logical rule-based reasoning, for highly accurate classification of DPP-4 inhibitor activity in the context of diabetes mellitus, a task that conventional data-driven AI approaches (DNN, Transformer) are incapable of. Significantly, the developed NeSyDPP-4 model holds substantial promise for future applications, including the discovery of novel DPP-4 compounds, prediction of DPP-4-related pharmacological interactions, efficient high-throughput screening of molecular libraries to identify potential active agents, and virtual screening of approved/existing compounds that can be utilized for drug repurposing (<xref ref-type="fig" rid="F5">Figures 5B, C</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Workflow and future implications of the developed model (NeSyDPP-4). <bold>(A)</bold> represents a workflow from target identification to the best model (NeSyDPP-4) development; <bold>(B)</bold> represents the application of the best model to discover new DPP-4 drugs (FDA-approved/drug repurposing), which is known as <italic>in silico</italic> assessment; finally, based on the compounds identified <italic>in silico</italic>, <italic>in vitro</italic> and <italic>in vivo</italic> studies can be conducted in the future, as shown in <bold>(C)</bold>.</p>
</caption>
<graphic xlink:href="fbinf-05-1603133-g005.tif"/>
</fig>
<sec id="s6-1">
<title>6.1 Limitation</title>
<p>Acknowledging the limitations of our study, we note that although the LTN has demonstrated significant promise, it may be incapable of incorporating diverse and comprehensive external biological knowledge into neural networks due to structural limitations.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s7">
<title>7 Conclusion</title>
<p>Diabetes mellitus is a major global health concern, and discovering effective chemical compounds is crucial to tackling this epidemic. In this study, we develop a QSAR system to identify potential therapeutic DPP-4 inhibitor compounds using an advanced AI framework called LTN that integrates domain-specific knowledge into neural networks. The study is a pioneer in applying the neuro-symbolic strategy in the DM domain and provides new insights that reveal a higher performance for DPP-4 bioactivity classification. The root cause of achieving such performance could be upholding learning and reasoning principles and training neural networks with rules. Furthermore, we experimented with DNN and NLP Transformer baselines, and the LTN-based model NeSyDPP-4&#x2013;QSAR attained the highest accuracy, exceeding both these baseline approaches and prior SOTA strategies. In conclusion, the findings of this study indicate that the neuro-symbolic approach for uncovering potential DPP-4 inhibitors is promising. However, an ideal direction for future work could involve integrating additional potential neuro-symbolic strategies, such as Semantic Loss and DeepProbLog, to study GLP-1, IDO, and PTP1B inhibitors, which would include extracting a variety of new descriptors and fingerprints from different datasets (PubChem and DrugBank), focusing on regression tasks.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The dataset utilized in this study can be found at: <ext-link ext-link-type="uri" xlink:href="https://drive.google.com/file/d/1SGiYOyuSiirueZR3F6K0d7aHcPT43PYw/view">https://drive.google.com/file/d/1SGiYOyuSiirueZR3F6K0d7aHcPT43PYw/view</ext-link>, and the experimental code repository can be found at: <ext-link ext-link-type="uri" xlink:href="https://github.com/hossain013/NeSyDPP4-QSAR">https://github.com/hossain013/NeSyDPP4-QSAR</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s9">
<title>Author contributions</title>
<p>DH: conceptualization, data curation, formal analysis, investigation, methodology, resources, software, supervision, validation, visualization, writing &#x2013; original draft, and writing &#x2013; review &#x26; editing. ES: conceptualization, formal analysis, investigation, resources, validation, and writing &#x2013; review &#x26; editing. JC: funding acquisition, project administration, resources, supervision, and writing &#x2013; review &#x26; editing.</p>
</sec>
<sec sec-type="funding-information" id="s10">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. The work is partly supported by an NIH grant R21DK129968 and research startup funding awarded to Jake Chen.</p>
</sec>
<ack>
<p>The authors acknowledge the biomedical data science infrastructure and staff support provided by the UAB U-BRITE program.</p>
</ack>
<sec sec-type="COI-statement" id="s11">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s12">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="s13">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="s14">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fbinf.2025.1603133/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fbinf.2025.1603133/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.docx" id="SM1" mimetype="application/docx" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ahmad</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Simon</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Chithrananda</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Grand</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Ramsundar</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>ChemBERTa-2: towards chemical foundation models</article-title>. <source>arXiv</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2209.01712</pub-id>
</citation>
</ref>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ajiboye</surname>
<given-names>B. O.</given-names>
</name>
<name>
<surname>Iwaloye</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Owolabi</surname>
<given-names>O. V.</given-names>
</name>
<name>
<surname>Ejeje</surname>
<given-names>J. N.</given-names>
</name>
<name>
<surname>Okerewa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Johnson</surname>
<given-names>O. O.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Screening of potential antidiabetic phytochemicals from Gongronema latifolium leaf against therapeutic targets of type 2 diabetes mellitus: multi-targets drug design</article-title>. <source>SN Appl. Sci.</source> <volume>4</volume> (<issue>1</issue>), <fpage>14</fpage>. <pub-id pub-id-type="doi">10.1007/s42452-021-04880-2</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Amizadeh</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Palangi</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Polozov</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Koishida</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Neuro-symbolic visual reasoning: disentangling &#x201c;visual&#x201d; from &#x201c;reasoning&#x201d;</article-title>. <source>Int. Conf. Mach. Learn.</source> <volume>1</volume>, <fpage>279</fpage>&#x2013;<lpage>290</lpage>. <pub-id pub-id-type="doi">10.48550/arxiv.2006.11524</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Arabshahi</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gawarecki</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Mazaitis</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Azaria</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Mitchell</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Conversational Neuro-Symbolic commonsense reasoning</article-title>. <source>Proc. AAAI Conf. Artif. Intell.</source> <volume>35</volume> (<issue>6</issue>), <fpage>4902</fpage>&#x2013;<lpage>4911</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v35i6.16623</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Badreddine</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Garcez</surname>
<given-names>A. D.</given-names>
</name>
<name>
<surname>Serafini</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Spranger</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Logic tensor networks</article-title>. <source>Artif. Intell.</source> <volume>303</volume>, <fpage>103649</fpage>. <pub-id pub-id-type="doi">10.1016/j.artint.2021.103649</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bustamam</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Hamzah</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Husna</surname>
<given-names>N. A.</given-names>
</name>
<name>
<surname>Syarofina</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dwimantara</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Yanuar</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Artificial intelligence paradigm for ligand-based virtual screening on the drug discovery of type 2 diabetes mellitus</article-title>. <source>J. Big Data</source> <volume>8</volume> (<issue>1</issue>), <fpage>74</fpage>. <pub-id pub-id-type="doi">10.1186/s40537-021-00465-3</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cai</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Du</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Ye</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gu</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Predicting DPP-IV inhibitors with machine learning approaches</article-title>. <source>J. Computer-Aided Mol. Des.</source> <volume>31</volume> (<issue>4</issue>), <fpage>393</fpage>&#x2013;<lpage>402</lpage>. <pub-id pub-id-type="doi">10.1007/s10822-017-0009-6</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<collab>CDC</collab> (<year>2024</year>). <article-title>Methods for the national diabetes statistics report</article-title>. <source>Diabetes</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.cdc.gov/diabetes/php/data-research/methods.html?CDC_AAref_Val=https://www.cdc.gov/diabetes/data/statistics-report/index.html">https://www.cdc.gov/diabetes/php/data-research/methods.html?CDC_AAref_Val&#x3d;https://www.cdc.gov/diabetes/data/statistics-report/index.html</ext-link>.</comment>
</citation>
</ref>
<ref id="B8">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Dobosz</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Duch</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2008</year>). &#x201c;<article-title>Fuzzy symbolic dynamics for neurodynamical systems</article-title>,&#x201d; in <source>Lecture notes in computer science</source>, <fpage>471</fpage>&#x2013;<lpage>478</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-540-87559-8_49</pub-id>
</citation>
</ref>
<ref id="B53">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ettaleb</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kamel</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Moriceau</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Aussenac-Gilles</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>The Llama 3 Herd of Models</article-title>. <source>arXiv</source>. <pub-id pub-id-type="doi">10.48550/arxiv.2407.21783</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="web">
<collab>FDA</collab> (<year>2023</year>). <article-title>FDA approved dipeptidyl peptidase IV (DPP IV) inhibitors</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK542331/#:%7E:text=DPP%2D4%20inhibitors%2C%20known%20as,saxagliptin%2C%20linagliptin%2C%20and%20alogliptin">https://www.ncbi.nlm.nih.gov/books/NBK542331/&#x23;:&#x7e;:text&#x3d;DPP%2D4%20inhibitors%2C%20known%20as,saxagliptin%2C%20linagliptin%2C%20and%20alogliptin</ext-link>.</comment>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gaulton</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bellis</surname>
<given-names>L. J.</given-names>
</name>
<name>
<surname>Bento</surname>
<given-names>A. P.</given-names>
</name>
<name>
<surname>Chambers</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Davies</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hersey</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>ChEMBL: a large-scale bioactivity database for drug discovery</article-title>. <source>Nucleic Acids Res.</source> <volume>40</volume> (<issue>D1</issue>), <fpage>D1100</fpage>&#x2013;<lpage>D1107</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkr777</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gilson</surname>
<given-names>M. K.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Baitaluk</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Nicola</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Hwang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chong</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>BindingDB in 2015: a public database for medicinal chemistry, computational chemistry and systems pharmacology</article-title>. <source>Nucleic Acids Res.</source> <volume>44</volume> (<issue>D1</issue>), <fpage>D1045</fpage>&#x2013;<lpage>D1053</lpage>. <pub-id pub-id-type="doi">10.1093/nar/gkv1072</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Giri</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Dutta</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Halani</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Saha</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>MulTiPREDGO: deep Multi-Modal Protein function prediction by amalgamating protein structure, sequence, and interaction information</article-title>. <source>IEEE J. Biomed. Health Inf.</source> <volume>25</volume> (<issue>5</issue>), <fpage>1832</fpage>&#x2013;<lpage>1838</lpage>. <pub-id pub-id-type="doi">10.1109/jbhi.2020.3022806</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gong</surname>
<given-names>J. N.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Z. D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C. Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>A novel artificial intelligence protocol to investigate potential leads for diabetes mellitus</article-title>. <source>Mol. Divers.</source> <volume>25</volume>, <fpage>1375</fpage>&#x2013;<lpage>1393</lpage>. <pub-id pub-id-type="doi">10.1007/s11030-021-10204-8</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hamzah</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Bustamam</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Yanuar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sarwinda</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Predicting the molecular structure relationship and the biological activity of DPP-4 inhibitor using deep neural network with CatBoost method as feature selection</article-title>. <source>IEEEXplore</source>, <fpage>101</fpage>&#x2013;<lpage>108</lpage>. <pub-id pub-id-type="doi">10.1109/icacsis51025.2020.9263204</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hassan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Guan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Melliou</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2022</year>). <article-title>Neuro-symbolic learning: principles and applications in ophthalmology</article-title>. <comment>arXiv preprint arXiv:2208.00374</comment>.</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hermansyah</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Bustamam</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Yanuar</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Virtual screening of DPP-4 inhibitors using QSAR-Based artificial intelligence and molecular docking of HIT compounds to DPP-8 and DPP-9 enzymes</article-title>. <source>Res. Square</source>. <pub-id pub-id-type="doi">10.21203/rs.2.22282/v2</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hermansyah</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Bustamam</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Yanuar</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Virtual screening of dipeptidyl peptidase-4 inhibitors using quantitative structure&#x2013;activity relationship-based artificial intelligence and molecular docking of hit compounds</article-title>. <source>Comput. Biol. Chem.</source> <volume>95</volume>, <fpage>107597</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiolchem.2021.107597</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hermansyah</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Rahmawati</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dwi Putri Masrijal</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Intan Perma Sari</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2023</year>). <source>Identification of DPP-4 inhibitor active compounds using machine learning classification</source>. <publisher-loc>Canada</publisher-loc>: <publisher-name>International Journal of Chemical and Biochemical Sciences</publisher-name>.</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hohenecker</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Lukasiewicz</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Ontology reasoning with deep neural networks</article-title>. <source>J. Artif. Intell. Res.</source> <volume>68</volume>. <pub-id pub-id-type="doi">10.1613/jair.1.11661</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hossain</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J. Y.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>A study on neuro-symbolic artificial intelligence: healthcare perspectives</article-title>. <comment>arXiv preprint arXiv:2503.18213</comment>.</citation>
</ref>
<ref id="B21">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Hossain</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J. Y.</given-names>
</name>
<name>
<surname>Abir</surname>
<given-names>F. A.</given-names>
</name>
</person-group> (<year>2025</year>). &#x201c;<article-title>hERG-LTN: a new paradigm</article-title>,&#x201d; in <source>hERG cardiotoxicity assessment using neuro-symbolic and generative AI embedding (MegaMolBART, Llama3.2, Gemini, DeepSeek) approach</source> (<publisher-loc>bioRxiv USA</publisher-loc>: <publisher-name>Cold Spring Harbor Laboratory</publisher-name>), <fpage>2025</fpage>&#x2013;<lpage>2102</lpage>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Meng</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Adverse event profiles of dipeptidyl peptidase-4 inhibitors: data mining of the public version of the FDA adverse event reporting system</article-title>. <source>BMC Pharmacol. Toxicol.</source> <volume>21</volume> (<issue>1</issue>), <fpage>68</fpage>. <pub-id pub-id-type="doi">10.1186/s40360-020-00447-w</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="web">
<collab>Installation</collab> (<year>2024</year>). <article-title>The RDKit 2024.09.6 documentation</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.rdkit.org/docs/Install.html">https://www.rdkit.org/docs/Install.html</ext-link>.</comment>
</citation>
</ref>
<ref id="B24">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Jang</surname>
<given-names>S.-I.</given-names>
</name>
<name>
<surname>Girard</surname>
<given-names>M. J. A.</given-names>
</name>
<name>
<surname>Thiery</surname>
<given-names>A. H.</given-names>
</name>
</person-group> (<year>2021</year>). <source>Explainable diabetic Retinopathy classification based on neural-symbolic learning</source>. <publisher-name>arXiv (Cornell University)</publisher-name>, <fpage>104</fpage>&#x2013;<lpage>114</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://ceur-ws.org/Vol-2986/paper8.pdf">http://ceur-ws.org/Vol-2986/paper8.pdf</ext-link>.</comment>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kora</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Meenakshi</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Swaraja</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Rajani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Islam</surname>
<given-names>M. K.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Detection of Cardiac arrhythmia using fuzzy logic</article-title>. <source>Inf. Med. Unlocked</source> <volume>17</volume>, <fpage>100257</fpage>. <pub-id pub-id-type="doi">10.1016/j.imu.2019.100257</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Lavin</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Neuro-Symbolic neurodegenerative disease modeling as probabilistic programmed deep kernels</article-title>,&#x201d; in <source>Studies in computational intelligence</source>, <fpage>49</fpage>&#x2013;<lpage>64</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-93080-6_5</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Maclin</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Shavlik</surname>
<given-names>J. W.</given-names>
</name>
</person-group> (<year>1994</year>). &#x201c;<article-title>Refining algorithms with knowledge-based neural networks: improving the Chou-Fasman algorithm for protein folding</article-title>,&#x201d; in <conf-name>Conference on Learning Theory</conf-name>, <fpage>249</fpage>&#x2013;<lpage>286</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://dl.acm.org/citation.cfm?id=188535">http://dl.acm.org/citation.cfm?id&#x3d;188535</ext-link>.</comment>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mao</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Kohli</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Tenenbaum</surname>
<given-names>J. B.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>The Neuro-Symbolic Concept Learner: interpreting scenes, words, and sentences from natural supervision</article-title>. <source>Int. Conf. Learn. Represent</source>. <pub-id pub-id-type="doi">10.48550/arxiv.1904.12584</pub-id> </citation>
</ref>
<ref id="B29">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ojo</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Ojo</surname>
<given-names>A. B.</given-names>
</name>
<name>
<surname>Okolie</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Abdurrahman</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Barnabas</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Evbuomwan</surname>
<given-names>I. O.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Elucidating the interactions of compounds identified from Aframomum melegueta seeds as promising candidates for the management of diabetes mellitus: a computational approach</article-title>. <source>Inf. Med. Unlocked</source> <volume>26</volume>, <fpage>100720</fpage>. <pub-id pub-id-type="doi">10.1016/j.imu.2021.100720</pub-id>
</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Perkins</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Tong</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Welsh</surname>
<given-names>W. J.</given-names>
</name>
</person-group> (<year>2003</year>). <article-title>Quantitative structure&#x2010;activity relationship methods: perspectives on drug discovery and toxicology</article-title>. <source>Environ. Toxicol. Chem.</source> <volume>22</volume> (<issue>8</issue>), <fpage>1666</fpage>&#x2013;<lpage>1679</lpage>. <pub-id pub-id-type="doi">10.1897/01-171</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Riegel</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Gray</surname>
<given-names>A. G.</given-names>
</name>
<name>
<surname>Luus</surname>
<given-names>F. P. S.</given-names>
</name>
<name>
<surname>Khan</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Makondo</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Akhalwaya</surname>
<given-names>I. Y.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Logical neural networks</article-title>. <source>arXiv Cornell Univ.</source> <pub-id pub-id-type="doi">10.48550/arxiv.2006.13155</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Septiawan</surname>
<given-names>N. R. R.</given-names>
</name>
<name>
<surname>Prakoso</surname>
<given-names>N. B. H.</given-names>
</name>
<name>
<surname>Kurniawan</surname>
<given-names>N. I.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>DPP IV inhibitors activities prediction as an anti-diabetic agent using particle swarm optimization-support vector machine method</article-title>. <source>J. RESTI Rekayasa Sist. Dan. Teknol. Inf.</source> <volume>6</volume> (<issue>6</issue>), <fpage>974</fpage>&#x2013;<lpage>980</lpage>. <pub-id pub-id-type="doi">10.29207/resti.v6i6.4470</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Shi</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Explainable and explicit visual reasoning over scene graphs</article-title>,&#x201d; in <conf-name>2019 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>, <fpage>8368</fpage>&#x2013;<lpage>8376</lpage>. <pub-id pub-id-type="doi">10.1109/cvpr.2019.00857</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Teru</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Denis</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Hamilton</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Inductive relation prediction by subgraph reasoning</article-title>. <source>Int. Conf. Mach. Learn.</source> <volume>1</volume>, <fpage>9448</fpage>&#x2013;<lpage>9457</lpage>. <pub-id pub-id-type="doi">10.48550/arxiv.1911.06962</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Towell</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Shavlik</surname>
<given-names>J. W.</given-names>
</name>
</person-group> (<year>1991</year>). <article-title>Interpretation of artificial neural networks: mapping knowledge-based neural networks into rules</article-title>. <source>Neural Inf. Process. Syst.</source> <volume>4</volume>, <fpage>977</fpage>&#x2013;<lpage>984</lpage>. <pub-id pub-id-type="doi">10.5555/2986916.2987036</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Towell</surname>
<given-names>G. G.</given-names>
</name>
<name>
<surname>Shavlik</surname>
<given-names>J. W.</given-names>
</name>
</person-group> (<year>1994</year>). <article-title>Knowledge-based artificial neural networks</article-title>. <source>Artif. Intell.</source> <volume>70</volume> (<issue>1&#x2013;2</issue>), <fpage>119</fpage>&#x2013;<lpage>165</lpage>. <pub-id pub-id-type="doi">10.1016/0004-3702(94)90105-8</pub-id>
</citation>
</ref>
<ref id="B38">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ulfa</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bustamam</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Yanuar</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Amalia</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Anki</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Model QSAR classification using Conv1D-LSTM of dipeptidyl peptidase-4 inhibitors</article-title>. <source>IEEE Xplore</source>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1109/aims52415.2021.9466083</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Jin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>StructuralDPPIV: a novel deep learning model based on atom structure for predicting dipeptidyl peptidase-IV inhibitory peptides</article-title>. <source>Bioinformatics</source> <volume>40</volume> (<issue>2</issue>), <fpage>btae057</fpage>. <pub-id pub-id-type="doi">10.1093/bioinformatics/btae057</pub-id>
</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>DeepMiR2GO: inferring functions of human microRNAs using a deep multi-label classification model</article-title>. <source>Int. J. Mol. Sci.</source> <volume>20</volume> (<issue>23</issue>), <fpage>6046</fpage>. <pub-id pub-id-type="doi">10.3390/ijms20236046</pub-id>
</citation>
</ref>
<ref id="B41">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Towards data- and knowledge-driven AI: a survey on neuro-symbolic computing</article-title>. <source>IEEE Trans. Pattern Analysis Mach. Intell.</source> <volume>47</volume>, <fpage>878</fpage>&#x2013;<lpage>899</lpage>. <pub-id pub-id-type="doi">10.1109/tpami.2024.3483273</pub-id>
</citation>
</ref>
<ref id="B42">
<citation citation-type="web">
<collab>Wikipedia</collab> (<year>2018</year>). <article-title>Wikipedia DPP-4 inhibitors</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://en.wikipedia.org/wiki/Dipeptidyl_peptidase-4_inhibitor">https://en.wikipedia.org/wiki/Dipeptidyl_peptidase-4_inhibitor</ext-link>.</comment>
</citation>
</ref>
<ref id="B43">
<citation citation-type="web">
<collab>World Health Organization</collab> (<year>2020</year>). <article-title>World health statistics 2020: monitoring health for the SDGs, sustainable development goals</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://iris.who.int/bitstream/handle/10665/332070/9789240005105-eng.pdf">https://iris.who.int/bitstream/handle/10665/332070/9789240005105-eng.pdf</ext-link>.</comment>
</citation>
</ref>
<ref id="B44">
<citation citation-type="web">
<collab>World Health Organization: WHO</collab> (<year>2024</year>). <article-title>The top 10 causes of death</article-title>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.who.int/news-room/fact-sheets/detail/the-top-10-causes-of-death#:%7E:text=Of%20the%2056.9%20million%20deaths%20worldwide%20in%202016%2C,of%20death%20globally%20in%20the%20last%2015%20years">https://www.who.int/news-room/fact-sheets/detail/the-top-10-causes-of-death&#x23;:~:text&#x3d;Of%20the%2056.9%20million%20deaths%20worldwide%20in%202016%2C,of%20death%20globally%20in%20the%20last%2015%20years</ext-link>.</comment>
</citation>
</ref>
<ref id="B45">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Xu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Friedman</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Liang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Van den Broeck</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2018</year>). <source>A semantic loss function for deep learning with symbolic knowledge</source>. <publisher-name>PMLR</publisher-name>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v80/xu18h.html">https://proceedings.mlr.press/v80/xu18h.html</ext-link>.</comment>
</citation>
</ref>
<ref id="B46">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Cohen</surname>
<given-names>W. W.</given-names>
</name>
</person-group> (<year>2017</year>). <source>Differentiable learning of logical rules for knowledge base reasoning</source>. <publisher-loc>USA</publisher-loc>: <publisher-name>arXiv (Cornell University)</publisher-name>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2017/hash/0e55666a4ad822e0e34299df3591d979-Abstract.html">https://proceedings.neurips.cc/paper_files/paper/2017/hash/0e55666a4ad822e0e34299df3591d979-Abstract.html</ext-link>.</comment>
</citation>
</ref>
<ref id="B47">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Ishay</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>NeurASP: embracing neural networks into answer set programming</article-title>,&#x201d; in <conf-name>Proceedings of the Twenty-Ninth International Joint Conference on Artificial Intelligence</conf-name> (<publisher-name>IJCAI-20</publisher-name>), <fpage>1755</fpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.ijcai.org/proceedings/2020/0243.pdf">https://www.ijcai.org/proceedings/2020/0243.pdf</ext-link>.</comment>
</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yap</surname>
<given-names>C. W.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>PaDEL&#x2010;descriptor: an open source software to calculate molecular descriptors and fingerprints</article-title>. <source>J. Comput. Chem.</source> <volume>32</volume> (<issue>7</issue>), <fpage>1466</fpage>&#x2013;<lpage>1474</lpage>. <pub-id pub-id-type="doi">10.1002/jcc.21707</pub-id>
</citation>
</ref>
<ref id="B49">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yi</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gan</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Torralba</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kohli</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Tenenbaum</surname>
<given-names>J. B.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Neural-symbolic VQA: disentangling reasoning from vision and language understanding</article-title>. <source>arXiv Cornell Univ.</source> <volume>31</volume>, <fpage>1031</fpage>&#x2013;<lpage>1042</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="http://arxiv.org/pdf/1810.02338.pdf">http://arxiv.org/pdf/1810.02338.pdf</ext-link>.</comment>
</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Pan</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>A survey on neural-symbolic learning systems</article-title>. <source>Neural Netw.</source> <volume>166</volume>, <fpage>105</fpage>&#x2013;<lpage>126</lpage>. <pub-id pub-id-type="doi">10.1016/j.neunet.2023.06.028</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>