<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Immunol.</journal-id>
<journal-title>Frontiers in Immunology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Immunol.</abbrev-journal-title>
<issn pub-type="epub">1664-3224</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fimmu.2025.1630863</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Immunology</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>
<italic>In-silico</italic> tool for predicting and scanning rheumatoid arthritis-inducing peptides in an antigen</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Tomer</surname>
<given-names>Ritu</given-names>
</name>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2169898/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Jain</surname>
<given-names>Shipra</given-names>
</name>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3181871/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gahlot</surname>
<given-names>Pushpendra Singh</given-names>
</name>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3181454/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Bajiya</surname>
<given-names>Nisha</given-names>
</name>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2638481/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Raghava</surname>
<given-names>Gajendra P. S.</given-names>
</name>
<xref ref-type="author-notes" rid="fn001">
<sup>*</sup>
</xref>
<xref ref-type="author-notes" rid="fn003">
<sup>&#x2020;</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/301928/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<institution>Department of Computational Biology, Indraprastha Institute of Information Technology</institution>, <addr-line>New Delhi</addr-line>, <country>India</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/688360/overview">Roberto Paganelli</ext-link>, YDA, Institute for Advanced Biologic Therapies, Italy</p>
</fn>
<fn fn-type="edited-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3057187/overview">Tracy Nero</ext-link>, The University of Melbourne, Australia</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3085082/overview">Shuang Ma</ext-link>, Shenyang Ligong University, China</p>
</fn>
<fn fn-type="corresp" id="fn001">
<p>*Correspondence: Gajendra P. S. Raghava, <email xlink:href="mailto:raghava@iiitd.ac.in">raghava@iiitd.ac.in</email>
</p>
</fn>
<fn fn-type="other" id="fn003">
<p>&#x2020;ORCID: Ritu Tomer, <uri xlink:href="https://orcid.org/0000-0002-6171-8660">orcid.org/0000-0002-6171-8660</uri>; Shipra Jain, <uri xlink:href="https://orcid.org/0000-0002-7045-5188">orcid.org/0000-0002-7045-5188</uri>; Pushpendra Singh Gahlot, <uri xlink:href="https://orcid.org/0009-0004-9868-358X">orcid.org/0009-0004-9868-358X</uri>; Nisha Bajiya, <uri xlink:href="https://orcid.org/0000-0002-5075-5386">orcid.org/0000-0002-5075-5386</uri>; Gajendra P. S. Raghava, <uri xlink:href="https://orcid.org/0000-0002-8902-2876">orcid.org/0000-0002-8902-2876</uri>
</p>
</fn>
</author-notes>
<pub-date pub-type="epub">
<day>01</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>16</volume>
<elocation-id>1630863</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>12</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Tomer, Jain, Gahlot, Bajiya and Raghava.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Tomer, Jain, Gahlot, Bajiya and Raghava</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Rheumatoid arthritis (RA) is an autoimmune disorder in which the immune system mounts an abnormal response to self-antigens, resulting in chronic inflammation and joint damage. Identifying antigenic regions in proteins that trigger RA is essential for the development of protein-based therapeutics.</p>
</sec>
<sec>
<title>Methods</title>
<p>We developed predictive models for HLA class II binding RA-inducing peptides using a dataset of 291 experimentally validated RA-inducing peptides and 165 RA non-inducing peptides. Positional and compositional analyses were performed to identify residue preferences. Alignment-based approaches (BLAST and MERCI), machine learning classifiers, deep learning, and protein language model&#x2013;based methods were evaluated for predictive performance.</p>
</sec>
<sec>
<title>Results</title>
<p>Compositional analysis revealed significant enrichment of glycine, proline, and tyrosine in RA-inducing peptides. Alignment-based approaches provided high precision but limited coverage. Among machine learning methods, XGBoost achieved the best performance (AUC = 0.75) on the validation dataset, while ProtBERT was the top-performing protein language model (AUC = 0.72). The ensemble model integrating XGBoost with MERCI-derived motifs yielded the highest overall performance (AUC = 0.80; MCC = 0.45) on an independent validation dataset.</p>
</sec>
<sec>
<title>Discussion</title>
<p>This study presents computational strategies for identifying RA-inducing peptides and demonstrates the advantage of combining motif-based and machine learning approaches for improved performance. The findings are valuable for evaluating the safety of proteins in probiotics, genetically modified foods, and protein-based therapeutics. To facilitate broader use, the best-performing approach has been implemented in RAIpred, a web server and standalone software tool for predicting and scanning RA-inducing peptides, available at <uri xlink:href="https://webs.iiitd.edu.in/raghava/raipred/">https://webs.iiitd.edu.in/raghava/raipred/</uri>.</p>
</sec>
</abstract>
<kwd-group>
<kwd>autoimmune disease</kwd>
<kwd>rheumatoid arthritis</kwd>
<kwd>T-cell epitopes</kwd>
<kwd>machine learning</kwd>
<kwd>large language models</kwd>
</kwd-group>
<counts>
<fig-count count="4"/>
<table-count count="11"/>
<equation-count count="6"/>
<ref-count count="42"/>
<page-count count="14"/>
<word-count count="6745"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-in-acceptance</meta-name>
<meta-value>Autoimmune and Autoinflammatory Disorders : Autoimmune Disorders</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>Highlights</title>
<list list-type="bullet">
<list-item>
<p>Rheumatoid arthritis (RA), an incurable chronic joint disorder.</p>
</list-item>
<list-item>
<p>Identification of antigenic regions responsible for inducing RA.</p>
</list-item>
<list-item>
<p>Application of protein language models in prediction of RA-inducing peptides.</p>
</list-item>
<list-item>
<p>Ensemble model integrating similarity-based and machine learning approaches.</p>
</list-item>
<list-item>
<p>Development of a web server, a standalone tool, and PyPI and GitHub packages.</p>
</list-item>
</list>
</sec>
<sec id="s2" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Rheumatoid arthritis (RA) is an incurable, chronic autoimmune joint disorder that exhibits significant clinical heterogeneity (<xref ref-type="bibr" rid="B1">1</xref>&#x2013;<xref ref-type="bibr" rid="B3">3</xref>). RA is characterized by abnormal inflammation within the synovial tissue of the joints, which progressively damages both cartilage and bone (<xref ref-type="bibr" rid="B4">4</xref>, <xref ref-type="bibr" rid="B5">5</xref>). RA affects approximately 1% of the world population, translating to millions of individuals (<xref ref-type="bibr" rid="B6">6</xref>, <xref ref-type="bibr" rid="B7">7</xref>). Several studies have reported that RA contributes to a diverse range of systemic complications, including cardiovascular disease (<xref ref-type="bibr" rid="B5">5</xref>, <xref ref-type="bibr" rid="B8">8</xref>). The pathogenesis of RA is still not fully understood, but it is believed to arise from interactions between genetic predisposition and environmental factors. Several immune cells secrete immune-regulatory molecules that cause inflammation and joint damage&#x2014;hallmarks of autoimmune diseases (see <xref ref-type="fig" rid="f1">
<bold>Figure&#xa0;1</bold>
</xref>) (<xref ref-type="bibr" rid="B9">9</xref>). Previously, <italic>in-silico</italic> methods have been developed to predict binders of HLA-DRB1*04:01, as it plays a critical role in RA (<xref ref-type="bibr" rid="B10">10</xref>, <xref ref-type="bibr" rid="B11">11</xref>).</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>The etiology of rheumatoid arthritis (RA), highlighting how genetic and environmental factors influence T-helper cell activation, ultimately leading to bone erosion and cartilage degradation.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-16-1630863-g001.tif">
<alt-text content-type="machine-generated">Diagram showing the progression from a normal joint to an arthritic joint, highlighting bone erosion, inflamed synovial membrane, and cartilage degradation. It illustrates how genetic and environmental factors, including smoking and infections, lead to autoantigen modification. The autoimmune response involves CD4+ T-cells, B-cells, and the production of rheumatoid factors. Autoantigens are recognized by antigen-presenting cells, leading to inflammation, synovial membrane degradation, and joint deterioration. The process includes cytokines like IL-17 and TNF-alpha, osteoclast activation through RANKL, and the role of fibroblast-like synoviocytes.</alt-text>
</graphic>
</fig>
<p>In response to genetic and environmental triggers, autoreactive CD4+ T cells become activated and present antigens to B cells, which in turn produce autoantibodies such as rheumatoid factor and anti-citrullinated protein antibodies (<xref ref-type="bibr" rid="B12">12</xref>). This autoimmune response is further driven by pro-inflammatory cytokines such as tumor necrosis factor-alpha (TNF-&#x3b1;) and interleukin-6 (IL-6), which promote enhanced immune cell activity and inflammation in the synovium (<xref ref-type="bibr" rid="B13">13</xref>). Additionally, macrophages and synovial fibroblasts release chemokines that perpetuate the inflammatory response (<xref ref-type="bibr" rid="B14">14</xref>). The dysregulation of the Janus kinase (JAK)/signal transducer and activator of transcription (STAT) signaling pathway is crucial, as it mediates the signaling of several key cytokine receptors involved in RA (<xref ref-type="bibr" rid="B2">2</xref>, <xref ref-type="bibr" rid="B15">15</xref>). Together, these pathways contribute to a persistent autoimmune response, leading to chronic joint inflammation and eventual tissue destruction (<xref ref-type="bibr" rid="B16">16</xref>).</p>
<p>Traditionally, therapy for RA has primarily focused on disease-modifying anti-rheumatic drugs (DMARDs) (<xref ref-type="bibr" rid="B16">16</xref>). These drugs have been reported to reduce pro-inflammatory cytokine production, thereby decreasing the underlying inflammation in the synovium and slowing disease progression (<xref ref-type="bibr" rid="B17">17</xref>). Significant progress has been made in the use of DMARDs that target inflammation to prevent joint damage. Methotrexate is considered the first-line therapy due to its proven efficacy and safety (<xref ref-type="bibr" rid="B18">18</xref>&#x2013;<xref ref-type="bibr" rid="B20">20</xref>). Additionally, hydroxychloroquine and sulfasalazine are also widely regarded as conventional DMARDs, which can be administered either alone or in combination with methotrexate (<xref ref-type="bibr" rid="B21">21</xref>, <xref ref-type="bibr" rid="B22">22</xref>). Apart from DMARDs, glucocorticoids, non-steroidal anti-inflammatory drugs (NSAIDs), and inflammatory cytokine inhibitors (ICIs) are also employed in managing and preventing RA progression (<xref ref-type="bibr" rid="B1">1</xref>). However, these traditional drugs have their limitations, including inadequate response, intolerance, high cost, and a number of side effects (<xref ref-type="bibr" rid="B23">23</xref>).</p>
<p>In the era of protein therapeutics, one of the key challenges is identifying the antigens or antigenic regions that activate T-helper cells implicated in RA (<xref ref-type="bibr" rid="B24">24</xref>). In the past, numerous computational methods have been developed to predict T-helper epitopes responsible for inducing cytokines such as interferon-gamma, TNF-&#x3b1;, IL-4, and IL-5 (<xref ref-type="bibr" rid="B25">25</xref>&#x2013;<xref ref-type="bibr" rid="B28">28</xref>). These cytokines play a crucial role in the development of autoimmune diseases like RA. However, to date, there is no <italic>in-silico</italic> tool available that predicts T-helper cell-inducing peptides or epitopes that specifically trigger RA.</p>
<p>In this study, we focused on identifying peptides that activate T cells responsible for inducing RA. We extracted 291 experimentally validated MHC class II-binding RA-associated peptides and 165 non-associated peptides from the Immune Epitope Database (IEDB; <ext-link ext-link-type="uri" xlink:href="https://www.iedb.org">https://www.iedb.org</ext-link>). To create a robust model, we implemented both alignment-based approaches such as the Basic Local Alignment Search Tool (BLAST) and motif discovery, as well as alignment-free approaches, including machine learning (ML), deep learning (DL), and protein language models (PLMs). In addition, we developed an ensemble model that combines our best performing ML models with motif-based features to achieve higher predictive performance. Finally, we developed a web server and standalone software tool, RAIpred, for predicting, designing, and scanning RA-inducing peptides.</p>
</sec>
<sec id="s3" sec-type="materials|methods">
<label>2</label>
<title>Materials and methods</title>
<sec id="s3_1">
<label>2.1</label>
<title>Dataset preparation and preprocessing</title>
<p>We gathered experimentally validated data from the IEDB for our study and performed several preprocessing steps to improve the quality of the data used (<xref ref-type="bibr" rid="B29">29</xref>). First, we extracted a total of 344 unique RA-inducing peptides as the positive dataset and 176 unique RA non-inducing peptides (not overlapping with the positive dataset) as the negative dataset from IEDB. We observed that RA-associated peptides are binders of both HLA class I and HLA class II molecules (please refer to <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Figure S1</bold>
</xref>). Among these, the HLA class I set contained only 46 peptides, while the HLA class II set included 298 peptides. Due to the limited number of HLA class I peptides, we selected only the HLA class II peptides for further analysis.</p>
<p>Next, we carried out several preprocessing steps, including the removal of duplicate sequences from the negative dataset that overlapped with the positive dataset. We also filtered out sequences from the positive dataset with very low frequency (i.e., those that appeared &#x2264; 6 times). Furthermore, we retained sequences with lengths between 9 and 20 amino acids. After preprocessing, we were left with 291 sequences in the positive dataset and 165 sequences in the negative dataset. The detailed workflow is shown in <xref ref-type="fig" rid="f2">
<bold>Figure&#xa0;2</bold>
</xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The complete workflow used to develop RAIpred.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-16-1630863-g002.tif">
<alt-text content-type="machine-generated">Flowchart illustrating a prediction model development process from the IEDB database. Positive and negative datasets are split into training (80%) and validation (20%). Feature extraction involves compositional features, binary profiles, and LLM embedding. Feature selection uses univariate analysis and Sci-kit library methods. Model development includes alignment-free and alignment-based methods, incorporating machine learning, deep learning, and reinforcement learning techniques. Hybrid models combine ML models with motifs. Validation is both internal and external. The final model, RAIPred, distinguishes RAI from non-RAI and is packaged for webserver development.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s3_2">
<label>2.2</label>
<title>Feature generation</title>
<p>Sequence-based features are numerical or categorical representations derived directly from the amino acid sequences of peptides or proteins. These features are essential for computational models, such as ML or DL models, to understand and make predictions about peptide properties, including immunogenicity, toxicity, antimicrobial activity, or disease association. In the present study, we generated relevant sequence-based features for both RA-inducing and non-inducing peptides. We calculated various composition-based features using the Pfeature software (<xref ref-type="bibr" rid="B30">30</xref>). The Pfeature software calculates about 9,189 features using amino acid sequences. We extracted features such as amino acid composition (AAC), dipeptide composition (DPC), distance distribution of residues (DDR), and many more (please refer to <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S1</bold>
</xref>).</p>
<p>In addition to composition-based features, we extended feature extraction to include binary profiles. To capture maximum information from the amino acid sequences, we used the Amino Acid Binary Profile (AABP) module of the Pfeature software (<xref ref-type="bibr" rid="B30">30</xref>). The generated features served as the basis for implementing ML-based prediction algorithms. Furthermore, we included embeddings from PLMs, specifically those generated using ProtBERT, developed by Rostlab (<xref ref-type="bibr" rid="B31">31</xref>). Each feature type carries its own significance and contributes uniquely to the overall prediction performance.</p>
</sec>
<sec id="s3_3">
<label>2.3</label>
<title>Preliminary analysis</title>
<sec id="s3_3_1">
<label>2.3.1</label>
<title>Positional analysis</title>
<p>We created a two-sample logo using the &#x201c;Two Sample Logo&#x201d; software to analyze the positional preferences of amino acid residues (<xref ref-type="bibr" rid="B32">32</xref>). This method requires input sequences of fixed length. Since the minimum peptide length in our dataset is nine residues, we extracted 9-mers from both the N-terminal and C-terminal of each peptide. These were then concatenated to form a fixed-length sequence of 18 amino acids for each peptide. To generate the two-sample logo plot, these 18-residue sequences from the positive and negative datasets were used as input, enabling the identification of amino acid enrichment or depletion at specific positions between the two classes.</p>
</sec>
<sec id="s3_3_2">
<label>2.3.2</label>
<title>Compositional analysis</title>
<p>To gain deeper insights into the differences in AAC between RA-inducing and non-inducing peptides, we performed compositional analysis on both the positive and negative datasets. For this, we utilized the Pfeature software to compute the AAC of each dataset (<xref ref-type="bibr" rid="B30">30</xref>). The AAC is calculated by Pfeature using the following formula:</p>
<disp-formula id="eq1">
<label>[1]</label>
<mml:math display="block" id="M1">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>A</mml:mi>
<mml:msub>
<mml:mi>C</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mi>R</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mi>L</mml:mi>
</mml:mfrac>
<mml:mo>&#x2217;</mml:mo>
<mml:mn>100</mml:mn>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Where AAC<sub>i</sub> is the amino acid composition of residue type i, R<sub>i</sub> is the number of residues of type i, and L is the length of the peptide sequence.</p>
</sec>
<sec id="s3_3_3">
<label>2.3.3</label>
<title>Mean-based univariate analysis</title>
<p>In this analysis, we calculated the absolute mean difference of each feature between the RA-inducing (positive) and non-inducing (negative) classes after normalizing the dataset. To assess the relevance of the generated features, we computed the mean difference for each feature across both groups. Subsequently, we applied an independent t-test to identify the top 5 features with statistically significant <italic>p</italic>-values distinguishing the two classes.</p>
</sec>
<sec id="s3_3_4">
<label>2.3.4</label>
<title>Logistic regression-based analysis</title>
<p>We also employed logistic regression (LR) as a single-feature statistical model to evaluate the relationship between each feature and the target label. We computed the area under the curve (AUC) for each feature to assess its individual relevance in classification.</p>
</sec>
</sec>
<sec id="s3_4">
<label>2.4</label>
<title>Alignment-based approach</title>
<sec id="s3_4_1">
<label>2.4.1</label>
<title>BLAST search</title>
<p>To annotate peptide sequences, we employed the well-known similarity search tool BLAST (<xref ref-type="bibr" rid="B33">33</xref>). Specifically, we used the &#x201c;blastp-short&#x201d; algorithm (BLAST+ v2.2.28) designed for short peptide sequences to predict RA-inducing and non-inducing peptides based on sequence similarity.</p>
</sec>
<sec id="s3_4_2">
<label>2.4.2</label>
<title>Motif search</title>
<p>Motifs are short amino acid patterns potentially associated with shared biological functions. This analysis helps identify signature patterns in RA-inducing and non-inducing peptides. These motifs may serve as targets for the development of drugs and therapeutic interventions. We employed the MERCI tool (<xref ref-type="bibr" rid="B34">34</xref>), implemented in Perl, to discover motifs exclusive to either the positive or negative dataset using both default and user-defined parameters.</p>
</sec>
</sec>
<sec id="s3_5">
<label>2.5</label>
<title>Alignment-free approach</title>
<sec id="s3_5_1">
<label>2.5.1</label>
<title>Machine learning models</title>
<p>We used the Scikit-learn Python library to implement various ML algorithms for classification. The classification algorithms comprised decision trees (DT), random forest (RF), multi-layer perceptron (MLP), eXtreme gradient boosting (XGBoost), support vector classifier with a radial basis function kernel (SVR), ExtraTreesClassifier (ET), LR, k-nearest neighbors (KNN), and Gaussian Na&#xef;ve Bayes (GNB). We employed a GridSearchCV approach to optimize hyperparameters, using AUC as the scoring metric.</p>
</sec>
<sec id="s3_5_2">
<label>2.5.2</label>
<title>Deep learning models</title>
<p>To process sequential data and capture local patterns in peptide sequences, we implemented a one-dimensional convolutional neural network (1D CNN) model for the DL technique. This model is particularly effective in recognizing sequence patterns and dependencies. Hyperparameters were tuned to maximize classification performance for each dataset.</p>
</sec>
</sec>
<sec id="s3_6">
<label>2.6</label>
<title>Feature selection</title>
<p>As not all features contribute equally to model performance, we applied two feature selection methods: minimum Redundancy&#x2014;Maximum Relevance (mRMR) and support vector classifier with L1 regularization (SVC-L1). Feature selection was applied to all composition-based features. The SVC-L1 method selected 34 features, whereas mRMR selected 50 features. We also selected top-performing features based on mean-based univariate and LR analyses.</p>
<p>In addition, we selected the most relevant features from the mean-based univariate analysis and the LR-based analysis. First, after computing the mean difference of each feature between the two groups, we selected 3,782 of the 9,189 features. Second, we applied an independent t-test to identify significant features among these 3,782 features using a <italic>p</italic>-value &#x2264; 0.05. Finally, we obtained 305 features with significant <italic>p</italic>-values, whose absolute mean differences ranged from 0.001 to 0.14. We then trained ML classifiers on the top 10, 20, 50, 100, 150, 200, 250, and 305 features. Similarly, we developed ML models on the top 10, 20, and 50 features obtained from the LR-based analysis.</p>
</sec>
<sec id="s3_7">
<label>2.7</label>
<title>Protein language models</title>
<p>Large language models (LLMs), such as PLMs, excel in tasks like peptide classification due to their contextual understanding. We utilized ProtBERT, a pre-trained PLM developed by RostLab, and fine-tuned it to classify RA-inducing and non-inducing peptides. After fine-tuning, the model predicted the class of each input sequence with improved accuracy.</p>
</sec>
<sec id="s3_8">
<label>2.8</label>
<title>Ensemble method</title>
<p>To develop a more robust classification system, we implemented two ensemble approaches. First, the BLAST-based approach was used to identify disease-causing peptides based on similarity hits, and then the ML approach was used for the prediction of those peptides that are not covered by the BLAST-based approach. Second, the motif-based approach was used to classify disease-causing peptides by identifying specific motifs, and then the ML approach was used for the prediction of those peptides not covered by the motif-based approach.</p>
</sec>
<sec id="s3_9">
<label>2.9</label>
<title>Model evaluation</title>
<p>To ensure generalizability and prevent overfitting, we followed standard ML practices, including fivefold cross-validation (<xref ref-type="bibr" rid="B35">35</xref>, <xref ref-type="bibr" rid="B36">36</xref>). The dataset was split in an 80:20 ratio, with 80% used for training and 20% reserved for external validation. Models were evaluated using both threshold-dependent and threshold-independent metrics:</p>
<disp-formula id="eq2">
<label>[2]</label>
<mml:math display="block" id="M2">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>v</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq3">
<label>[3]</label>
<mml:math display="block" id="M3">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mi>p</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq4">
<label>[4]</label>
<mml:math display="block" id="M4">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>y</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq5">
<label>[5]</label>
<mml:math display="block" id="M5">
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>S</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<disp-formula id="eq6">
<label>[6]</label>
<mml:math display="block" id="M6">
<mml:mrow>
<mml:mi>M</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>C</mml:mi>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>*</mml:mo>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>*</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
<mml:mrow>
<mml:msqrt>
<mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mo stretchy="false">(</mml:mo>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:mi>N</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>F</mml:mi>
<mml:mi>N</mml:mi>
</mml:mrow>
<mml:mo stretchy="false">)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:msqrt>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:math>
</disp-formula>
<p>Here, FP is false positive, FN is false negative, TP is true positive, and TN is true negative. AUC was used to assess the overall discriminatory power of the models, independent of classification thresholds.</p>
</sec>
</sec>
<sec id="s4" sec-type="results">
<label>3</label>
<title>Results</title>
<sec id="s4_1">
<label>3.1</label>
<title>Preliminary analysis</title>
<sec id="s4_1_1">
<label>3.1.1</label>
<title>Positional analysis</title>
<list list-type="simple">
<list-item>
<p>To determine the most significant positional preferences of amino acid residues within peptides, we employed &#x201c;Two Sample Logo&#x201d; for positional analysis. It is important to note that the first nine positions correspond to the N-terminal residues, while the last nine represent the C-terminal residues of the peptides. Our analysis revealed that glycine (G), glutamine (Q), and phenylalanine (F) are predominantly present at the N-terminal of RA-inducing (positive) peptides, whereas isoleucine (I), glycine (G), tyrosine (Y), and proline (P) are enriched at the C-terminal. In contrast, threonine (T) and isoleucine (I) were observed at the N-terminal, and alanine (A), leucine (L), arginine (R), and histidine (H) were commonly found at the C-terminal of non-inducing (negative) peptides. Notably, glutamic acid (E) was consistently prominent across almost all positions in the negative dataset (refer to <xref ref-type="fig" rid="f3">
<bold>Figure&#xa0;3</bold>
</xref>).</p>
</list-item>
</list>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Preference of residues at different positions in RA-inducing and non-inducing peptides.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-16-1630863-g003.tif">
<alt-text content-type="machine-generated">Two Sample Logo plot showing enriched and depleted amino acid residues at positions one to eighteen. Enriched residues G, Q, and Y appear at positions one to two, and eleven. Depleted residues E, T, A, L, R, and P appear at various positions with E being the most prominent.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4_1_2">
<label>3.1.2</label>
<title>Compositional analysis</title>
<p>We computed the AAC (using <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>) for both RA-inducing (positive) and RA non-inducing (negative) peptides. As shown in <xref ref-type="fig" rid="f4">
<bold>Figure&#xa0;4</bold>
</xref>, glycine (G), proline (P), and tyrosine (Y) exhibit the highest average composition in RA-inducing peptides, with statistically significant <italic>p</italic>-values compared to the negative dataset. In contrast, alanine (A), aspartic acid (D), glutamic acid (E), and leucine (L) are significantly more abundant in the RA non-inducing peptides.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The average amino acid composition among RA-inducing (i.e., positive) and RA non-inducing (i.e., negative) peptides.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fimmu-16-1630863-g004.tif">
<alt-text content-type="machine-generated">Bar chart showing average amino acid composition in positive (red) versus negative (green) sequences. Amino acids are listed along the x-axis, and composition percentage is on the y-axis. Values are noted above specific bars, such as Ala and Ser having higher compositions in positive sequences, while Asp and Pro are higher in negative sequences.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4_1_3">
<label>3.1.3</label>
<title>Mean-based univariate analysis</title>
<p>We finally selected 305 features based on their higher mean differences between the positive and negative datasets, along with statistically significant <italic>p</italic>-values. As shown in <xref ref-type="table" rid="T1">
<bold>Table&#xa0;1</bold>
</xref>, the composition-enhanced transition and distribution (CeTD) features such as CeTD_21_HB and CeTD_25_p_VW3 exhibited the highest mean differences of 0.084 and &#x2212;0.140, respectively. For the complete list of selected features and their rankings, please refer to <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S2</bold>
</xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Significantly preferred, or not preferred, features in RA-inducing peptides in terms of difference in mean values between RA-inducing and non-inducing peptides.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Feature</th>
<th valign="middle" colspan="2" align="center">Mean value</th>
<th valign="middle" rowspan="2" align="center">Difference in mean (Inducing - non-inducing)</th>
</tr>
<tr>
<th valign="middle" align="center">RA-inducing</th>
<th valign="middle" align="center">Non-inducing</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="middle" colspan="4" align="center">Features preferred in RA-inducing peptides</th>
</tr>
<tr>
<td valign="middle" align="left">CeTD_21_HB</td>
<td valign="middle" align="center">0.466</td>
<td valign="middle" align="center">0.382</td>
<td valign="middle" align="center">0.084</td>
</tr>
<tr>
<td valign="middle" align="left">SER_Y</td>
<td valign="middle" align="center">0.275</td>
<td valign="middle" align="center">0.196</td>
<td valign="middle" align="center">0.079</td>
</tr>
<tr>
<td valign="middle" align="left">PRI_NE</td>
<td valign="middle" align="center">0.340</td>
<td valign="middle" align="center">0.265</td>
<td valign="middle" align="center">0.075</td>
</tr>
<tr>
<td valign="middle" align="left">PRI_NE_pH</td>
<td valign="middle" align="center">0.340</td>
<td valign="middle" align="center">0.265</td>
<td valign="middle" align="center">0.075</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_11_SS</td>
<td valign="middle" align="center">0.373</td>
<td valign="middle" align="center">0.299</td>
<td valign="middle" align="center">0.074</td>
</tr>
<tr>
<th valign="middle" colspan="4" align="center">Features not preferred in RA-inducing peptides</th>
</tr>
<tr>
<td valign="middle" align="left">CeTD_25_p_VW3</td>
<td valign="middle" align="center">0.306</td>
<td valign="middle" align="center">0.445</td>
<td valign="middle" align="center">&#x2212;0.140</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_100_p_VW3</td>
<td valign="middle" align="center">0.320</td>
<td valign="middle" align="center">0.453</td>
<td valign="middle" align="center">&#x2212;0.133</td>
</tr>
<tr>
<td valign="middle" align="left">SER_E</td>
<td valign="middle" align="center">0.382</td>
<td valign="middle" align="center">0.514</td>
<td valign="middle" align="center">&#x2212;0.132</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_75_p_VW3</td>
<td valign="middle" align="center">0.340</td>
<td valign="middle" align="center">0.471</td>
<td valign="middle" align="center">&#x2212;0.131</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_50_p_VW3</td>
<td valign="middle" align="center">0.330</td>
<td valign="middle" align="center">0.458</td>
<td valign="middle" align="center">&#x2212;0.128</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_1_4">
<label>3.1.4</label>
<title>Logistic Regression-based analysis</title>
<p>To identify the best features based on their individual performance, we also applied an LR classifier on 9,189 features calculated using the Pfeature tool. We observed that the top AUC features belong to the Bond Composition (BTC) and CeTD feature categories. The features named BTC_T, BTC_S, and BTC_H achieved a maximum AUC of 0.69, while features CeTD_75_p_VW3, CeTD_100_p_VW3, and CeTD_50_p_VW3 achieved an AUC of 0.68. The top 10 features with their performance are shown in <xref ref-type="table" rid="T2">
<bold>Table&#xa0;2</bold>
</xref>, and detailed results are provided in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S3</bold>
</xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>The performance of single-feature-based LR models developed using the top ten features.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Feature name</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">Specificity</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">AUC</th>
<th valign="middle" align="center">Kappa</th>
<th valign="middle" align="center">MCC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">BTC_T</td>
<td valign="middle" align="center">65.64</td>
<td valign="middle" align="center">66.06</td>
<td valign="middle" align="center">65.79</td>
<td valign="middle" align="center">0.69</td>
<td valign="middle" align="center">0.30</td>
<td valign="middle" align="center">0.31</td>
</tr>
<tr>
<td valign="middle" align="left">BTC_S</td>
<td valign="middle" align="center">66.32</td>
<td valign="middle" align="center">67.88</td>
<td valign="middle" align="center">66.89</td>
<td valign="middle" align="center">0.69</td>
<td valign="middle" align="center">0.32</td>
<td valign="middle" align="center">0.33</td>
</tr>
<tr>
<td valign="middle" align="left">BTC_H</td>
<td valign="middle" align="center">65.98</td>
<td valign="middle" align="center">67.27</td>
<td valign="middle" align="center">66.45</td>
<td valign="middle" align="center">0.69</td>
<td valign="middle" align="center">0.31</td>
<td valign="middle" align="center">0.32</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_75_p_VW3</td>
<td valign="middle" align="center">56.70</td>
<td valign="middle" align="center">66.06</td>
<td valign="middle" align="center">60.09</td>
<td valign="middle" align="center">0.68</td>
<td valign="middle" align="center">0.21</td>
<td valign="middle" align="center">0.22</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_100_p_VW3</td>
<td valign="middle" align="center">56.70</td>
<td valign="middle" align="center">66.06</td>
<td valign="middle" align="center">60.09</td>
<td valign="middle" align="center">0.68</td>
<td valign="middle" align="center">0.21</td>
<td valign="middle" align="center">0.22</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_50_p_VW3</td>
<td valign="middle" align="center">68.73</td>
<td valign="middle" align="center">53.33</td>
<td valign="middle" align="center">63.16</td>
<td valign="middle" align="center">0.68</td>
<td valign="middle" align="center">0.22</td>
<td valign="middle" align="center">0.22</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_100_p_HB3</td>
<td valign="middle" align="center">71.13</td>
<td valign="middle" align="center">53.33</td>
<td valign="middle" align="center">64.69</td>
<td valign="middle" align="center">0.66</td>
<td valign="middle" align="center">0.24</td>
<td valign="middle" align="center">0.24</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_75_p_HB3</td>
<td valign="middle" align="center">71.13</td>
<td valign="middle" align="center">53.33</td>
<td valign="middle" align="center">64.69</td>
<td valign="middle" align="center">0.66</td>
<td valign="middle" align="center">0.24</td>
<td valign="middle" align="center">0.24</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_50_p_HB1</td>
<td valign="middle" align="center">62.89</td>
<td valign="middle" align="center">63.03</td>
<td valign="middle" align="center">62.94</td>
<td valign="middle" align="center">0.66</td>
<td valign="middle" align="center">0.24</td>
<td valign="middle" align="center">0.25</td>
</tr>
<tr>
<td valign="middle" align="left">CeTD_50_p_SS3</td>
<td valign="middle" align="center">62.89</td>
<td valign="middle" align="center">63.03</td>
<td valign="middle" align="center">62.94</td>
<td valign="middle" align="center">0.66</td>
<td valign="middle" align="center">0.24</td>
<td valign="middle" align="center">0.25</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s4_2">
<label>3.2</label>
<title>Alignment-based approach</title>
<p>We used both alignment-free (ML techniques) and alignment-based (motif &amp; BLAST) methods, as explained in previous sections. Each strategy has its own advantages and limitations. Alignment-based techniques generally have low sensitivity but high specificity, as their performance depends on the presence of motifs or sequence similarity. In contrast, alignment-free ML-based approaches are more generalizable and not dependent on sequence similarity. We developed ensemble or hybrid approaches that combine ML models with the BLAST- and Motif-based methods to leverage the strengths of both strategies.</p>
<sec id="s4_2_1">
<label>3.2.1</label>
<title>BLAST</title>
<p>In the BLAST-based approach, we first prepared a BLAST-formatted database using the training dataset. Then, we searched the query sequences (from the validation dataset) against the training database to find hits at various e-values ranging from 1e-5 to 1e+3. A query sequence was categorized as positive if the top hit was positive and negative if the top hit was negative. The detailed BLAST results on the validation data are shown in <xref ref-type="table" rid="T3">
<bold>Table&#xa0;3</bold>
</xref>.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>The performance of the BLAST method in terms of coverage of RA-inducing and non-inducing peptides at different e-values.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" colspan="6" align="center">BLAST + CeTD (validation dataset)</th>
</tr>
<tr>
<th valign="middle" align="center">E-value</th>
<th valign="middle" align="center">Number of hits</th>
<th valign="middle" align="center">RA-inducers</th>
<th valign="middle" align="center">RA non-inducers</th>
<th valign="middle" align="center">Correct pos</th>
<th valign="middle" align="center">Correct neg</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="right">1.00E+10</td>
<td valign="middle" align="center">92</td>
<td valign="middle" align="center">46</td>
<td valign="middle" align="center">46</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">23</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E+03</td>
<td valign="middle" align="center">92</td>
<td valign="middle" align="center">46</td>
<td valign="middle" align="center">46</td>
<td valign="middle" align="center">10</td>
<td valign="middle" align="center">23</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E+02</td>
<td valign="middle" align="center">87</td>
<td valign="middle" align="center">44</td>
<td valign="middle" align="center">43</td>
<td valign="middle" align="center">9</td>
<td valign="middle" align="center">20</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E+01</td>
<td valign="middle" align="center">66</td>
<td valign="middle" align="center">32</td>
<td valign="middle" align="center">34</td>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">13</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E+00</td>
<td valign="middle" align="center">53</td>
<td valign="middle" align="center">28</td>
<td valign="middle" align="center">25</td>
<td valign="middle" align="center">6</td>
<td valign="middle" align="center">12</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E-01</td>
<td valign="middle" align="center">47</td>
<td valign="middle" align="center">25</td>
<td valign="middle" align="center">22</td>
<td valign="middle" align="center">20</td>
<td valign="middle" align="center">10</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E-02</td>
<td valign="middle" align="center">39</td>
<td valign="middle" align="center">21</td>
<td valign="middle" align="center">18</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">10</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E-03</td>
<td valign="middle" align="center">30</td>
<td valign="middle" align="center">16</td>
<td valign="middle" align="center">14</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">9</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E-04</td>
<td valign="middle" align="center">20</td>
<td valign="middle" align="center">12</td>
<td valign="middle" align="center">8</td>
<td valign="middle" align="center">4</td>
<td valign="middle" align="center">6</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E-05</td>
<td valign="middle" align="center">14</td>
<td valign="middle" align="center">7</td>
<td valign="middle" align="center">7</td>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">5</td>
</tr>
<tr>
<td valign="middle" align="right">1.00E-08</td>
<td valign="middle" align="center">2</td>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">1</td>
<td valign="middle" align="center">0</td>
<td valign="middle" align="center">1</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Correct pos, correct positive; Correct neg, correct negative.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Next, we combined the predicted labels obtained using CeTD features with BLAST scores to improve the performance of our XGB models. We attained a maximum AUC of 0.77 on the validation dataset at an e-value of 1.00E+01. However, such a high e-value could indicate a random chance of getting hits. The complete result table is provided in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S4</bold>
</xref>. To develop a more robust model, we further explored a motif-based approach.</p>
</sec>
<sec id="s4_2_2">
<label>3.2.2</label>
<title>Motif</title>
<p>In the motif-based approach, we identified K exclusive motifs using the MERCI tool. MERCI provides different parameters to generate specific motifs based on positive and negative datasets. We calculated exclusive motifs using the &#x201c;None,&#x201d; KOOLMAN-ROHM, and BETTS-RUSSELL classification methods, assigning a score of +0.5 if the motif was found in a positive sequence and 0 if no match was found. We then combined the predicted labels from the best model with motif scores. The best performing motifs were obtained using the BETTS-RUSSELL classification method. The detailed list of motifs with their occurrence in the RA-inducing dataset is provided in <xref ref-type="table" rid="T4">
<bold>Table&#xa0;4</bold>
</xref>.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>List of highly abundant motifs in RA-inducing peptides.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Motif</th>
<th valign="middle" align="center">Coverage in RA inducers</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">tiny polar hydrophobic G hydrophobic</td>
<td valign="middle" align="center">22</td>
</tr>
<tr>
<td valign="middle" align="left">hydrophobic L hydrophobic aliphatic small</td>
<td valign="middle" align="center">20</td>
</tr>
<tr>
<td valign="middle" align="left">hydrophobic polar hydrophobic tiny polar hydrophobic hydrophobic polar</td>
<td valign="middle" align="center">19</td>
</tr>
<tr>
<td valign="middle" align="left">aliphatic aliphatic hydrophobic polar polar aliphatic</td>
<td valign="middle" align="center">19</td>
</tr>
<tr>
<td valign="middle" align="left">small S hydrophobic G</td>
<td valign="middle" align="center">18</td>
</tr>
<tr>
<td valign="middle" align="left">hydrophobic polar A G hydrophobic</td>
<td valign="middle" align="center">17</td>
</tr>
<tr>
<td valign="middle" align="left">G small small G small</td>
<td valign="middle" align="center">17</td>
</tr>
<tr>
<td valign="middle" align="left">P hydrophobic polar polar hydrophobic</td>
<td valign="middle" align="center">17</td>
</tr>
<tr>
<td valign="middle" align="left">tiny hydrophobic S hydrophobic hydrophobic hydrophobic</td>
<td valign="middle" align="center">16</td>
</tr>
<tr>
<td valign="middle" align="left">tiny S hydrophobic G</td>
<td valign="middle" align="center">15</td>
</tr>
<tr>
<td valign="middle" align="left">tiny S tiny tiny</td>
<td valign="middle" align="center">15</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Polar, H,K,R,D,E,Y,W,T,C,S,N,Q; charged, D,E,R,H,K; negative, D,E; positive, R,H,K; small, A,G,C,S,P,N,D,T,V; tiny, A,G,C,S; hydrophobic, H,F,W,Y,I,L,V,M,K,T,A,G,C; aromatic, H,F,W,Y; aliphatic, I,L,V.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s4_3">
<label>3.3</label>
<title>Alignment-free approach</title>
<p>In this study, multiple classifiers were employed to differentiate between RA-inducing and non-inducing peptides. The predictive performance of these models was systematically assessed using standard evaluation metrics, with the corresponding calculations derived from the formulas provided in <xref ref-type="disp-formula" rid="eq2">Equations 2</xref>&#x2013;<xref ref-type="disp-formula" rid="eq6">6</xref>. This ensured a comprehensive and rigorous assessment of classification accuracy, reliability, and generalizability.</p>
<sec id="s4_3_1">
<label>3.3.1</label>
<title>Machine learning-based analysis</title>
<p>We applied multiple ML-based classifiers to composition-based features and AABP features. Results are shown in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S5</bold>
</xref>. Our findings demonstrate that among the various composition-based features, the CeTD features performed exceptionally well. Using CeTD-based features, we achieved a maximum accuracy and AUC of 71% and 0.75 on the training dataset and 66.30% and 0.75 on the validation dataset, with balanced sensitivity and specificity using the XGB classifier. <xref ref-type="table" rid="T5">
<bold>Table&#xa0;5</bold>
</xref> shows the performance of the best model across all composition and binary profile-based features on the validation dataset.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>The performance of the ML models using the best set of features on the validation dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Feature name</th>
<th valign="middle" align="center">ML model</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">Specificity</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">AUC</th>
<th valign="middle" align="center">Kappa</th>
<th valign="middle" align="center">MCC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>CeTD</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>XGB</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.02</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>75.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.30</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.75</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.33</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.35</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>TPC</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ET</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>62.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>65.22</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.74</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.30</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.31</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ALLCOMP</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>LR</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>57.63</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>75.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.30</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.32</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>APAAC</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ET</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.02</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>63.64</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.96</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.72</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.23</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.24</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>DPC</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>KNN</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.02</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.30</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>AAC</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ET</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.32</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>60.61</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.78</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.19</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.19</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>BTC</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>GNB</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>50.85</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>75.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.78</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.69</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.23</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.26</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>DDR</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ET</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>52.54</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>75.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>60.87</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.25</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>CTC</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>SVC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>62.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.65</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.27</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>PRI</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>LR</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.93</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>60.87</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.63</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.23</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.25</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>AABP</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>KNN</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.93</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>51.52</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>54.35</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.59</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.07</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.07</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>XGB, extreme gradient boosting; ET, extra tree; LR, logistic regression; KNN, k-nearest neighbors; GNB, Gaussian Na&#xef;ve Bayes; SVC, support vector classifier; AUC, area under curve; kappa, Cohen&#x2019;s kappa coefficient; MCC, Matthews correlation coefficient.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_3_2">
<label>3.3.2</label>
<title>Deep learning-based analysis</title>
<p>We applied a 1D-CNN on various composition-based features as well as on amino acid binary profile-based features. The 1D-CNN performed well on tri-peptide composition features, achieving an AUC of 0.69 on the validation dataset. Detailed results of the 1D-CNN model on different features are provided in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S6</bold>
</xref>. Performance summary is shown in <xref ref-type="table" rid="T6">
<bold>Table&#xa0;6</bold>
</xref>.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>The performance of the 1D-CNN model over different types of features on the validation dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Feature name</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">Specificity</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">AUC</th>
<th valign="middle" align="center">Kappa</th>
<th valign="middle" align="center">MCC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>AAC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>88.14</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>21.21</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.60</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.11</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.13</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>DPC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>50.85</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>63.64</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.44</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.14</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>TPC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.93</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>72.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.96</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.69</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.26</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>BTC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.00</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>100.00</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>35.87</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.38</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.00</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.00</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>DDR</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.93</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>45.46</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>52.17</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.54</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.01</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.01</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>CTC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>52.54</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>57.61</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.60</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.17</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.19</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>PRI</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>45.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>54.35</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.64</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.14</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.15</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>CeTD</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>71.19</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>48.49</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>63.04</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.68</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.20</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.20</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>APAAC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.32</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>51.52</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>56.52</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.65</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.10</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.11</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>AAB</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.93</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>54.55</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.44</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.60</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.10</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.10</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ALLCOMP</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>57.63</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.96</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.72</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.25</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.26</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>AUC, area under curve; kappa, Cohen&#x2019;s kappa coefficient; MCC, Matthews correlation coefficient.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_3_3">
<label>3.3.3</label>
<title>Feature selection techniques</title>
<p>To select the most relevant features, we applied two feature selection techniques&#x2014;SVC-L1 and mRMR. Using SVC-L1, we selected 34 composition-based features. As shown in <xref ref-type="table" rid="T7">
<bold>Table&#xa0;7</bold>
</xref>, we achieved a maximum AUC of 0.72 on the validation dataset using the SVC.</p>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>The performance of the ML models developed using SVC-L1&#x2013;selected features on the validation dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model</th>
<th valign="middle" align="left">Sensitivity</th>
<th valign="middle" align="left">Specificity</th>
<th valign="middle" align="left">Accuracy</th>
<th valign="middle" align="left">AUC</th>
<th valign="middle" align="left">Kappa</th>
<th valign="middle" align="left">MCC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>DT</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>62.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>48.49</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>57.61</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.58</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.11</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.11</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>RF</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.32</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>72.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.29</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.31</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>LR</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.32</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.96</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.24</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.25</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>XGB</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.10</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>67.39</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.69</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.34</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.34</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>KNN</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.02</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.30</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>GNB</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>62.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.68</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.27</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ET</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.49</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>54.55</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.69</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.24</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.24</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>SVC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>62.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>75.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>67.39</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.72</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.35</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.37</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>MLP</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>62.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.27</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>DT, decision tree; RF, random forest; LR, logistic regression; XGB, extreme gradient boosting; KNN, k-nearest neighbors; GNB, Gaussian Na&#xef;ve Bayes; ET, extra tree; SVC, support vector classifier; MLP, multilayer perceptron; AUC, area under curve; kappa, Cohen&#x2019;s kappa coefficient; MCC, Matthews correlation coefficient.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>We also applied the mRMR technique, which selected 50 composition-based features and achieved a maximum AUC of 0.73 using the RF classifier. These results are presented in <xref ref-type="table" rid="T8">
<bold>Table&#xa0;8</bold>
</xref>.</p>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>The performance of the ML models developed using mRMR-selected features on the validation dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">Specificity</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">AUC</th>
<th valign="middle" align="center">Kappa</th>
<th valign="middle" align="center">MCC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>DT</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.93</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.78</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.65</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.21</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.22</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>RF</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.32</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>63.04</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.27</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>LR</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.10</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>75.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.57</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.72</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.39</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.40</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>XGB</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>55.93</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>75.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>63.04</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.31</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>KNN</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>57.63</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>72.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>63.04</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.69</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.27</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.29</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>GNB</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>79.66</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>39.39</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>65.22</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.60</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.20</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.21</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ET</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>52.54</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>72.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.78</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.74</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.22</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.24</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>SVC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.10</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>67.39</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.71</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.34</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.34</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>MLP</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.41</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>65.22</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.29</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.30</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>DT, decision tree; RF, random forest; LR, logistic regression; XGB, extreme gradient boosting; KNN, k-nearest neighbors; GNB, Gaussian Na&#xef;ve Bayes; ET, extra tree; SVC, support vector classifier; MLP, multilayer perceptron; AUC, area under curve; kappa, Cohen&#x2019;s kappa coefficient; MCC, Matthews correlation coefficient.</p>
</fn>
</table-wrap-foot>
</table-wrap>
<p>Additionally, we implemented mean-based univariate analysis and LR-based feature selection. We applied ML techniques to 305 features, as well as to the top 200, 150, 100, 50, 20, and 10 selected features. Using this approach, we achieved the highest AUC of 0.73 for the top 100, 150, and 200 features. Results are presented in <xref ref-type="table" rid="T9">
<bold>Table&#xa0;9</bold>
</xref>. For detailed results, refer to <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S7</bold>
</xref>. Performance of various ML algorithms on the top 10, 20, and 50 features selected through LR is provided in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S8</bold>
</xref>.</p>
<table-wrap id="T9" position="float">
<label>Table&#xa0;9</label>
<caption>
<p>The performance of the best ML models developed using different sets of top features on the validation dataset. Features were selected using mean-based univariate analysis.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Total feature</th>
<th valign="middle" align="center">ML model</th>
<th valign="middle" align="center">Sensitivity</th>
<th valign="middle" align="center">Specificity</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">AUC</th>
<th valign="middle" align="center">Kappa</th>
<th valign="middle" align="center">MCC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>Top 305</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>RF</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>57.63</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>60.87</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.72</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.22</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.23</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>Top 200</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>RF</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.10</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>67.39</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.34</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.34</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>Top 150</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>RF</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.32</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>72.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>64.13</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.29</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.31</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>Top 100</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ET</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>67.80</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>67.39</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.73</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.33</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.33</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>Top 50</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>ET</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>61.02</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>66.67</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>63.04</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.26</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.27</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>Top 20</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>SVC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>50.85</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>75.76</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.78</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.23</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.26</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>Top 10</p>
</list-item>
</list>
</td>
<td valign="middle" align="left">
<list list-type="simple">
<list-item>
<p>SVC</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>59.32</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>69.70</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>63.04</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.69</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.27</p>
</list-item>
</list>
</td>
<td valign="middle" align="center">
<list list-type="simple">
<list-item>
<p>0.28</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>RF, random forest; ET, extra tree; SVC, support vector classifier.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s4_4">
<label>3.4</label>
<title>Protein language model-based analysis</title>
<p>We used two pre-trained large language models&#x2014;ProtBERT and BioBERT&#x2014;for this study. Each model was fine-tuned by changing the number of epochs, which allowed the model to learn and update its parameters depending on the sequences it processed. An epoch is defined as one complete pass through the whole training dataset. The best results were obtained at epoch 10 for ProtBERT and epoch 3 for BioBERT, achieving maximum AUCs of 0.71 and 0.67, respectively, on the fine-tuned models. Results are shown in <xref ref-type="table" rid="T10">
<bold>Table&#xa0;10</bold>
</xref>.</p>
<table-wrap id="T10" position="float">
<label>Table&#xa0;10</label>
<caption>
<p>Best performance of PLM models on the validation dataset; models were fine-tuned on the training dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Model name</th>
<th valign="middle" align="left">Sensitivity</th>
<th valign="middle" align="left">Specificity</th>
<th valign="middle" align="left">Accuracy</th>
<th valign="middle" align="left">AUC</th>
<th valign="middle" align="left">MCC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">ProtBERT</td>
<td valign="middle" align="center">0.81</td>
<td valign="middle" align="center">0.58</td>
<td valign="middle" align="center">0.73</td>
<td valign="middle" align="center">0.71</td>
<td valign="middle" align="center">0.40</td>
</tr>
<tr>
<td valign="middle" align="left">BioBERT</td>
<td valign="middle" align="center">0.69</td>
<td valign="middle" align="center">0.58</td>
<td valign="middle" align="center">0.65</td>
<td valign="middle" align="center">0.67</td>
<td valign="middle" align="center">0.26</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Furthermore, we extracted embeddings from the fine-tuned models and applied various ML algorithms. However, these models did not perform well on our dataset. Results for PLM-based models are presented in <xref ref-type="supplementary-material" rid="SM1">
<bold>Supplementary Table S9</bold>
</xref>.</p>
</sec>
<sec id="s4_5">
<label>3.5</label>
<title>Ensemble model</title>
<p>An ensemble model was developed by combining the best-performing ML model (based on CeTD features) with the motif-based approach. BETTS-RUSSELL classification in MERCI (parameters: fp = 2, fn = 0, g = 0, k = 20) identified the best motifs that covered the maximum validation dataset. We combined these motif scores with the best ML classifier. As shown in <xref ref-type="table" rid="T11">
<bold>Table&#xa0;11</bold>
</xref>, this ensemble model achieved the highest AUC of 0.80 and MCC of 0.45 on the validation dataset. These exclusive motifs help in identifying specific regions in proteins that may cause RA. This ensemble model is also implemented in the prediction module of the RAIpred web server for ease of access.</p>
<table-wrap id="T11" position="float">
<label>Table&#xa0;11</label>
<caption>
<p>The performance of the hybrid model developed using exclusive positive motifs (with MERCI classification &#x2013; None, KOOLMAN-ROHM and BETTS-RUSSELL).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Classification method</th>
<th valign="middle" align="left">Requested number of motifs</th>
<th valign="middle" align="left">Sensitivity</th>
<th valign="middle" align="left">Specificity</th>
<th valign="middle" align="left">Accuracy</th>
<th valign="middle" align="left">AUC</th>
<th valign="middle" align="left">Kappa</th>
<th valign="middle" align="left">MCC</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">None</td>
<td valign="middle" align="left">K20</td>
<td valign="middle" align="center">67.80</td>
<td valign="middle" align="center">66.67</td>
<td valign="middle" align="center">67.39</td>
<td valign="middle" align="center">0.70</td>
<td valign="middle" align="center">0.33</td>
<td valign="middle" align="center">0.33</td>
</tr>
<tr>
<td valign="middle" align="left">KOOLMAN-ROHM</td>
<td valign="middle" align="left">K20</td>
<td valign="middle" align="center">71.19</td>
<td valign="middle" align="center">66.67</td>
<td valign="middle" align="center">69.57</td>
<td valign="middle" align="center">0.75</td>
<td valign="middle" align="center">0.36</td>
<td valign="middle" align="center">0.37</td>
</tr>
<tr>
<td valign="middle" align="left">BETTS-RUSSELL</td>
<td valign="middle" align="left">K20</td>
<td valign="middle" align="center">71.19</td>
<td valign="middle" align="center">75.76</td>
<td valign="middle" align="center">72.83</td>
<td valign="middle" align="center">0.80</td>
<td valign="middle" align="center">0.44</td>
<td valign="middle" align="center">0.45</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>K, number of requested motifs; AUC, area under curve; kappa, Cohen&#x2019;s kappa coefficient; MCC, Matthews correlation coefficient.</p>
</fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s4_6">
<label>3.6</label>
<title>Web server design</title>
<p>We developed &#x201c;RAIpred,&#x201d; available at <ext-link ext-link-type="uri" xlink:href="https://webs.iiitd.edu.in/raghava/raipred/">https://webs.iiitd.edu.in/raghava/raipred/</ext-link>, to provide a user-friendly web interface for predicting HLA class II binding RA-inducing or non-inducing peptides only. The platform includes &#x201c;Prediction,&#x201d; &#x201c;Design,&#x201d; &#x201c;Protein Scan,&#x201d; and &#x201c;Motif Scan&#x201d; modules. The Protein Scan module identifies RA-inducing regions in a given protein sequence. The Design module enables generation of all possible analogs of a given peptide and evaluates their RA-inducing potential. The Motif Scan module uses MERCI to map RA-inducing motifs in the query sequence. The platform is responsive and accessible on desktops, laptops, and smartphones.</p>
<p>We also developed a standalone Python tool called &#x201c;RAIpred&#x201d; to assist in identifying potentially disease-causing regions in peptides or proteins. The tool can be downloaded from the web server&#x2019;s download page. The web server is powered by HTML5, Java, CSS3, and PHP and supports a variety of devices, including desktop, tablet, mobile, and iMac.</p>
</sec>
</sec>
<sec id="s5" sec-type="discussion">
<label>4</label>
<title>Discussion</title>
<p>RA is caused by gradual loss of self-tolerance in genetically vulnerable individuals due to various environmental stressors. Both genetic and environmental factors are significantly responsible for the onset of the disease. The ability of peptides or proteins to selectively bind arthritogenic amino acid sequences for presentation to auto-reactive T lymphocytes is provided by a common epitope in the peptide-binding groove area of MHC class II molecules (<xref ref-type="bibr" rid="B37">37</xref>). These T cells generate inflammatory responses by releasing excessive amounts of cytokines and by activating B cells, which are further responsible for the excessive production of autoantibodies and lead to destruction and bone erosion. As these arthritogenic peptides are responsible for inducing T-cell response, they can be targeted for therapeutic purposes by modifying their properties using peptide cyclization, chemical modification, or other <italic>in-vitro</italic> approaches.</p>
<p>A few ML-based techniques have been developed in the past to combat RA. One of these techniques was developed to forecast how well biologic drugs will work in treating patients with RA and AS (ankylosing spondylitis), with an AUC of 0.64 based on the validation dataset. Furthermore, it demonstrates that the most significant predictors of therapy responses were patient self-reporting scales, the Bath Ankylosing Spondylitis Functional Index (BASFI) in AS patients, and the patient global assessment of disease activity (PtGA) in RA patients (<xref ref-type="bibr" rid="B38">38</xref>). Another study uses ML to predict patient relapses based on blood test results and ultrasound examination data (<xref ref-type="bibr" rid="B39">39</xref>). Prasad et&#xa0;al. developed ATRPred, an ML-based technique that uses clinical and demographic characteristics to predict how well RA patients will respond to anti-TNF treatment (<xref ref-type="bibr" rid="B40">40</xref>). One study uses genetic information from SNPs in non-HLA genes to predict RA (<xref ref-type="bibr" rid="B41">41</xref>). To the best of our knowledge, no method has yet been developed to anticipate the T-helper cell-inducing peptides or epitopes that trigger RA.</p>
<p>In this study, we have taken a systematic approach for the classification of RA-inducing peptides. We have extracted 291 RA-inducing peptides and 165 RA non-inducing peptides from the IEDB. In the preceding sections, we observed a few key insights from the comprehensive analysis of RA-inducing and non-inducing peptides. Here, compositional analysis indicates that RA-inducing peptides have the highest average composition of glycine and proline as compared to non-inducing peptides, which might be responsible for peptide binding to MHC (<xref ref-type="bibr" rid="B42">42</xref>). In addition to this, positional analysis further marks distinct amino acid preferences for the N-terminal and C-terminal regions in positive and negative datasets, emphasizing the differential roles of residues such as glycine, threonine, and alanine.</p>
<p>In classification modelling, both alignment-based (BLAST and Motif) and alignment-free methods were implemented. The BLAST-based approach demonstrated slightly increased performance at higher e-values, which reflect the chance of obtaining random hits, while the Motif-based approach gave the highest number of correct hits for the validation dataset. In the present study, among different composition-based features, we have observed that CeTD composition-based features outperformed all others. We have reported maximum accuracy and AUC of 71% &amp; 0.75 over the training dataset and 66.30% &amp; 0.75 over the validation dataset, with balanced sensitivity and specificity, by applying the XGB classifier. This result underscores that CeTD features capture physicochemical peptide properties for our dataset, which is critical for accurate predictions. Finally, we have combined motif analysis and an ML-based methodology to develop an ensemble method. On the validation dataset, the ensemble-based method achieved a maximum AUC of 0.80 and an MCC of 0.45. All the performance metrics were calculated using <xref ref-type="disp-formula" rid="eq2">Equations 2</xref>&#x2013;<xref ref-type="disp-formula" rid="eq6">6</xref>. The integration of compositional insights and ML algorithms enabled the development of a robust tool, &#x201c;RAIpred,&#x201d; for HLA class II binding RA-inducing peptide prediction. 
In order to provide the scientific community an easy and user-friendly approach for the prediction of RA-inducing peptides, we have developed RAIpred as a web server (<ext-link ext-link-type="uri" xlink:href="https://webs.iiitd.edu.in/raghava/raipred/">https://webs.iiitd.edu.in/raghava/raipred/</ext-link>) and a standalone package (<ext-link ext-link-type="uri" xlink:href="https://webs.iiitd.edu.in/raghava/raipred/download.html">https://webs.iiitd.edu.in/raghava/raipred/download.html</ext-link>); it is also available on GitHub (<ext-link ext-link-type="uri" xlink:href="https://github.com/raghava/raipred">https://github.com/raghava/raipred</ext-link>) and as a Python package (<ext-link ext-link-type="uri" xlink:href="https://pypi.org/project/raipred/">https://pypi.org/project/raipred/</ext-link>).</p>
<sec id="s5_1">
<label>4.1</label>
<title>Applications of RAIpred</title>
<list list-type="bullet">
<list-item>
<p>Assess the HLA class II binding RA-inducing potential of novel peptides/proteins before therapeutic or GMO applications.</p>
</list-item>
<list-item>
<p>Design therapeutic peptides with optimized physicochemical properties and screen them as HLA class II binding RA-inducers or non-inducers.</p>
</list-item>
<list-item>
<p>Map antigenic epitopes responsible for RA using the Protein Scan module.</p>
</list-item>
<list-item>
<p>Design peptide analogs with single-residue modifications using the Design Module to evaluate HLA class II binding RA-inducing potential.</p>
</list-item>
</list>
</sec>
</sec>
<sec id="s6" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>Peptide-based therapeutics are increasingly popular due to their target specificity and clinical success. Risk assessment of these proteins is essential for preventing them from causing some severe side-effects or being involved in disease development. There are several peptide-based drugs approved by the FDA for the treatment of RA and other autoimmune disorders. RAIpred is a reliable and accurate tool for identifying HLA class II binding RA-inducing peptides, aiding in the development of targeted therapeutics. It also provides deep insights into peptide functionality and helps discover novel bioactive peptides with pharmaceutical relevance.</p>
</sec>
</body>
<back>
<sec id="s7" sec-type="data-availability">
<title>Data availability statement</title>
<p>The dataset used in this study is publicly available for download at <uri xlink:href="https://webs.iiitd.edu.in/raghava/raipred/download.html">https://webs.iiitd.edu.in/raghava/raipred/download.html</uri>. Further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s8" sec-type="author-contributions">
<title>Author contributions</title>
<p>RT: Formal analysis, Writing &#x2013; original draft, Visualization, Validation, Investigation, Methodology, Data curation, Writing &#x2013; review &amp; editing. SJ: Writing &#x2013; original draft, Writing &#x2013; review &amp; editing, Visualization, Validation. PG: Writing &#x2013; review &amp; editing, Visualization. NB: Visualization, Writing &#x2013; review &amp; editing. GR: Validation, Data curation, Formal analysis, Visualization, Methodology, Software, Funding acquisition, Supervision, Investigation, Writing &#x2013; review &amp; editing, Conceptualization, Resources, Writing &#x2013; original draft, Project administration.</p>
</sec>
<sec id="s9" sec-type="funding-information">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research and/or publication of this article. The current work has been supported by the Department of Biotechnology (DBT) grant BT/PR40158/BTIS/137/24/2021.</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>Authors are thankful to the Department of Science and Technology (DST-INSPIRE), University Grants Commission (UGC), Department of Bio-Technology (DBT), and Council of Scientific &amp; Industrial Research (CSIR) for fellowships and financial support, and the Department of Computational Biology, IIITD New Delhi for infrastructure and facilities. We would like to acknowledge that figures were created using BioRender.com. The preprint is available at bioRxiv at <ext-link ext-link-type="uri" xlink:href="https://doi.org/10.1101/2025.03.19.644081">https://doi.org/10.1101/2025.03.19.644081</ext-link>.</p>
</ack>
<sec id="s10" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s11" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declare that no Generative AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec id="s12" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s13" sec-type="supplementary-material">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fimmu.2025.1630863/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fimmu.2025.1630863/full#supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="Table1.xlsx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"/>
<supplementary-material xlink:href="Image1.png" id="SF1" mimetype="image/png"/>
</sec>
<fn-group>
<title>Abbreviations</title>
<fn fn-type="abbr" id="abbrev1">
<p>RA, Rheumatoid arthritis; IEDB, Immune epitope database; HLA, Human leukocyte antigen; XGBoost, extreme gradient boosting classifier; ProtBERT, Protein BERT; DMARDs, Disease-modifying anti-rheumatic drugs; NSAIDs, Non-steroidal anti-inflammatory drugs; GMO, Genetically modified organisms.</p>
</fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>J</given-names>
</name>
<name>
<surname>Guo</surname> <given-names>S</given-names>
</name>
<name>
<surname>Schrodi</surname> <given-names>SJ</given-names>
</name>
<name>
<surname>He</surname> <given-names>D</given-names>
</name>
</person-group>. <article-title>Molecular and cellular heterogeneity in rheumatoid arthritis: mechanisms and clinical implications</article-title>. <source>Front Immunol</source>. (<year>2021</year>) <volume>12</volume>:<elocation-id>790122</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fimmu.2021.790122</pub-id>, PMID: <pub-id pub-id-type="pmid">34899757</pub-id></citation></ref>
<ref id="B2">
<label>2</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Hu</surname> <given-names>W</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>R</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>Q</given-names>
</name>
<name>
<surname>Zhu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M</given-names>
</name>
<etal/>
</person-group>. <article-title>Signaling pathways in rheumatoid arthritis: implications for targeted therapy</article-title>. <source>Signal Transduct Target Ther</source>. (<year>2023</year>) <volume>8</volume>:<fpage>68</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41392-023-01331-9</pub-id>, PMID: <pub-id pub-id-type="pmid">36797236</pub-id></citation></ref>
<ref id="B3">
<label>3</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wasserman</surname> <given-names>AM</given-names>
</name>
</person-group>. <article-title>Diagnosis and management of rheumatoid arthritis</article-title>. <source>Am Fam Physician</source>. (<year>2011</year>) <volume>84</volume>:<page-range>1245&#x2013;52</page-range>.</citation></ref>
<ref id="B4">
<label>4</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Smolen</surname> <given-names>JS</given-names>
</name>
<name>
<surname>Aletaha</surname> <given-names>D</given-names>
</name>
<name>
<surname>McInnes</surname> <given-names>IB</given-names>
</name>
</person-group>. <article-title>Rheumatoid arthritis</article-title>. <source>Lancet (London England)</source>. (<year>2016</year>) <volume>388</volume>:<page-range>2023&#x2013;38</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0140-6736(16)30173-8</pub-id>, PMID: <pub-id pub-id-type="pmid">27156434</pub-id></citation></ref>
<ref id="B5">
<label>5</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gibofsky</surname> <given-names>A</given-names>
</name>
</person-group>. <article-title>Epidemiology, pathophysiology, and diagnosis of rheumatoid arthritis: A Synopsis</article-title>. <source>Am J Manag Care</source>. (<year>2014</year>) <volume>20</volume>:<page-range>S128&#x2013;35</page-range>., PMID: <pub-id pub-id-type="pmid">25180621</pub-id></citation></ref>
<ref id="B6">
<label>6</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname> <given-names>J</given-names>
</name>
<name>
<surname>Fu</surname> <given-names>X</given-names>
</name>
<name>
<surname>Chen</surname> <given-names>X</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Huang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Liang</surname> <given-names>C</given-names>
</name>
</person-group>. <article-title>Promising therapeutic targets for treatment of rheumatoid arthritis</article-title>. <source>Front Immunol</source>. (<year>2021</year>) <volume>12</volume>:<elocation-id>686155</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fimmu.2021.686155</pub-id>, PMID: <pub-id pub-id-type="pmid">34305919</pub-id></citation></ref>
<ref id="B7">
<label>7</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ngian</surname> <given-names>G-S</given-names>
</name>
</person-group>. <article-title>Rheumatoid arthritis</article-title>. <source>Aust Fam Physician</source>. (<year>2010</year>) <volume>39</volume>:<page-range>626&#x2013;8</page-range>.</citation></ref>
<ref id="B8">
<label>8</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>D</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Li</surname> <given-names>T</given-names>
</name>
<name>
<surname>Zhao</surname> <given-names>X</given-names>
</name>
<name>
<surname>Lv</surname> <given-names>T</given-names>
</name>
<name>
<surname>Fang</surname> <given-names>G</given-names>
</name>
<etal/>
</person-group>. <article-title>Systemic complications of rheumatoid arthritis: Focus on pathogenesis and treatment</article-title>. <source>Front Immunol</source>. (<year>2022</year>) <volume>13</volume>:<elocation-id>1051082</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fimmu.2022.1051082</pub-id>, PMID: <pub-id pub-id-type="pmid">36618407</pub-id></citation></ref>
<ref id="B9">
<label>9</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>More</surname> <given-names>NE</given-names>
</name>
<name>
<surname>Mandlik</surname> <given-names>R</given-names>
</name>
<name>
<surname>Zine</surname> <given-names>S</given-names>
</name>
<name>
<surname>Gawali</surname> <given-names>VS</given-names>
</name>
<name>
<surname>Godad</surname> <given-names>AP</given-names>
</name>
</person-group>. <article-title>Exploring the therapeutic opportunities of potassium channels for the treatment of rheumatoid arthritis</article-title>. <source>Front Pharmacol</source>. (<year>2024</year>) <volume>15</volume>:<elocation-id>1286069</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fphar.2024.1286069</pub-id>, PMID: <pub-id pub-id-type="pmid">38783950</pub-id></citation></ref>
<ref id="B10">
<label>10</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bhasin</surname> <given-names>M</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>SVM based method for predicting HLA-DRB1*0401 binding peptides in an antigen sequence</article-title>. <source>Bioinformatics</source>. (<year>2004</year>) <volume>20</volume>:<page-range>421&#x2013;3</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btg424</pub-id>, PMID: <pub-id pub-id-type="pmid">14960470</pub-id></citation></ref>
<ref id="B11">
<label>11</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Patiyal</surname> <given-names>S</given-names>
</name>
<name>
<surname>Dhall</surname> <given-names>A</given-names>
</name>
<name>
<surname>Kumar</surname> <given-names>N</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>HLA-DR4Pred2: An improved method for predicting HLA-DRB1*04:01 binders</article-title>. <source>Methods</source>. (<year>2024</year>) <volume>232</volume>:<fpage>18</fpage>&#x2013;<lpage>28</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ymeth.2024.10.007</pub-id>, PMID: <pub-id pub-id-type="pmid">39433152</pub-id></citation></ref>
<ref id="B12">
<label>12</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname> <given-names>C-Y</given-names>
</name>
<name>
<surname>Yang</surname> <given-names>H-Y</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>S-F</given-names>
</name>
<name>
<surname>Lai</surname> <given-names>J-H</given-names>
</name>
</person-group>. <article-title>From rheumatoid factor to anti-citrullinated protein antibodies and anti-carbamylated protein antibodies for diagnosis and prognosis prediction in patients with rheumatoid arthritis</article-title>. <source>Int J Mol Sci</source>. (<year>2021</year>) <volume>22</volume>:<elocation-id>686</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/ijms22020686</pub-id>, PMID: <pub-id pub-id-type="pmid">33445768</pub-id></citation></ref>
<ref id="B13">
<label>13</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>McInnes</surname> <given-names>IB</given-names>
</name>
<name>
<surname>Schett</surname> <given-names>G</given-names>
</name>
</person-group>. <article-title>Pathogenetic insights from the treatment of rheumatoid arthritis</article-title>. <source>Lancet (London, England)</source>. (<year>2017</year>) <volume>389</volume>:<page-range>2328&#x2013;37</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0140-6736(17)31472-1</pub-id>, PMID: <pub-id pub-id-type="pmid">28612747</pub-id></citation></ref>
<ref id="B14">
<label>14</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hammaker</surname> <given-names>D</given-names>
</name>
<name>
<surname>Nygaard</surname> <given-names>G</given-names>
</name>
<name>
<surname>Kuhs</surname> <given-names>A</given-names>
</name>
<name>
<surname>Ai</surname> <given-names>R</given-names>
</name>
<name>
<surname>Boyle</surname> <given-names>DL</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>W</given-names>
</name>
<etal/>
</person-group>. <article-title>Joint location-specific JAK-STAT signaling in rheumatoid arthritis fibroblast-like synoviocytes</article-title>. <source>ACR Open Rheumatol</source>. (<year>2019</year>) <volume>1</volume>:<page-range>640&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/acr2.11093</pub-id>, PMID: <pub-id pub-id-type="pmid">31872186</pub-id></citation></ref>
<ref id="B15">
<label>15</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Simon</surname> <given-names>LS</given-names>
</name>
<name>
<surname>Taylor</surname> <given-names>PC</given-names>
</name>
<name>
<surname>Choy</surname> <given-names>EH</given-names>
</name>
<name>
<surname>Sebba</surname> <given-names>A</given-names>
</name>
<name>
<surname>Quebe</surname> <given-names>A</given-names>
</name>
<name>
<surname>Knopp</surname> <given-names>KL</given-names>
</name>
<etal/>
</person-group>. <article-title>The Jak/STAT pathway: A focus on pain in rheumatoid arthritis</article-title>. <source>Semin Arthritis Rheum</source>. (<year>2021</year>) <volume>51</volume>:<page-range>278&#x2013;84</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.semarthrit.2020.10.008</pub-id>, PMID: <pub-id pub-id-type="pmid">33412435</pub-id></citation></ref>
<ref id="B16">
<label>16</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Radu</surname> <given-names>A-F</given-names>
</name>
<name>
<surname>Bungau</surname> <given-names>SG</given-names>
</name>
</person-group>. <article-title>Management of rheumatoid arthritis: an overview</article-title>. <source>Cells</source>. (<year>2021</year>) <volume>10</volume>:<elocation-id>2857</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/cells10112857</pub-id>, PMID: <pub-id pub-id-type="pmid">34831081</pub-id></citation></ref>
<ref id="B17">
<label>17</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Demoruelle</surname> <given-names>MK</given-names>
</name>
<name>
<surname>Deane</surname> <given-names>KD</given-names>
</name>
</person-group>. <article-title>Treatment strategies in early rheumatoid arthritis and prevention of rheumatoid arthritis</article-title>. <source>Curr Rheumatol Rep</source>. (<year>2012</year>) <volume>14</volume>:<page-range>472&#x2013;80</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11926-012-0275-1</pub-id>, PMID: <pub-id pub-id-type="pmid">22773387</pub-id></citation></ref>
<ref id="B18">
<label>18</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cronstein</surname> <given-names>BN</given-names>
</name>
</person-group>. <article-title>The mechanism of action of methotrexate</article-title>. <source>Rheum Dis Clin North Am</source>. (<year>1997</year>) <volume>23</volume>:<page-range>739&#x2013;55</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/s0889-857x(05)70358-6</pub-id>, PMID: <pub-id pub-id-type="pmid">9361153</pub-id></citation></ref>
<ref id="B19">
<label>19</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bedoui</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Guillot</surname> <given-names>X</given-names>
</name>
<name>
<surname>Selambarom</surname> <given-names>J</given-names>
</name>
<name>
<surname>Guiraud</surname> <given-names>P</given-names>
</name>
<name>
<surname>Giry</surname> <given-names>C</given-names>
</name>
<name>
<surname>Jaffar-Bandjee</surname> <given-names>MC</given-names>
</name>
<etal/>
</person-group>. <article-title>Methotrexate an old drug with new tricks</article-title>. <source>Int J Mol Sci</source>. (<year>2019</year>) <volume>20</volume>:<elocation-id>5023</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/ijms20205023</pub-id>, PMID: <pub-id pub-id-type="pmid">31658782</pub-id></citation></ref>
<ref id="B20">
<label>20</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Hua</surname> <given-names>Z</given-names>
</name>
<name>
<surname>Luo</surname> <given-names>X</given-names>
</name>
<name>
<surname>Li</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Yu</surname> <given-names>L</given-names>
</name>
<name>
<surname>Li</surname> <given-names>M</given-names>
</name>
<etal/>
</person-group>. <article-title>Application and pharmacological mechanism of methotrexate in rheumatoid arthritis</article-title>. <source>Biomed Pharmacother</source>. (<year>2022</year>) <volume>150</volume>:<elocation-id>113074</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.biopha.2022.113074</pub-id>, PMID: <pub-id pub-id-type="pmid">35658215</pub-id></citation></ref>
<ref id="B21">
<label>21</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Conley</surname> <given-names>B</given-names>
</name>
<name>
<surname>Bunzli</surname> <given-names>S</given-names>
</name>
<name>
<surname>Bullen</surname> <given-names>J</given-names>
</name>
<name>
<surname>O&#x2019;Brien</surname> <given-names>P</given-names>
</name>
<name>
<surname>Persaud</surname> <given-names>J</given-names>
</name>
<name>
<surname>Gunatillake</surname> <given-names>T</given-names>
</name>
<etal/>
</person-group>. <article-title>What are the core recommendations for rheumatoid arthritis care? Systematic review of clinical practice guidelines</article-title>. <source>Clin Rheumatol</source>. (<year>2023</year>) <volume>42</volume>:<page-range>2267&#x2013;78</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10067-023-06654-0</pub-id>, PMID: <pub-id pub-id-type="pmid">37291382</pub-id></citation></ref>
<ref id="B22">
<label>22</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Moreira</surname> <given-names>PM</given-names>
</name>
<name>
<surname>Correia</surname> <given-names>AM</given-names>
</name>
<name>
<surname>Cerqueira</surname> <given-names>M</given-names>
</name>
<name>
<surname>Gil</surname> <given-names>MF</given-names>
</name>
</person-group>. <article-title>Perioperative management of disease-modifying antirheumatic drugs and other immunomodulators</article-title>. <source>ARP Rheumatol</source>. (<year>2022</year>) <volume>1</volume>:<page-range>218&#x2013;24</page-range>. PMID: <pub-id pub-id-type="pmid">36057090</pub-id></citation></ref>
<ref id="B23">
<label>23</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Angelini</surname> <given-names>J</given-names>
</name>
<name>
<surname>Talotta</surname> <given-names>R</given-names>
</name>
<name>
<surname>Roncato</surname> <given-names>R</given-names>
</name>
<name>
<surname>Fornasier</surname> <given-names>G</given-names>
</name>
<name>
<surname>Barbiero</surname> <given-names>G</given-names>
</name>
<name>
<surname>Dal Cin</surname> <given-names>L</given-names>
</name>
<etal/>
</person-group>. <article-title>JAK-inhibitors for the treatment of rheumatoid arthritis: A focus on the present and an outlook on the future</article-title>. <source>Biomolecules</source>. (<year>2020</year>) <volume>10</volume>:<elocation-id>1002</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/biom10071002</pub-id>, PMID: <pub-id pub-id-type="pmid">32635659</pub-id></citation></ref>
<ref id="B24">
<label>24</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ansari</surname> <given-names>HR</given-names>
</name>
<name>
<surname>Flower</surname> <given-names>DR</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>AntigenDB: an immunoinformatics database of pathogen antigens</article-title>. <source>Nucleic Acids Res</source>. (<year>2010</year>) <volume>38</volume>:<page-range>D847&#x2013;53</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/nar/gkp830</pub-id>, PMID: <pub-id pub-id-type="pmid">19820110</pub-id></citation></ref>
<ref id="B25">
<label>25</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dhall</surname> <given-names>A</given-names>
</name>
<name>
<surname>Patiyal</surname> <given-names>S</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>A hybrid method for discovering interferon-gamma inducing peptides in human and mouse</article-title>. <source>Sci Rep</source>. (<year>2024</year>) <volume>14</volume>:<fpage>26859</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-024-77957-8</pub-id>, PMID: <pub-id pub-id-type="pmid">39501025</pub-id></citation></ref>
<ref id="B26">
<label>26</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dhall</surname> <given-names>A</given-names>
</name>
<name>
<surname>Patiyal</surname> <given-names>S</given-names>
</name>
<name>
<surname>Choudhury</surname> <given-names>S</given-names>
</name>
<name>
<surname>Jain</surname> <given-names>S</given-names>
</name>
<name>
<surname>Narang</surname> <given-names>K</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>TNFepitope: A webserver for the prediction of TNF-alpha inducing epitopes</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>160</volume>:<elocation-id>106929</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.106929</pub-id>, PMID: <pub-id pub-id-type="pmid">37126926</pub-id></citation></ref>
<ref id="B27">
<label>27</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dhanda</surname> <given-names>SK</given-names>
</name>
<name>
<surname>Gupta</surname> <given-names>S</given-names>
</name>
<name>
<surname>Vir</surname> <given-names>P</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>Prediction of IL4 inducing peptides</article-title>. <source>Clin Dev Immunol</source>. (<year>2013</year>) <volume>2013</volume>:<elocation-id>263952</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1155/2013/263952</pub-id>, PMID: <pub-id pub-id-type="pmid">24489573</pub-id></citation></ref>
<ref id="B28">
<label>28</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Naorem</surname> <given-names>LD</given-names>
</name>
<name>
<surname>Sharma</surname> <given-names>N</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>A web server for predicting and scanning of IL-5 inducing peptides using alignment-free and alignment-based method</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>158</volume>:<elocation-id>106864</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.106864</pub-id>, PMID: <pub-id pub-id-type="pmid">37058758</pub-id></citation></ref>
<ref id="B29">
<label>29</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vita</surname> <given-names>R</given-names>
</name>
<name>
<surname>Mahajan</surname> <given-names>S</given-names>
</name>
<name>
<surname>Overton</surname> <given-names>JA</given-names>
</name>
<name>
<surname>Dhanda</surname> <given-names>SK</given-names>
</name>
<name>
<surname>Martini</surname> <given-names>S</given-names>
</name>
<name>
<surname>Cantrell</surname> <given-names>JR</given-names>
</name>
<etal/>
</person-group>. <article-title>The immune epitope database (IEDB): 2018 update</article-title>. <source>Nucleic Acids Res</source>. (<year>2019</year>) <volume>47</volume>:<page-range>D339&#x2013;43</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/nar/gky1006</pub-id>, PMID: <pub-id pub-id-type="pmid">30357391</pub-id></citation></ref>
<ref id="B30">
<label>30</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pande</surname> <given-names>A</given-names>
</name>
<name>
<surname>Patiyal</surname> <given-names>S</given-names>
</name>
<name>
<surname>Lathwal</surname> <given-names>A</given-names>
</name>
<name>
<surname>Arora</surname> <given-names>C</given-names>
</name>
<name>
<surname>Kaur</surname> <given-names>D</given-names>
</name>
<name>
<surname>Dhall</surname> <given-names>A</given-names>
</name>
<etal/>
</person-group>. <article-title>Pfeature: A tool for computing wide range of protein features and building prediction models</article-title>. <source>J Comput Biol</source>. (<year>2023</year>) <volume>30</volume>:<page-range>204&#x2013;22</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1089/cmb.2022.0241</pub-id>, PMID: <pub-id pub-id-type="pmid">36251780</pub-id></citation></ref>
<ref id="B31">
<label>31</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Elnaggar</surname> <given-names>A</given-names>
</name>
<name>
<surname>Heinzinger</surname> <given-names>M</given-names>
</name>
<name>
<surname>Dallago</surname> <given-names>C</given-names>
</name>
<name>
<surname>Rihawi</surname> <given-names>G</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Jones</surname> <given-names>L</given-names>
</name>
<etal/>
</person-group>. <article-title>ProtTrans: towards cracking the language of life&#x2019;s code through self-supervised deep learning and high performance computing</article-title>. <source>arXiv</source>. (<year>2020</year>) <volume>v3</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2007.06225</pub-id>
</citation></ref>
<ref id="B32">
<label>32</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vacic</surname> <given-names>V</given-names>
</name>
<name>
<surname>Iakoucheva</surname> <given-names>LM</given-names>
</name>
<name>
<surname>Radivojac</surname> <given-names>P</given-names>
</name>
</person-group>. <article-title>Two Sample Logo: a graphical representation of the differences between two sets of sequence alignments</article-title>. <source>Bioinformatics</source>. (<year>2006</year>) <volume>22</volume>:<page-range>1536&#x2013;7</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btl151</pub-id>, PMID: <pub-id pub-id-type="pmid">16632492</pub-id></citation></ref>
<ref id="B33">
<label>33</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Altschul</surname> <given-names>SF</given-names>
</name>
<name>
<surname>Gish</surname> <given-names>W</given-names>
</name>
<name>
<surname>Miller</surname> <given-names>W</given-names>
</name>
<name>
<surname>Myers</surname> <given-names>EW</given-names>
</name>
<name>
<surname>Lipman</surname> <given-names>DJ</given-names>
</name>
</person-group>. <article-title>Basic local alignment search tool</article-title>. <source>J Mol Biol</source>. (<year>1990</year>) <volume>215</volume>:<page-range>403&#x2013;10</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/S0022-2836(05)80360-2</pub-id>, PMID: <pub-id pub-id-type="pmid">2231712</pub-id></citation></ref>
<ref id="B34">
<label>34</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Vens</surname> <given-names>C</given-names>
</name>
<name>
<surname>Rosso</surname> <given-names>M-N</given-names>
</name>
<name>
<surname>Danchin</surname> <given-names>EGJ</given-names>
</name>
</person-group>. <article-title>Identifying discriminative classification-based motifs in biological sequences</article-title>. <source>Bioinformatics</source>. (<year>2011</year>) <volume>27</volume>:<page-range>1231&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bioinformatics/btr110</pub-id>, PMID: <pub-id pub-id-type="pmid">21372086</pub-id></citation></ref>
<ref id="B35">
<label>35</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tomer</surname> <given-names>R</given-names>
</name>
<name>
<surname>Patiyal</surname> <given-names>S</given-names>
</name>
<name>
<surname>Dhall</surname> <given-names>A</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>Prediction of celiac disease associated epitopes and motifs in a protein</article-title>. <source>Front Immunol</source>. (<year>2023</year>) <volume>14</volume>:<elocation-id>1056101</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fimmu.2023.1056101</pub-id>, PMID: <pub-id pub-id-type="pmid">36742312</pub-id></citation></ref>
<ref id="B36">
<label>36</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Kumar</surname> <given-names>N</given-names>
</name>
<name>
<surname>Patiyal</surname> <given-names>S</given-names>
</name>
<name>
<surname>Choudhury</surname> <given-names>S</given-names>
</name>
<name>
<surname>Tomer</surname> <given-names>R</given-names>
</name>
<name>
<surname>Dhall</surname> <given-names>A</given-names>
</name>
<name>
<surname>Raghava</surname> <given-names>GPS</given-names>
</name>
</person-group>. <article-title>DMPPred: a tool for identification of antigenic regions responsible for inducing type 1 diabetes mellitus</article-title>. <source>Brief Bioinform</source>. (<year>2023</year>) <volume>24</volume>:<fpage>bbac525</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/bib/bbac525</pub-id>, PMID: <pub-id pub-id-type="pmid">36524996</pub-id></citation></ref>
<ref id="B37">
<label>37</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hill</surname> <given-names>JA</given-names>
</name>
<name>
<surname>Wang</surname> <given-names>D</given-names>
</name>
<name>
<surname>Jevnikar</surname> <given-names>AM</given-names>
</name>
<name>
<surname>Cairns</surname> <given-names>E</given-names>
</name>
<name>
<surname>Bell</surname> <given-names>DA</given-names>
</name>
</person-group>. <article-title>The relationship between predicted peptide-MHC class II affinity and T-cell activation in a HLA-DRbeta1*0401 transgenic mouse model</article-title>. <source>Arthritis Res Ther</source>. (<year>2003</year>) <volume>5</volume>:<page-range>R40&#x2013;8</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/ar605</pub-id>, PMID: <pub-id pub-id-type="pmid">12716452</pub-id></citation></ref>
<ref id="B38">
<label>38</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lee</surname> <given-names>S</given-names>
</name>
<name>
<surname>Kang</surname> <given-names>S</given-names>
</name>
<name>
<surname>Eun</surname> <given-names>Y</given-names>
</name>
<name>
<surname>Won</surname> <given-names>H-H</given-names>
</name>
<name>
<surname>Kim</surname> <given-names>H</given-names>
</name>
<name>
<surname>Lee</surname> <given-names>J</given-names>
</name>
<etal/>
</person-group>. <article-title>Machine learning-based prediction model for responses of bDMARDs in patients with rheumatoid arthritis and ankylosing spondylitis</article-title>. <source>Arthritis Res Ther</source>. (<year>2021</year>) <volume>23</volume>:<fpage>254</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13075-021-02635-3</pub-id>, PMID: <pub-id pub-id-type="pmid">34627335</pub-id></citation></ref>
<ref id="B39">
<label>39</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Matsuo</surname> <given-names>H</given-names>
</name>
<name>
<surname>Kamada</surname> <given-names>M</given-names>
</name>
<name>
<surname>Imamura</surname> <given-names>A</given-names>
</name>
<name>
<surname>Shimizu</surname> <given-names>M</given-names>
</name>
<name>
<surname>Inagaki</surname> <given-names>M</given-names>
</name>
<name>
<surname>Tsuji</surname> <given-names>Y</given-names>
</name>
<etal/>
</person-group>. <article-title>Machine learning-based prediction of relapse in rheumatoid arthritis patients using data on ultrasound examination and blood test</article-title>. <source>Sci Rep</source>. (<year>2022</year>) <volume>12</volume>:<fpage>7224</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/s41598-022-11361-y</pub-id>, PMID: <pub-id pub-id-type="pmid">35508670</pub-id></citation></ref>
<ref id="B40">
<label>40</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Prasad</surname> <given-names>B</given-names>
</name>
<name>
<surname>McGeough</surname> <given-names>C</given-names>
</name>
<name>
<surname>Eakin</surname> <given-names>A</given-names>
</name>
<name>
<surname>Ahmed</surname> <given-names>T</given-names>
</name>
<name>
<surname>Small</surname> <given-names>D</given-names>
</name>
<name>
<surname>Gardiner</surname> <given-names>P</given-names>
</name>
<etal/>
</person-group>. <article-title>ATRPred: A machine learning based tool for clinical decision making of anti-TNF treatment in rheumatoid arthritis patients</article-title>. <source>PLoS Comput Biol</source>. (<year>2022</year>) <volume>18</volume>:<fpage>e1010204</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pcbi.1010204</pub-id>, PMID: <pub-id pub-id-type="pmid">35788746</pub-id></citation></ref>
<ref id="B41">
<label>41</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dudek</surname> <given-names>G</given-names>
</name>
<name>
<surname>Sakowski</surname> <given-names>S</given-names>
</name>
<name>
<surname>Brzezinska</surname> <given-names>O</given-names>
</name>
<name>
<surname>Sarnik</surname> <given-names>J</given-names>
</name>
<name>
<surname>Budlewski</surname> <given-names>T</given-names>
</name>
<name>
<surname>Dragan</surname> <given-names>G</given-names>
</name>
<etal/>
</person-group>. <article-title>Machine learning-based prediction of rheumatoid arthritis with development of ACPA autoantibodies in the presence of non-HLA genes polymorphisms</article-title>. <source>PLoS One</source>. (<year>2024</year>) <volume>19</volume>:<fpage>e0300717</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1371/journal.pone.0300717</pub-id>, PMID: <pub-id pub-id-type="pmid">38517871</pub-id></citation></ref>
<ref id="B42">
<label>42</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rauscher</surname> <given-names>S</given-names>
</name>
<name>
<surname>Baud</surname> <given-names>S</given-names>
</name>
<name>
<surname>Miao</surname> <given-names>M</given-names>
</name>
<name>
<surname>Keeley</surname> <given-names>FW</given-names>
</name>
<name>
<surname>Pom&#xe8;s</surname> <given-names>R</given-names>
</name>
</person-group>. <article-title>Proline and glycine control protein self-organization into elastomeric or amyloid fibrils</article-title>. <source>Structure</source>. (<year>2006</year>) <volume>14</volume>:<page-range>1667&#x2013;76</page-range>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.str.2006.09.008</pub-id>, PMID: <pub-id pub-id-type="pmid">17098192</pub-id></citation></ref>
</ref-list>
</back>
</article>