<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Microbiol.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Microbiology</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Microbiol.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-302X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmicb.2026.1744805</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Chlamy_ChloroPred: a deep learning-based, highly accurate binary classifier for chloroplast protein prediction in the model microalga, <italic>Chlamydomonas reinhardtii</italic>, with potential cross-proteome versatility</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes">
<name>
<surname>Choi</surname>
<given-names>Hong Il</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3276106"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name>
<surname>Lee</surname>
<given-names>Sung Ho</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<xref ref-type="author-notes" rid="fn0001"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lee</surname>
<given-names>Il Hyung</given-names>
</name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3365447"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lee</surname>
<given-names>Yong Jae</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1419462"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yun</surname>
<given-names>Jin-Ho</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1640688"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Choi</surname>
<given-names>Dong-Yun</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Cho</surname>
<given-names>Dae-Hyun</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/301706"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Shin</surname>
<given-names>Bum-Soo</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3350614"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Chun</surname>
<given-names>Junyoung</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3362763"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lee</surname>
<given-names>Dong Won</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Kim</surname>
<given-names>Hee-Sik</given-names>
</name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/635131"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x0026; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Cell Factory Research Center, Korea Research Institute of Bioscience and Biotechnology (KRIBB)</institution>, <city>Daejeon</city>, <country country="kr">Republic of Korea</country></aff>
<aff id="aff2"><label>2</label><institution>Department of Bioresource and Environmental Engineering, University of Science and Technology (UST)</institution>, <city>Daejeon</city>, <country country="kr">Republic of Korea</country></aff>
<aff id="aff3"><label>3</label><institution>Daemyung Vision Co., Ltd.</institution>, <city>Yongin-si</city>, <country country="kr">Republic of Korea</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Integrative Biotechnology, Sungkyunkwan University</institution>, <city>Suwon-si</city>, <country country="kr">Republic of Korea</country></aff>
<author-notes>
<corresp id="c001"><label>&#x002A;</label>Correspondence: Hong Il Choi, <email xlink:href="mailto:hichoi@kribb.re.kr">hichoi@kribb.re.kr</email>; Hee-Sik Kim, <email xlink:href="mailto:hkim@kribb.re.kr">hkim@kribb.re.kr</email></corresp>
<fn fn-type="equal" id="fn0001"><label>&#x2020;</label><p>These authors have contributed equally to this work and share first authorship</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-23">
<day>23</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1744805</elocation-id>
<history>
<date date-type="received">
<day>12</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>29</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>04</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2026 Choi, Lee, Lee, Lee, Yun, Choi, Cho, Shin, Chun, Lee and Kim.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Choi, Lee, Lee, Lee, Yun, Choi, Cho, Shin, Chun, Lee and Kim</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-23">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The chloroplast, a living relic of an ancient endosymbiotic interaction between a microalga and a microbe and the principal subcellular organelle responsible for biological CO<sub>2</sub> assimilation, is emerging as a key target for research to enhance photosynthetic efficiency beyond its current limitations. Given that accurate protein localization is a prerequisite for the in-depth scientific investigation and practical application of the membrane-compartmentalized photosynthetic organelle, numerous computational prediction tools have been proposed, yet their accuracy remains unsatisfactory.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address this limitation, we herein present Chlamy_ChloroPred, a newly developed deep learning-based framework composed of multi-layered artificial neural networks, carefully designed to perform binary classification of chloroplast proteins in the model photosynthetic microorganism, <italic>Chlamydomonas reinhardtii</italic>. The model captures locality-aware features of determinant amino acid residues in the chloroplast transit peptide (cTP), generally located within the ~50-amino-acid N-terminal region of precursor chloroplast proteins, through the integration of ProtBERT-BFD embeddings, stacked bidirectional long short-term memory (BiLSTM) networks, and an attentive pooling layer.</p>
</sec>
<sec>
<title>Results and discussion</title>
<p>Our model achieved an accuracy of 0.8462 for the <italic>C. reinhardtii</italic> proteome, outperforming widely used localization predictors, including TargetP 1.1 (0.4970), TargetP 2.0 (0.7396), and PredAlgo (0.7738) under a binary classification scheme. Comparative analyses further demonstrated that Chlamy_ChloroPred exhibits competitive performance relative to the current state-of-the-art model, PB-Chlamy (0.8521), under identical evaluation conditions. Notably, despite being trained solely on the algal proteome, Chlamy_ChloroPred showed substantial cross-species versatility when applied to the proteome of the terrestrial plant, <italic>Arabidopsis thaliana</italic>, achieving an accuracy of 0.7316 &#x2013; representing a 12.6% improvement over TargetP 2.0, a predictor with previously demonstrated cross-proteome versatility. This likely stems from the model&#x2019;s robust ability to capture conserved features of chloroplast proteins across proteomes from diverse photosynthetic lineages.</p>
</sec>
<sec>
<title>Conclusion</title>
<p>We developed a deep learning&#x2013;based framework, Chlamy_ChloroPred, that integrates carefully designed neural layers with low computational complexity, achieving high predictive accuracy and interpretability. We believe that Chlamy_ChloroPred represents a compelling alternative to existing predictors, especially when accurate inference of chloroplast proteins is required.</p>
</sec>
</abstract>
<kwd-group>
<kwd><italic>Chlamydomonas reinhardtii</italic></kwd>
<kwd>chloroplast protein</kwd>
<kwd>deep learning</kwd>
<kwd>microalgae</kwd>
<kwd>neural network</kwd>
<kwd>protein localization prediction</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work was supported by the STEAM project (RS-2024-00459155), the 2025 Global TOP Strategic Research Group Support Project [the National Research Council of Science &#x0026; Technology (NST) granted by the Korea government (Ministry of Science and ICT)] (GTL25021-300), and the Basic Science Research Programs (the National Research Foundation of Korea) (RS-2024-00353284, RS-2024-00351663, and 2021R1C1C1003425). This study was also supported by the KRIBB Research Initiative Program (KGM5252423).</funding-statement>
</funding-group>
<counts>
<fig-count count="4"/>
<table-count count="1"/>
<equation-count count="5"/>
<ref-count count="45"/>
<page-count count="14"/>
<word-count count="9795"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Aquatic Microbiology</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>There is a nagging concern about the climate crisis caused by exploding anthropogenic CO<sub>2</sub> emissions. Microalgae, photosynthetic microeukaryotes, are emerging as a promising biological platform to combat the global threat due to their ability to devour CO<sub>2</sub> while converting it into valuable biocompounds (<xref ref-type="bibr" rid="ref10">Choi et al., 2021</xref>). In these photosynthetic cells, as in terrestrial plants, biological CO<sub>2</sub> fixation occurs in the chloroplast, a subcellular compartment that contains various photosynthesis-related proteinaceous machineries. In addition, the organelle plays a pivotal role in starch, amino acid, and lipid metabolism, all of which are essential for the survival of the photosynthetic organisms (<xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>). Since the complex, cascading biochemical reactions occur through the combinatorial functions of groups of enzymes co-localized within the chloroplast, accurately identifying the localization of proteins is one of the most important prerequisites for a thorough understanding of life processes and their applications from scientific and engineering perspectives.</p>
<p>Although the chloroplast contains its own bacteria-like circular genome and transcription/translation apparatus&#x2014;evidence of evolutionary endosymbiotic interaction between a microalga and a microbe that has become another intriguing characteristic of the chloroplast, attracting considerable scholarly interest (<xref ref-type="bibr" rid="ref11">Dyo and Purton, 2018</xref>; <xref ref-type="bibr" rid="ref21">Kang et al., 2021</xref>)&#x2014;most of its proteins are encoded by nuclear DNA. This spatial discordance necessitates sophisticated delivery mechanisms to properly localize and ensure the functionality of the nuclear-encoded proteins in the chloroplast (<xref ref-type="bibr" rid="ref28">Leister, 2003</xref>; <xref ref-type="bibr" rid="ref41">Teufel et al., 2022</xref>; <xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>). Translocation of a chloroplast protein (CP) into the chloroplast is initiated by the recognition of the chloroplast transit peptide (cTP), which is an extension sequence that is covalently fused to the N-terminal region of the premature CP. After the CP is imported <italic>via</italic> the TOC-TIC (translocon at the outer chloroplast membrane&#x2014;translocon at the inner chloroplast membrane) supercomplex, the cTP is ultimately cleaved by a stromal processing peptidase upon its successful arrival inside the target organelle, leaving behind a folded mature protein (<xref ref-type="bibr" rid="ref26">Lee and Hwang, 2018</xref>). Given the import process, the cTP acts as a crucial biological label that directs the cargo (i.e., CP) to the chloroplast <italic>in vivo</italic>, suggesting that its presence may be a reliable indicator of a protein&#x2019;s final subcellular destination (<xref ref-type="bibr" rid="ref9">Caspari, 2022</xref>).</p>
<p>Fueled by the remarkable progress in artificial intelligence (AI), a number of amino acid sequence-based protein localization prediction tools have been suggested (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>; <xref ref-type="bibr" rid="ref2">Almagro Armenteros et al., 2019</xref>; <xref ref-type="bibr" rid="ref41">Teufel et al., 2022</xref>; <xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>). Such <italic>in silico</italic> prediction programs provide good alternatives to reliable-yet-arduous conventional localization methods, such as fluorescence tagging and immunohistochemistry (<xref ref-type="bibr" rid="ref4">Bernstein et al., 1994</xref>; <xref ref-type="bibr" rid="ref24">Kong et al., 2015</xref>; <xref ref-type="bibr" rid="ref30">Mackinder et al., 2017</xref>; <xref ref-type="bibr" rid="ref9">Caspari, 2022</xref>; <xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>), due to their immediacy and throughput. As interest in the photosynthetic organelle has increased, several predictors have also been proposed that are specifically designed to classify chloroplast proteins, which focus on the <italic>N</italic>-terminally located potential cTP region (<xref ref-type="bibr" rid="ref37">Schein, 2001</xref>; <xref ref-type="bibr" rid="ref14">Emanuelsson et al., 2008</xref>). 
However, they suffer from low predictive effectiveness, which can be attributed to several systematic reasons, including: (i) cTPs share loosely conserved properties in terms of amino acid length and composition&#x2014;despite the identification of a handful of canonical consensus motifs (e.g., V/I-X-A/C), which precludes readily deciphering the common denominators of CPs, ultimately impeding the construction of a precise forecaster (<xref ref-type="bibr" rid="ref8">Bruce, 2001</xref>; <xref ref-type="bibr" rid="ref16">Franz&#x00E9;n et al., 2001</xref>; <xref ref-type="bibr" rid="ref18">Gavel and von Heijne, 2001</xref>; <xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>); (ii) the model architectures employed in previous programs may retain insufficient capability to figure out the enigmatic feature of cTPs; and (iii) the limited quantity and suboptimal quality of the input data used during training process might also adversely affect the model&#x2019;s performance.</p>
<p>In this study, we demonstrate a novel binary classifier for chloroplast proteins, which is specially established for the green microalga, <italic>Chlamydomonas reinhardtii</italic>&#x2014;an ideal model photosynthetic organism for biotechnological studies and cell factory development that awaits a tailored CP predictor to expedite the chloroplast research workflow (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>; <xref ref-type="bibr" rid="ref11">Dyo and Purton, 2018</xref>; <xref ref-type="bibr" rid="ref38">Surridge, 2022</xref>; <xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>; <xref ref-type="bibr" rid="ref12">Einhaus et al., 2024</xref>). To address the problematic issues raised in previous cases, we constructed and trained a multi-layered deep learning framework, named Chlamy_ChloroPred, which consists of a sequential connection of the following: a feature extractor based on ProtBERT-BFD that is a Transformer-based protein language model pre-trained based on 2.5 billion protein sequences (<xref ref-type="bibr" rid="ref13">Elnaggar et al., 2022</xref>; <xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>); a triply stacked RNN-BiLSTM (recurrent neural network-bidirectional long short-term memory) layer; a 4-head multi-head attention module; and an attentive pooling network. These components were brought together to unravel the barely explicit properties of cTPs. Furthermore, the training used hundreds of up-to-date, experimentally proven input data collated from various <italic>C. reinhardtii</italic> proteomics literature sources, considering the significance of data quality and quantity in creating an accurate predictive model. 
As a result, we successfully developed a binary classifier for CP with an accuracy of 0.8462 for the test dataset, representing a 14.41% improvement over the widely accepted benchmark program, TargetP 2.0 (accuracy of 0.7396) (<xref ref-type="bibr" rid="ref2">Almagro Armenteros et al., 2019</xref>), when evaluated for the same dataset. Comparative analyses further demonstrated that Chlamy_ChloroPred outperforms broader predictors, such as TargetP 1.1&#x2014;the predecessor of TargetP 2.0 (accuracy of 0.4970)&#x2014;and PredAlgo (accuracy of 0.7738) (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>), and exhibits competitive performance relative to the current state-of-the-art (SOTA) model, PB-Chlamy (accuracy of 0.8521) (<xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>), under a binary classification setting. Although our program was trained solely on <italic>C. reinhardtii</italic> proteomics, it also achieved higher accuracy (0.7316) than the cross-proteome benchmark, TargetP 2.0 (0.6500), when tested on CP data from another photosynthetic model, <italic>Arabidopsis thaliana</italic>. On balance, we believe that the Chlamy_ChloroPred program, with its high predictive performance and versatility, will serve as a breakthrough tool in future chloroplast and photosynthesis studies.</p>
</sec>
<sec sec-type="materials|methods" id="sec2">
<label>2</label>
<title>Materials and methods</title>
<sec id="sec3">
<label>2.1</label>
<title>Input data acquisition</title>
<p>Since we aimed at developing a CP classifier tailored for <italic>C. reinhardtii</italic>, we collected 1,262 protein sequences, of which 841 were revealed to be localized in the chloroplast (i.e., CP) and 421 were revealed to be localized in organelles other than the chloroplast (i.e., Non-CP) (<xref ref-type="fig" rid="fig1">Figure 1A</xref>; <xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S1</xref>). To furnish the developing program with data whose localizations are empirically corroborated, we manually mined gene identifications from 6 literature sources, including: 5 reports that experimentally investigated the proteomics of the model green microalga using fluorescence-tagged proteins (<xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>) and mass spectrometry identifying semi-tryptic peptides (<xref ref-type="bibr" rid="ref40">Terashima et al., 2010</xref>; <xref ref-type="bibr" rid="ref5">Bienvenut et al., 2011</xref>; <xref ref-type="bibr" rid="ref44">Zhan et al., 2018</xref>; <xref ref-type="bibr" rid="ref33">Ramundo et al., 2020</xref>); and one reference that thoroughly scrutinized the subcellular destinations of proteins reported in various publications using its own stringent criteria (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>). Protein sequences corresponding to the gene identifications were then scraped from the <italic>Chlamydomonas reinhardtii</italic> v5.6 database on the Phytozome (<xref ref-type="bibr" rid="ref31">Merchant et al., 2007</xref>). Only primary transcripts were used for building this dataset. During the compilation of amino acid sequences from the references, redundant sequences (i.e., the identical sequence found in multiple sources) were treated as follows: (i) Sequences designated with the same localization were merged (i.e., one was selected); and (ii) those with controversial localizations were removed to avoid any reasonable doubt. 
After assembly, 420 CPs were excluded from the overall dataset to balance the data distribution. This resulted in the &#x201C;balanced labeled dataset&#x201D; consisting of an equal number of CPs and Non-CPs (421 each; <xref ref-type="fig" rid="fig1">Figure 1A</xref>), given that imbalanced input data may induce systematic bias in the model (<xref ref-type="bibr" rid="ref3">Belhaouari et al., 2024</xref>). Subsequently, we split the resulting dataset into &#x201C;train (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S2</xref>),&#x201D; &#x201C;validation (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S3</xref>),&#x201D; and &#x201C;test (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S4</xref>)&#x201D; datasets with a ratio of 3:1:1, while maintaining the label distribution of each subset (<xref ref-type="fig" rid="fig1">Figure 1B</xref>).</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Composition and distribution of the data used to develop Chlamy_ChloroPred. <bold>(A)</bold> Sequences from 1,262 proteins were initially collected, but 420 CP-labeled proteins were excluded to balance the labeling of the data. Through sequential data splitting, train, validation, and test datasets were prepared at a ratio of 3:1:1. The indicated percentages correspond to the proportion of a given dataset compared to the previous one immediately before the split. CP stands for chloroplast protein. <bold>(B)</bold> Histograms demonstrating the length distribution of protein sequences from the balanced labeled dataset, train, validation, and test datasets. Several statistics are shown for each dataset, including the mean, maximum, and minimum length of the protein sequences included. The proportions of proteins with lengths greater than 1,000 amino acids (&#x003E;1,000 AAs) are also presented. Max and Min stand for maximum and minimum, respectively.</p>
</caption>
<graphic xlink:href="fmicb-17-1744805-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Figure with two panels showing dataset construction and protein length distributions. Panel A outlines data filtering and splitting from 1,262 proteins to a balanced labeled dataset of 842, split into train, validation, and test sets for CP and non-CP proteins. Panel B presents a histogram of protein length counts for the balanced labeled dataset and inset histograms displaying length distributions for train, validation, and test splits, including statistics for mean, maximum, minimum lengths, and proportion above one thousand amino acids.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec4">
<label>2.2</label>
<title>Neural network model</title>
<p>We designed a deep learning-based classifier framework that integrates a stacked recurrent neural network and attention-based mechanisms to forecast CPs from ProtBERT-BFD embeddings (<xref ref-type="fig" rid="fig2">Figure 2</xref>). Prior to the model organization, the datasets were labeled in a binary fashion as 0 and 1 for Non-CPs and CPs, respectively, to proceed with supervised learning. After labeling, all protein sequences subjected to the train, validation, and test processes were fixed at a maximum length of 1,000 amino acids (AAs) from the N-terminus, with longer sequences truncated and shorter ones padded. The standard length of the N-terminal sequences likely containing cTPs (i.e., 1,000 AAs) was determined by considering that cTPs are typically no longer than this limitation (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>) and that &#x003E;95% of sequences in the overall dataset fall within 1,000-AA coverage (<xref ref-type="fig" rid="fig1">Figure 1B</xref>), making the model framework lightweight while covering the entire length of most sequences.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Schematic diagram describing the architecture and binary prediction process of Chlamy_ChloroPred. Raw amino acid sequences, which are composed of single-letter amino acid codes, were processed for length normalization to 1,000 AAs and tokenized using the CLS (classification token), PAD (padding token), and SEP (separator token). A large protein language model, ProtBERT-BFD, was employed to encode the input protein sequences with pre-learned biochemical and structural features in a form suitable for the subsequent prediction layer. The resulting embeddings were input into an iterative prediction network consisting mainly of a stacked RNN-BiLSTM (recurrent neural network-bidirectional long short-term memory) and an attentive pooling layer. The training resulted in the establishment of the binary chloroplast protein classifier, Chlamy_ChloroPred, which was then used to estimate performance, predict protein localization, and investigate versatility. Further DSs refer to additional datasets (DSs) that could be applied to the model (e.g., the <italic>A. thaliana</italic> proteome) for broader applications, other than the train, validation, and test datasets.</p>
</caption>
<graphic xlink:href="fmicb-17-1744805-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Workflow diagram illustrating a deep learning pipeline for chloroplast protein classification, beginning with raw amino acid sequences, tokenization, and embedding using a pre-trained ProtBERT-BFD model, followed by feature extraction, data splitting, and multi-layered BiLSTM neural network training for binary classification with attention mechanisms.</alt-text>
</graphic>
</fig>
<p>As an initial step in constructing the framework, a pre-trained ProtBERT-BFD model was used to obtain embeddings (i.e., 1,024-dimensional vectors) from datasets, enriched with biochemical and structural context pre-learned from large protein corpora. We used the token-level embeddings derived from the ProtBERT-BFD-based feature extractor as inputs to the downstream layers. A positional encoding layer was applied to the input to preserve positional information lost during embedding. Using a layer normalization, the position-encoded sequences were then normalized. Subsequently, the normalized outputs were processed by a stack of triple bidirectional LSTM (BiLSTM) layers, which retain hidden dimensions of 128, 64, and 32&#x202F;units, respectively, in order to capture contextual dependencies along the protein sequence in both the forward and backward directions. Next, we added a multi-head self-attention mechanism with 4 heads and 64 key dimensions to the framework to improve the representation with long-range inter-residue context-aware dependencies. The resulting output was incorporated back into the BiLSTM through a residual connection, followed by layer normalization and a Transformer feed-forward subnetwork. This attention block enables the model to dynamically weight sequence positions and understand the relationships (i.e., global dependencies) among all tokens in parallel. Following the contextualization step, we applied another sublayer of the attention mechanism, an attentive pooling layer, which calculates a weighted average of sequence features. This effectively prioritizes biologically informative sequence regions (e.g., cTP) and simultaneously attenuates less relevant residues for classification. 
Finally, the pooled representation was regularized using dropout with a probability of 0.45 and then passed through a fully connected dense layer of 256&#x202F;units with rectified linear unit (ReLU) activation and L2 kernel regularization (<italic>&#x03BB;</italic>&#x202F;=&#x202F;1&#x202F;&#x00D7;&#x202F;10<sup>&#x2212;4</sup>), followed by the final output layer consisting of a single unit with a sigmoid activation function that outputs the probability of the sequence being localized to the chloroplast. For binary classification, a probability of &#x2265;0.5 was used as the threshold for determining localization.</p>
</sec>
<sec id="sec5">
<label>2.3</label>
<title>Program execution</title>
<p>We used the Google Colaboratory (Colab) environment (Python 3, T4 GPU, and high-RAM mode) for running the deep learning-based program. The model was trained using the Adam optimizer with a learning rate of 1&#x202F;&#x00D7;&#x202F;10<sup>&#x2212;4</sup>, minimizing the binary cross-entropy loss with prediction accuracy as the evaluation metric. The early stopping callback was used to prevent the neural network from overtraining by monitoring the validation loss with a patience of 3. We saved the best version of the trained model with the best validation accuracy for further evaluation using the model checkpoint callback.</p>
</sec>
<sec id="sec6">
<label>2.4</label>
<title>Performance measures of the models</title>
<p>When evaluating the performance of models developed in this study based on the input data described in Section 2.1, the performance metrics including the accuracy, precision, recall&#x2014;also known as sensitivity or true positive rate (TPR)&#x2014;and F1 score were calculated and compared as follows (<xref ref-type="bibr" rid="ref43">Yue et al., 2022</xref>):</p><disp-formula id="E1">
<mml:math id="M1">
<mml:mtext>Accuracy</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mi>TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>TN</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>FN</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>FP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>TN</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(1)</label>
</disp-formula><disp-formula id="E2">
<mml:math id="M2">
<mml:mtext>Precision</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>TP</mml:mi>
<mml:mrow>
<mml:mi>TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>FP</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(2)</label>
</disp-formula><disp-formula id="E3">
<mml:math id="M3">
<mml:mtext>Recall</mml:mtext>
<mml:mspace width="0.25em"/>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext>sensitivity or</mml:mtext>
<mml:mspace width="0.25em"/>
<mml:mi>TPR</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>TP</mml:mi>
<mml:mrow>
<mml:mi>TP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>FN</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(3)</label>
</disp-formula><disp-formula id="E4">
<mml:math id="M4">
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mn>1</mml:mn>
<mml:mspace width="0.25em"/>
<mml:mtext>score</mml:mtext>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#x00D7;</mml:mo>
<mml:mtext>Precision</mml:mtext>
<mml:mo>&#x00D7;</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
<mml:mrow>
<mml:mtext>Precision</mml:mtext>
<mml:mo>+</mml:mo>
<mml:mtext>Recall</mml:mtext>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(4)</label>
</disp-formula>
<p>where TP is true positive (counted if an instance of CP is correctly predicted as CP), TN is true negative (counted if an instance of Non-CP is correctly predicted as Non-CP), FP is false positive (counted if an instance of Non-CP is wrongly predicted as CP), and FN is false negative (counted if an instance of CP is wrongly predicted as Non-CP). Accuracy measures the percentage of correct predictions across all samples, while precision calculates the proportion of positive classifications that were actually correct and improves as the number of false positives decreases. Recall is defined as the proportion of actual positives that were correctly classified as such and improves when the number of false negatives decreases. F1 score is the harmonic mean of precision and recall, which balances the importance of both, indicating the reliability of a model.</p>
<p>Meanwhile, to evaluate the model&#x2019;s performance (i.e., effectiveness of the binary classification model), we plotted the ROC (receiver operating characteristic) curve and estimated the AUC (area under the ROC curve). The ROC curve visually represents the performance of a model that is graphed by calculating the TPR (<xref ref-type="disp-formula" rid="E3">Equation 3</xref>) against the false positive rate (FPR) at various threshold values. The FPR, which measures how often a model incorrectly predicts negative samples as positive, is calculated as follows (<xref ref-type="bibr" rid="ref1">Adabor et al., 2025</xref>):</p><disp-formula id="E5">
<mml:math id="M5">
<mml:mtext>False positive rate</mml:mtext>
<mml:mspace width="0.25em"/>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>FPR</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:mi>FP</mml:mi>
<mml:mrow>
<mml:mi>FP</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>TN</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(5)</label>
</disp-formula>
<p>The AUC is one of the most useful single indicators for comparing the overall performance of models, particularly when the dataset is balanced. An AUC score of 1 means the model achieves perfect classification.</p>
<p>As one of the performance measurements of the developed model, the entire <italic>C. reinhardtii</italic> proteome (i.e., 19,526 proteins provided in <italic>Chlamydomonas reinhardtii</italic> genome version 5.6 in the Phytozome database) (<xref ref-type="bibr" rid="ref31">Merchant et al., 2007</xref>) was also prepared and tested as described above.</p>
</sec>
<sec id="sec7">
<label>2.5</label>
<title>Comparison with benchmark programs</title>
<p>The performance of the proposed model was compared with that of the TargetP 2.0 software (<xref ref-type="bibr" rid="ref2">Almagro Armenteros et al., 2019</xref>), which was selected as the primary benchmark in this study due to its ease of access, multi-species and cross-proteome versatility, and established authority in the field of protein localization prediction. When running the software, two parameters were selected: &#x201C;Plant&#x201D; for the organism group and &#x201C;Long output&#x201D; for the output format. Input dataset for the benchmark was identically pre-conditioned as described in Sections 2.1 and 2.2. For the TargetP 2.0 results, a protein was designated as a chloroplast protein when the &#x201C;Chloroplast transfer peptide&#x201D; score was the highest among other scores.</p>
<p>To more comprehensively position Chlamy_ChloroPred among currently available models, we examined the predicted localization of proteins in the test dataset (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S4</xref>) using broader predictors, including TargetP 1.1 (the previous version of TargetP 2.0) (<xref ref-type="bibr" rid="ref2">Almagro Armenteros et al., 2019</xref>), PredAlgo (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>) (based on the prediction results publicly archived in the Phytozome <italic>Chlamydomonas reinhardtii</italic> v5.6 database, as real-time, up-to-date services for both tools are not currently available, as confirmed by server status checks and personal communication), and PB-Chlamy (<xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>)&#x2014;a SOTA model capable of predicting multiple subcellular localizations, such as mitochondrial, secretory, and other cellular compartments (integratively classified as &#x201C;other&#x201D;), in addition to chloroplast localization, as well as assigning potential mitochondrial/chloroplast and secretory/chloroplast dual-targeted protein candidates. Given its ability to distinguish dual-targeted proteins, PB-Chlamy was further used to extract putative dual-targeted protein candidates (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S5</xref>).</p>
</sec>
<sec id="sec8">
<label>2.6</label>
<title>Versatility evaluation of the Chlamy_ChloroPred</title>
<p>The versatility of the Chlamy_ChloroPred program was evaluated using a chloroplast proteome dataset from the model terrestrial plant, <italic>A. thaliana</italic>. The chloroplast protein dataset was basically obtained from the report by <xref ref-type="bibr" rid="ref15">Ferro et al. (2010)</xref>. Of the 1,323 protein identifications (IDs) listed in the study, the amino acid sequences of 1,263 protein IDs (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S6</xref>), which are now available on the TAIR (The Arabidopsis Information Resource) portal were tested (<xref ref-type="bibr" rid="ref34">Reiser et al., 2017</xref>). The identical dataset was also applied to the primary benchmark in this study, TargetP 2.0, and the resulting predictions were compared. For the predicted results from TargetP 2.0, proteins that were predicted to be localized in other organelles (e.g., the mitochondria or the extracellular region) were treated as non-CPs. Those that were predicted to be localized in the chloroplast or the thylakoid lumen&#x2014;as a component of the chloroplast&#x2014;were treated as CPs.</p>
</sec>
</sec>
<sec sec-type="results" id="sec9">
<label>3</label>
<title>Results</title>
<sec id="sec10">
<label>3.1</label>
<title>Building input dataset</title>
<p>Given the heavy dependence of model performance on the quality of the input data, the preparation of refined input data is imperative in implementing a high-performance predictive program. Considering this, we first balanced the initially collected dataset consisting of 1,262 proteins that include 841 CP-labeled and 421 non-CP-labeled proteins by excluding 420 CP-labeled proteins, leading to a balanced labeled dataset with 421 CP-labeled and 421 non-CP-labeled protein sequences (842 proteins in total). Subsequently, approximately 80% (673 proteins) of the balanced labeled dataset was grouped into the model-building dataset while <italic>ca</italic>. 20% (169 proteins) of the dataset was grouped into the test dataset. The model-building dataset was further split into train and validation datasets, which contain about 75% (504 proteins) and 25% (169 proteins) of the model-building dataset, respectively. All splitting and grouping processes were conducted while maintaining balance in label distributions (i.e., CP <italic>vs</italic>. non-CP) because it reduces bias toward the majority class, ensures fair training, and improves the performance and interpretability of a binary classification model (<xref ref-type="bibr" rid="ref17">Friedberg et al., 2013</xref>). As a result, we successfully secured the total dataset, consisting of train, validation, and test subsets in a 3:1:1 ratio (<xref ref-type="fig" rid="fig1">Figure 1A</xref>).</p>
<p>Next, we examined the length distribution of the datasets to confirm that they all have a similar dispersion of amino acid lengths. Similar length distributions were observed in the histograms of each split dataset, and it was found that the subsets demonstrate analogous summary statistic values (e.g., mean and minimum lengths) (<xref ref-type="fig" rid="fig1">Figure 1B</xref>). On the contrary, the maximum lengths of each subset are disparate because the total dataset (i.e., the balanced labeled dataset) contains a sparse number of proteins over 1,000 AAs (2.49%)&#x2014;only 21 out of 842 proteins. The maximum length of sequence data in a dataset is of interest and importance since it would establish a standard for length normalization, which is used to make input dimensions uniform. Given that using a long input length (e.g., 4,661 AAs in this study) can lead to a high computational burden and rather adversely affect the model&#x2019;s performance, the input length was standardized to 1,000 AAs, which has a high chance of containing the localization determinant sequence, cTP, without sacrificing the sequence information of more than 95% of protein sequences in any dataset that are shorter than this coverage length (<xref ref-type="fig" rid="fig1">Figure 1B</xref>).</p>
</sec>
<sec id="sec11">
<label>3.2</label>
<title>Prediction model construction</title>
<p>As shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>, a predictive model, named Chlamy_ChloroPred, was constructed in this study that is primarily based on a triple stack of RNN-BiLSTM and an attentive pooling layer. Prior to data input, the length-normalized input sequences were tokenized using special tokens, such as the classification token (CLS), the padding token (PAD), and the separation token (SEP), and then embedded by the ProtBERT-BFD protein language model (<xref ref-type="bibr" rid="ref13">Elnaggar et al., 2022</xref>), which provides enriched biochemical and structural features pre-trained on a large number of protein sequences. As a result, this model outputs the probability of a protein sequence being localized to the chloroplast with a threshold of 0.5, which implements binary classification (<xref ref-type="bibr" rid="ref27">Lee et al., 2022</xref>). The detailed parameters are presented in the Materials and Methods section.</p>
</sec>
<sec id="sec12">
<label>3.3</label>
<title>Performance evaluation</title>
<p>Under the Google Colab environment, the constructed code was executed. Running was terminated at epoch 11 to prevent overfitting, triggered by the early stopping callback. Since the loss value measures how much the model&#x2019;s predictions deviate from the true labels, epoch 11 seemed adequate as the stopping point. This is evident from the result that the stopping point occurred before the train and validation losses largely diverged, while the train and validation accuracies increased steadily (<xref ref-type="fig" rid="fig3">Figures 3A</xref>,<xref ref-type="fig" rid="fig3">B</xref>). Based on the trained program, we graphed the ROC curve and calculated the AUC (<xref ref-type="fig" rid="fig3">Figure 3C</xref>) to examine the overall performance of the model. Our classifier showed an AUC of 0.9090, surpassing the 0.5 AUC of a random guess model. This demonstrates that our model is much better than a dummy classifier at distinguishing between positive (i.e., CP) and negative (i.e., non-CP) classes.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Performance evaluation of Chlamy_ChloroPred and biological interpretation of results from the attentive pooling layer. <bold>(A)</bold> Accuracy and <bold>(B)</bold> loss values plotted against iteration (i.e., epoch) for the train and validation datasets. <bold>(C)</bold> The ROC (receiver operating characteristic) curve for Chlamy_ChloroPred is shown. The AUC (area under the curve) is presented and juxtaposed with that of a dummy classifier. Confusion matrices of the <bold>(D)</bold> train, <bold>(E)</bold> validation, and <bold>(F)</bold> test datasets (DSs). For each dataset, several summary statistic values (i.e., accuracy, precision, recall, and F1 score) are presented. TP, FP, FN, and TN stand for true positive, false positive, false negative, and true negative, respectively. <bold>(G)</bold> Heatmaps showing residue importance during the decision-making process of Chlamy_ChloroPred for the test data. The attention density plot was first illustrated for the overall test data (left panel), then split and separately displayed according to the labels (right panels). Plots of CP and non-CP-labeled proteins are shown in the upper right and lower right panels, respectively. The arrow marks the region to which Chlamy_ChloroPred paid close attention when performing binary classification of CP-labeled proteins.</p>
</caption>
<graphic xlink:href="fmicb-17-1744805-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Figure with multiple panels displaying machine learning model performance and protein data: Panel A shows a line graph of accuracy versus epoch for train and validation datasets, both improving then plateauing around epoch eight; Panel B plots loss versus epoch, both datasets showing decreasing loss; Panel C is a receiver operating characteristic (ROC) curve indicating classifier performance (AUC 0.909) versus random guessing (AUC 0.5); Panels D, E, and F are confusion matrices for train, validation, and test datasets, each matrix includes accuracy, precision, recall, and F1 scores; Panel G displays heatmaps of normalized attention scores by amino acid position for overall, chloroplast, and non-chloroplast protein groups, highlighting differences in model focus across protein types.</alt-text>
</graphic>
</fig>
<p>More specifically, the model&#x2019;s inference performance was evaluated with regard to accuracy, precision, recall, and F1 score using the train, validation, and test datasets. The results were presented in the form of confusion matrices (<xref ref-type="fig" rid="fig3">Figures 3D</xref>&#x2013;<xref ref-type="fig" rid="fig3">F</xref>). The accuracy (<xref ref-type="disp-formula" rid="E1">Equation 1</xref>), precision (<xref ref-type="disp-formula" rid="E2">Equation 2</xref>), recall (<xref ref-type="disp-formula" rid="E3">Equation 3</xref>), and F1 score (<xref ref-type="disp-formula" rid="E4">Equation 4</xref>) were not lower than 0.8462 (with a loss of 0.4052), 0.8718, 0.8000, and 0.8395, respectively, most of which were derived from the test dataset. This reveals the model&#x2019;s balanced and reliable performance in predicting whether given proteins in <italic>C. reinhardtii</italic> are localized to the chloroplast or not.</p>
<p>Subsequently, we illustrated heatmaps of residue importance derived from the attentive pooling layer, which was employed to improve the model&#x2019;s predictive accuracy by focusing attention on relevant sequence features and to offer interpretability through attention weight visualization. As shown in <xref ref-type="fig" rid="fig3">Figure 3G</xref>, our model focused selectively on the N-terminal regions of most CP-labeled proteins in the test dataset (upper right panel of <xref ref-type="fig" rid="fig3">Figure 3G</xref>), while paying relatively higher attention to the C-terminal regions&#x2014;comparatively less related to chloroplast localization (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>)&#x2014;of most non-CP-labeled proteins in the same dataset (lower right panel of <xref ref-type="fig" rid="fig3">Figure 3G</xref>). Concentrated attention at the <italic>N</italic>-terminal region of non-CPs likely reflects the presence of alternative targeting signals or the absence of canonical chloroplast transit peptide features, thereby providing strong negative evidence for chloroplast localization in a binary classification setting. Meanwhile, the minor attention peaks outside the N-terminal region of CPs are likely attributable to the internal sequence features correlated with CPs (<xref ref-type="bibr" rid="ref9">Caspari, 2022</xref>) and background noise effects, rather than <italic>bona fide</italic> targeting signals. It is also noteworthy that the approximate length of the sequences focused on in the CP-labeled proteins corresponds to the previously known average length of cTPs in <italic>C. 
reinhardtii</italic> (~50 AAs; arrowed in <xref ref-type="fig" rid="fig3">Figure 3G</xref>) (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>), strongly implying that, as intended, our model pays close attention to the <italic>N</italic>-terminal extensions (i.e., cTP regions) rather than treating every amino acid region equally when predicting the localization of putative chloroplast proteins.</p>
</sec>
<sec id="sec13">
<label>3.4</label>
<title>Performance comparison with benchmark programs</title>
<p>To objectively validate the merit of Chlamy_ChloroPred, we evaluated the summary statistic values (e.g., accuracy, precision, recall, and F1 score) of the primary benchmark protein localization predictor, TargetP 2.0, using the train (<italic>n</italic>&#x202F;=&#x202F;504; <xref ref-type="fig" rid="fig4">Figure 4A</xref>), validation (<italic>n</italic>&#x202F;=&#x202F;169; <xref ref-type="fig" rid="fig4">Figure 4B</xref>), test (<italic>n</italic>&#x202F;=&#x202F;169; <xref ref-type="fig" rid="fig4">Figure 4C</xref>), and total (<italic>n</italic>&#x202F;=&#x202F;842; <xref ref-type="fig" rid="fig4">Figure 4D</xref>)&#x2014;the aggregate total of the split datasets, which is the same as the balanced labeled dataset in <xref ref-type="fig" rid="fig1">Figure 1A</xref>&#x2014;datasets in this study and then collated the performance indices of our model (<xref ref-type="fig" rid="fig3">Figures 3D</xref>&#x2013;<xref ref-type="fig" rid="fig3">F</xref>) with those of TargetP 2.0. By and large, TargetP 2.0 exhibited higher precision values, but lower accuracy and recall values, across the datasets. Despite the higher precision values, the much lower recall values contributed to the inferior F1 scores of TargetP 2.0. The precision-recall trade-offs observed from TargetP 2.0&#x2014;i.e., prioritizing precision over recall&#x2014;imply that the benchmark was likely programmed with strict, conservative classification criteria for chloroplast proteins. Given the increased accuracy (from 0.7396 to 0.8462, a 14.41% improvement) and F1 score (from 0.6563 to 0.8395, a 27.91% improvement) observed on the test dataset (<xref ref-type="fig" rid="fig3">Figures 3F</xref>, <xref ref-type="fig" rid="fig4">4C</xref>), it is evident that Chlamy_ChloroPred outperforms the existing benchmark. 
To more comprehensively establish the position of our developed model among broader predictors, we further examined the predicted localizations of proteins in the test dataset (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S4</xref>) using TargetP 1.1 (<xref ref-type="fig" rid="fig4">Figure 4E</xref>), PredAlgo (<xref ref-type="fig" rid="fig4">Figure 4F</xref>), and PB-Chlamy (<xref ref-type="fig" rid="fig4">Figure 4G</xref>). As a result, Chlamy_ChloroPred demonstrated superior classification performance compared with the legacy predictors, including TargetP 1.1 and PredAlgo, in terms of accuracy (70.26% and 9.36% improvements, respectively), precision (81.63% and 17.07% improvements, respectively), and F1 score (281.24% and 6.74% improvements, respectively). When compared with the current SOTA model, PB-Chlamy, Chlamy_ChloroPred exhibited comparable and competitive binary classification performance, showing a modest improvement in recall (3.03%), accompanied by marginal reductions in accuracy (0.69%), precision (3.57%), and F1 score (0.15%). Collectively, these results suggest that Chlamy_ChloroPred represents a viable alternative to existing models, particularly in cases where reliable binary chloroplast protein classification in <italic>C. reinhardtii</italic> is required.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Prediction performance of the primary benchmark in this study, TargetP 2.0, was evaluated in terms of accuracy, precision, recall, and F1 score individually for the <bold>(A)</bold> train, <bold>(B)</bold> validation, and <bold>(C)</bold> test datasets (DSs), as well as for the <bold>(D)</bold> total dataset (i.e., the balanced labeled dataset in <xref ref-type="fig" rid="fig1">Figure 1A</xref>), enabling direct comparison with the performance of our developed program, Chlamy_ChloroPred. For broader and more comprehensive comparison, predictions using <bold>(E)</bold> TargetP 1.1, <bold>(F)</bold> PredAlgo, and <bold>(G)</bold> PB-Chlamy were also performed on the identical test DS. It should be noted that, because predicted localization information from the PredAlgo predictor for one protein (Cre14.g634279) is missing from the Phytozome database (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S4</xref>), the associated statistical analyses were conducted using a total of 168 proteins (<italic>n</italic> =&#x202F;168). TP, FP, FN, and TN stand for true positive, false positive, false negative, and true negative, respectively.</p>
</caption>
<graphic xlink:href="fmicb-17-1744805-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Grid of seven labeled confusion matrix heatmaps (A-G), each showing counts of true positives, false positives, false negatives, and true negatives for CP and Non-CP classifications. Each panel includes dataset size, accuracy, precision, recall, and F1 score, with color keys representing count magnitudes.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec14">
<label>3.5</label>
<title>Versatility of Chlamy_ChloroPred</title>
<p>Although our program was constructed as a binary chloroplast protein classifier solely based on the <italic>C. reinhardtii</italic> proteome, we proceeded to use Chlamy_ChloroPred to predict protein localization for the chloroplast proteome of the model terrestrial plant, <italic>A. thaliana</italic>, in order to examine its potential compatibility with other photosynthetic organisms, which could lead to broader applications. In this versatility test, the TargetP 2.0 program was again employed as the comparative benchmark, given its validated multi-species and cross-proteome versatility. For the publicly available <italic>A. thaliana</italic> chloroplast proteome collected from the previous report and the TAIR database (<xref ref-type="bibr" rid="ref15">Ferro et al., 2010</xref>; <xref ref-type="bibr" rid="ref34">Reiser et al., 2017</xref>), Chlamy_ChloroPred classified 924 proteins as CPs out of 1,263 CP-labeled proteins in <italic>A. thaliana</italic>, whereas TargetP 2.0 predicted only 821 as CPs (<xref ref-type="table" rid="tab1">Table 1</xref>). As a result, the accuracy and F1 score were 0.7316 and 0.8450, respectively, for our model and 0.6500 and 0.7879, respectively, for the benchmark. Thus, improvements of 12.6% and 7.25% were observed in the accuracy and F1 score of Chlamy_ChloroPred, respectively, compared to TargetP 2.0&#x2014;although the precision and recall values are less meaningful in this evaluation due to the systematic absence of TN and FP cases, since the input dataset is composed entirely of CP-labeled data, which is fully biased. On balance, we confirmed that this model has the potential to be versatile for various photosynthetic organisms with the core photosynthetic organelle, the chloroplast, as evidenced by its superior prediction performance for the <italic>A. 
thaliana</italic> chloroplast proteome compared to the commonly referenced protein localization predictor for plants, TargetP 2.0 (<xref ref-type="bibr" rid="ref6">Bj&#x00F6;rnsdotter et al., 2021</xref>; <xref ref-type="bibr" rid="ref35">Sanaboyana and Elcock, 2024</xref>).</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Versatility of Chlamy_ChloroPred for the <italic>A. thaliana</italic> CP database.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" rowspan="2">For <italic>A. thaliana</italic> CP database (<italic>n</italic> =&#x202F;1,263)</th>
<th align="center" valign="top" colspan="2">Predicted result<sup>&#x002A;</sup></th>
<th align="center" valign="top" colspan="4">Performance metrics (summary statistics)</th>
</tr>
<tr>
<th align="center" valign="top">CP</th>
<th align="center" valign="top">Non-CP</th>
<th align="center" valign="top">Accuracy</th>
<th align="center" valign="top">Precision<sup>&#x2020;</sup></th>
<th align="center" valign="top">Recall<sup>&#x2021;</sup></th>
<th align="center" valign="top">F1 score</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle" style="background-color:#f2f2f2">Chlamy_ChloroPred</td>
<td align="center" valign="middle">924</td>
<td align="center" valign="middle">339</td>
<td align="center" valign="middle">0.7316</td>
<td align="center" valign="middle">N/A</td>
<td align="center" valign="middle">0.7316</td>
<td align="center" valign="middle">0.8450</td>
</tr>
<tr>
<td align="left" valign="middle" style="background-color:#f2f2f2">TargetP 2.0</td>
<td align="center" valign="middle">821</td>
<td align="center" valign="middle">442</td>
<td align="center" valign="middle">0.6500</td>
<td align="center" valign="middle">N/A</td>
<td align="center" valign="middle">0.6500</td>
<td align="center" valign="middle">0.7879</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The summary statistic performance metrics were compared with those from the TargetP 2.0 benchmark for the same input dataset. CP stands for chloroplast protein.</p>
<p><sup>&#x002A;</sup>For the predicted results from TargetP 2.0, proteins predicted to be localized in other organelles (e.g., the mitochondria or the extracellular region) were treated as non-CPs, while those predicted to be localized in the chloroplast or the thylakoid lumen were treated as CPs.</p>
<p><sup>&#x2020;</sup>Precision is not an applicable metric for this specific dataset (i.e., the <italic>A. thaliana</italic> chloroplast proteome), which is entirely biased toward CPs. N/A denotes not applicable.</p>
<p><sup>&#x2021;</sup>According to Eqs. <xref ref-type="disp-formula" rid="E1">1</xref> and <xref ref-type="disp-formula" rid="E3">3</xref>, since TN and FP are zero, the recall is necessarily the same as the accuracy.</p>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec sec-type="discussion" id="sec15">
<label>4</label>
<title>Discussion</title>
<p>As interest in biological CO<sub>2</sub> conversion and chloroplasts, core intracellular organelles involved in photosynthesis, has increased, several predictors have been developed that can infer the presence of chloroplast proteins in a given proteomic database, but their performance and reliability remain unsatisfactory. To address this issue, we developed a new predictor, named Chlamy_ChloroPred, constructed based on a refined proteome from <italic>C. reinhardtii</italic>&#x2014;a photosynthetic microorganism that serves as a model chassis for various molecular biology and genetic engineering studies of the chloroplast. Aided by embeddings enriched with diverse informative features derived from the large protein language model, ProtBERT-BFD, our model&#x2014;featuring a complex architecture composed of a triply stacked BiLSTM network coupled with an attentive pooling layer&#x2014;achieved accurate binary classification of chloroplast proteins in the <italic>C. reinhardtii</italic> proteome, attaining a notable test accuracy of 0.8462. This performance is consistent with previous findings showing that the integration of large language models (LLMs) with attention mechanisms represents a promising direction for future model development (<xref ref-type="bibr" rid="ref22">Kong et al., 2024a</xref>; <xref ref-type="bibr" rid="ref23">Kong et al., 2024b</xref>). The prediction performance surpasses that of widely used predictors, including TargetP 1.1, TargetP 2.0, and PredAlgo, which achieved accuracies of 0.4970, 0.7396, and 0.7738, respectively, on the same dataset. Furthermore, our results demonstrate that the proposed model serves as a reliable alternative to the current SOTA predictor, PB-Chlamy, exhibiting competitive performance across multiple evaluation metrics.</p>
<p>This outperformance is likely due to our model&#x2019;s ability to capture a key region (i.e., amino acid residues) that mediates the destination of a chloroplast-localized protein, as evidenced by the following two findings. Firstly, our model demonstrates a high prediction accuracy with a balanced F1 score despite taking only the foremost 1,000 amino acid regions of proteins as inputs, while sacrificing the ensuing residues of proteins longer than 1,000-AA, instead of using the entire sequences&#x2014;which is devised to keep the program lightweight. Intriguingly, when we tried to execute the program preliminarily with a longer input length normalized to 4,661 AAs, considering the longest sequence in the dataset (<xref ref-type="fig" rid="fig1">Figure 1B</xref>), we found that the trial rather reduced the model&#x2019;s predictive performance and increased the program&#x2019;s computational demand (data not shown). The observation that truncating the C-terminal regions did not impair prediction performance suggests that the model effectively identifies and exploits biologically informative residues within the <italic>N</italic>-terminal regions, on which the inference of chloroplast localization is based. Secondly, and more specifically, Chlamy_ChloroPred focused intensively on the foremost ~50-AA sequences of chloroplast proteins (<xref ref-type="bibr" rid="ref39">Tardif et al., 2012</xref>), which are potential locations for cTP, using its locality-awareness conferred by the attentive pooling layer (<xref ref-type="fig" rid="fig3">Figure 3G</xref>) when making the final classification determination.</p>
<p>Another advantage of our model is its versatility, which may allow for wider compatibility with other photosynthetic organisms, as exemplified by the investigation using the <italic>Arabidopsis</italic> chloroplast proteome. Achieving an accuracy of 0.7316, Chlamy_ChloroPred demonstrates a 12.6% improvement over the widely used benchmark program TargetP 2.0, which has validated multi-species and cross-proteome versatility, despite being originally developed for the precise binary classification of potential chloroplast proteins in <italic>C. reinhardtii</italic> using only the <italic>Chlamydomonas</italic> proteome database. This may also be attributed to our model&#x2019;s capability to apprehend the conserved characteristics of CPs across organisms&#x2014;although these common features have not yet been explicitly revealed (<xref ref-type="bibr" rid="ref8">Bruce, 2001</xref>)&#x2014;which is empowered by the well-designed architecture employing the large-scale, pre-learned protein features obtained from ProtBERT-BFD.</p>
<p>Meanwhile, compared with other predictive models, our model exhibits a tendency toward permissive classification, in turn leading to a relatively high number of false positives. For example, when we applied Chlamy_ChloroPred to predict protein localizations for the whole <italic>C. reinhardtii</italic> proteome, it identified 5,335 putative chloroplast proteins out of 19,526 total proteins, corresponding to 27.32% of the total proteome. The predicted proportion is considerably higher than that reported in a previous study (<italic>ca</italic>. 13%) (<xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>), suggesting that our model adopts relatively liberal and permissive criteria for inferring protein localization. This is consistent with the higher FPR of 0.1176 observed in our model (<xref ref-type="fig" rid="fig3">Figure 3F</xref>) compared to the benchmark of 0.0235 (<xref ref-type="fig" rid="fig4">Figure 4C</xref>) for the test dataset in this study, as calculated using <xref ref-type="disp-formula" rid="E5">Equation 5</xref>, implying that Chlamy_ChloroPred can occasionally produce false positive predictions, even considering the conservative classification propensity of TargetP 2.0 (see Section 3.4).</p>
<p>In addition to the use of a strictly balanced dataset&#x2014;which does not reflect real-world proteome distributions, where non-chloroplast proteins predominate, and may contribute to an increased false positive rate (<xref ref-type="bibr" rid="ref32">Pawlicki et al., 2020</xref>)&#x2014;several factors may also influence the elevated false positive rate of this model, such as ambiguity between mitochondrial targeting peptides (mTPs) and cTPs, which serve as key determinants of the final protein destinations (<xref ref-type="bibr" rid="ref25">Kunze and Berger, 2015</xref>). To investigate the impact of this potential factor on our model in greater depth, we evaluated its ability to discriminate between chloroplast and mitochondrial proteins by systematically examining and comparing its predictions with those of other predictors used in this study for genes included in the test dataset (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S4</xref>). We focused on examining specific false positive cases&#x2014;namely, instances in which mitochondrial proteins were incorrectly predicted as chloroplast proteins. We initiated this analysis by retrieving mitochondrial proteins with previously experimentally validated localizations (<xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>) from the protein set under investigation. This led us to compile a set of 12 mitochondrial proteins (Cre09.g393506, Cre10.g449100, Cre07.g349350, Cre13.g563150, Cre02.g088000, Cre03.g157700, Cre01.g054500, Cre10.g420700, Cre09.g393210, Cre12.g496750, Cre10.g428300, and Cre10.g440400), including three dual-targeted proteins (Cre01.g054500, Cre10.g440400, and Cre12.g496750). Among these, the former two are mitochondrial/chloroplast dual-targeted proteins, whereas the latter is a mitochondrial/cytoplasmic dual-targeted protein. To facilitate an intuitive comparison of predictor-dependent false positive frequencies, we excluded the dual-targeted proteins from the analysis. 
Under this condition, the resulting false positive frequencies were 0.22 (2/9), 0.33 (3/9), 0.11 (1/9), 0.00 (0/9), and 0.11 (1/9), for Chlamy_ChloroPred, PredAlgo, TargetP 1.1, TargetP 2.0, and PB-Chlamy, respectively (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S4</xref>). This finding suggests that Chlamy_ChloroPred exhibits a measurable capability to discriminate chloroplast proteins from potentially confounding mitochondrial proteins. However, the sharpness of this discrimination remains limited, reflecting a slightly more permissive behavior relative to predictors with lower false positive rates (FPRs), such as TargetP 2.0 (FPR&#x202F;=&#x202F;0.0235) (<xref ref-type="fig" rid="fig4">Figure 4C</xref>) and PB-Chlamy (FPR&#x202F;=&#x202F;0.0824) (<xref ref-type="fig" rid="fig4">Figure 4G</xref>), which may be associated with the elevated FPR observed for our model.</p>
<p>Withal, such permissiveness may be advantageous for capturing chloroplast protein candidates among dual-targeted proteins, as some predictors frequently miss these candidates due to ambiguity in their N-terminal targeting signals (<xref ref-type="bibr" rid="ref19">Gould et al., 2024</xref>). This notion is supported by our localization analysis based on 48 putative dual-targeted proteins predicted by PB-Chlamy (<xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S5</xref>), which was conducted in consideration of both the scarcity of experimentally validated dual-targeting proteins and the high reported accuracy of PB-Chlamy, together with its capability to explicitly annotate dual-targeted proteins. As a result, among the 48 putative dual-targeted candidates, PredAlgo, TargetP 1.1, TargetP 2.0, and Chlamy_ChloroPred predicted 39 (81.25%), 22 (45.83%), 18 (37.50%), and 38 (79.17%) proteins as chloroplast proteins, respectively. In parallel, PredAlgo classified 6 proteins as mitochondrial, whereas TargetP 1.1 and TargetP 2.0 assigned 25 and 14 proteins, respectively, to mitochondrial localization. Meanwhile, Chlamy_ChloroPred predicted the remaining 10 proteins as non-chloroplast proteins. These results provide a clue that our developed model, Chlamy_ChloroPred, captures, to a degree, chloroplast localization relatively effectively as one of the possible destinations of putative chloroplast-related dual-targeted proteins. At the same time, these findings stress the need for continued model development to enable more complete resolution of dual-localization-related ambiguities, thereby allowing more comprehensive capture of chloroplast proteins within dual-targeted pools.</p>
<p>The current restriction of our model to binary classification of proteins as either CPs or non-CPs represents an additional opportunity for future improvement. In the interim, combinatorial use with other specialized predictors targeting specific subcellular regions, such as the nucleus, mitochondria, and extracellular regions (<xref ref-type="bibr" rid="ref7">Brameier et al., 2007</xref>; <xref ref-type="bibr" rid="ref45">Zhao et al., 2019</xref>; <xref ref-type="bibr" rid="ref36">Savojardo et al., 2020</xref>), could exert a synergistic effect in resolving the localization of proteins whose subcellular destinations remain controversial or experimentally undefined. Moreover, while Chlamy_ChloroPred appears capable of correctly assigning certain subchloroplast-localized proteins to the chloroplast, as illustrated by its classification of a thylakoid lumen protein (Cre06.g256250; <xref ref-type="supplementary-material" rid="SM1">Supplementary Dataset S4</xref>) as a chloroplast protein (<xref ref-type="bibr" rid="ref20">He et al., 2023</xref>), its current capability is limited to collapsing distinct subchloroplast localizations into a single chloroplast category, thereby hindering fine-grained localization resolution. Consequently, achieving higher specificity in subchloroplast localization will require extension of the current prediction framework. One key direction is to expand Chlamy_ChloroPred to enable discrimination among subchloroplast compartments, including the envelope, stroma, thylakoid membrane, and thylakoid lumen. Provided that sufficient high-quality, experimentally validated training data with precisely annotated subchloroplast localization become available, such an upgraded model could function as a standalone tool to facilitate in-depth chloroplast research across diverse organisms. 
Furthermore, although only a few archives that facilitate convenient comparison between predicted and experimentally validated localization data&#x2014;primarily based on fluorescence-tagged datasets&#x2014;are currently available (<xref ref-type="bibr" rid="ref30">Mackinder et al., 2017</xref>; <xref ref-type="bibr" rid="ref29">Li et al., 2019</xref>; <xref ref-type="bibr" rid="ref42">Wang et al., 2023</xref>), many comparative analyses still rely heavily on manual retrieval from dispersed references. In this regard, integrating an automated framework into our model to systematically link prediction outputs with a broader range of experimental evidence, including immunohistochemistry data (<xref ref-type="bibr" rid="ref4">Bernstein et al., 1994</xref>), would substantially enhance the user-friendliness and strengthen prediction reliability.</p>
</sec>
<sec sec-type="conclusions" id="sec16">
<label>5</label>
<title>Conclusion</title>
<p>We successfully developed a deep learning-based framework, Chlamy_ChloroPred, which integrates meticulously designed neural layers while maintaining low computational demand, achieving high predictive accuracy and interpretability. While exhibiting modest permissiveness in classification, Chlamy_ChloroPred outperforms widely used protein localization predictors, including the TargetP series (versions 1.1 and 2.0) and PredAlgo, in chloroplast protein classification for the proteome of the model microalga, <italic>C. reinhardtii</italic>, and demonstrates competitive performance relative to the current SOTA model, PB-Chlamy, under the same binary classification setting. Furthermore, our model exhibits potential versatility across broader photosynthetic lineages, as exemplified by its applicability to the proteome of the representative terrestrial plant, <italic>A. thaliana</italic>. Given that the chloroplast is a nearly ubiquitous component among primary producers (<xref ref-type="bibr" rid="ref38">Surridge, 2022</xref>) and plays a crucial role in photosynthesis&#x2014;the principal biological CO<sub>2</sub> fixation mechanism&#x2014;Chlamy_ChloroPred would serve as a powerful computational proxy for cumbersome localization experiments, thereby expediting both fundamental chloroplast research and applied engineering efforts aimed at enhancing the CO<sub>2</sub> assimilation capacity and crop yield of phototrophic organisms.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec17">
<title>Data availability statement</title>
<p>The datasets presented in this study can be found in online repositories. The names of the repository/repositories and accession number(s) can be found in the article/<xref ref-type="supplementary-material" rid="SM1">Supplementary material</xref>. All scripts and trained models used in this study are publicly available at a GitHub repository (<ext-link xlink:href="https://github.com/superstitione/Chlamy_ChloroPred" ext-link-type="uri">https://github.com/superstitione/Chlamy_ChloroPred</ext-link>).</p>
</sec>
<sec sec-type="author-contributions" id="sec18">
<title>Author contributions</title>
<p>HC: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. SL: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. IL: Conceptualization, Investigation, Software, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. YL: Data curation, Funding acquisition, Investigation, Methodology, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. J-HY: Formal analysis, Investigation, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. D-YC: Formal analysis, Investigation, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. D-HC: Formal analysis, Investigation, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. B-SS: Formal analysis, Investigation, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. JC: Formal analysis, Investigation, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. DL: Formal analysis, Investigation, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. H-SK: Conceptualization, Data curation, Formal analysis, Funding acquisition, Investigation, Methodology, Project administration, Resources, Software, Supervision, Validation, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<ack>
<title>Acknowledgments</title>
<p>The authors would like to express their sincere thanks to the developers of the PredAlgo algorithm, namely Olivier Vallon (Centre National de la Recherche Scientifique, France) and Laurent Cournac (Institut de Recherche pour le D&#x00E9;veloppement, France), for their kind discussion regarding the benchmark algorithm used in this study.</p>
</ack>
<sec sec-type="COI-statement" id="sec19">
<title>Conflict of interest</title>
<p>SL and IL were employed by Daemyung Vision Co., Ltd.</p>
<p>The remaining author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec20">
<title>Generative AI statement</title>
<p>The author(s) declared that Generative AI was used in the creation of this manuscript. Generative AI was used to check potential syntax errors in the program code.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec21">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec22">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/fmicb.2026.1744805/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/fmicb.2026.1744805/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Table_1.XLSX" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>SUPPLEMENTARY DATASET S1</label>
<caption>
<p>Initially collected dataset (<italic>n</italic>=1262).</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Table_2.XLSX" id="SM2" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>SUPPLEMENTARY DATASET S2</label>
<caption>
<p>Train dataset (<italic>n</italic>=504).</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Table_3.XLSX" id="SM3" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>SUPPLEMENTARY DATASET S3</label>
<caption>
<p>Validation dataset (<italic>n</italic>=169).</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Table_4.XLSX" id="SM4" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>SUPPLEMENTARY DATASET S4</label>
<caption>
<p>Test dataset (<italic>n</italic>=169).</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Table_5.XLSX" id="SM5" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>SUPPLEMENTARY DATASET S5</label>
<caption>
<p>Putative dual-targeted proteins (<italic>n</italic>=48).</p>
</caption>
</supplementary-material>
<supplementary-material xlink:href="Table_6.XLSX" id="SM6" mimetype="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" xmlns:xlink="http://www.w3.org/1999/xlink">
<label>SUPPLEMENTARY DATASET S6</label>
<caption>
<p><italic>Arabidopsis thaliana</italic> chloroplast proteins (<italic>n</italic>=1263).</p>
</caption>
</supplementary-material>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Adabor</surname><given-names>E. S.</given-names></name> <name><surname>Adu</surname><given-names>P.</given-names></name> <name><surname>Asamoah</surname><given-names>D. A.</given-names></name></person-group> (<year>2025</year>). <article-title>Bamclassifier: a machine learning method for assessing iron deficiency</article-title>. <source>Sci. Rep.</source> <volume>15</volume>:<fpage>32264</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-025-92892-y</pub-id></mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Almagro Armenteros</surname><given-names>J. J.</given-names></name> <name><surname>Salvatore</surname><given-names>M.</given-names></name> <name><surname>Emanuelsson</surname><given-names>O.</given-names></name> <name><surname>Winther</surname><given-names>O.</given-names></name> <name><surname>Von Heijne</surname><given-names>G.</given-names></name> <name><surname>Elofsson</surname><given-names>A.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Detecting sequence signals in targeting peptides using deep learning</article-title>. <source>Life Sci. Alliance</source> <volume>2</volume>:<fpage>e201900429</fpage>. doi: <pub-id pub-id-type="doi">10.26508/lsa.201900429</pub-id>, <pub-id pub-id-type="pmid">31570514</pub-id></mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Belhaouari</surname><given-names>S. B.</given-names></name> <name><surname>Islam</surname><given-names>A.</given-names></name> <name><surname>Kassoul</surname><given-names>K.</given-names></name> <name><surname>Al-Fuqaha</surname><given-names>A.</given-names></name> <name><surname>Bouzerdoum</surname><given-names>A.</given-names></name></person-group> (<year>2024</year>). <article-title>Oversampling techniques for imbalanced data in regression</article-title>. <source>Expert Syst. Appl.</source> <volume>252</volume>:<fpage>124118</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2024.124118</pub-id></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bernstein</surname><given-names>M.</given-names></name> <name><surname>Beech</surname><given-names>P. L.</given-names></name> <name><surname>Katz</surname><given-names>S. G.</given-names></name> <name><surname>Rosenbaum</surname><given-names>J. L.</given-names></name></person-group> (<year>1994</year>). <article-title>A new kinesin-like protein (Klp1) localized to a single microtubule of the <italic>Chlamydomonas</italic> flagellum</article-title>. <source>J. Cell Biol.</source> <volume>125</volume>, <fpage>1313</fpage>&#x2013;<lpage>1326</lpage>. doi: <pub-id pub-id-type="doi">10.1083/jcb.125.6.1313</pub-id>, <pub-id pub-id-type="pmid">8207060</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bienvenut</surname><given-names>W. V.</given-names></name> <name><surname>Espagne</surname><given-names>C.</given-names></name> <name><surname>Martinez</surname><given-names>A.</given-names></name> <name><surname>Majeran</surname><given-names>W.</given-names></name> <name><surname>Valot</surname><given-names>B.</given-names></name> <name><surname>Zivy</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>Dynamics of post-translational modifications and protein stability in the stroma of <italic>Chlamydomonas reinhardtii</italic> chloroplasts</article-title>. <source>Proteomics</source> <volume>11</volume>, <fpage>1734</fpage>&#x2013;<lpage>1750</lpage>. doi: <pub-id pub-id-type="doi">10.1002/pmic.201000634</pub-id>, <pub-id pub-id-type="pmid">21462344</pub-id></mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bj&#x00F6;rnsdotter</surname><given-names>E.</given-names></name> <name><surname>Nadzieja</surname><given-names>M.</given-names></name> <name><surname>Chang</surname><given-names>W.</given-names></name> <name><surname>Escobar-Herrera</surname><given-names>L.</given-names></name> <name><surname>Mancinotti</surname><given-names>D.</given-names></name> <name><surname>Angra</surname><given-names>D.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>VC1 catalyses a key step in the biosynthesis of vicine in faba bean</article-title>. <source>Nat. Plants</source> <volume>7</volume>, <fpage>923</fpage>&#x2013;<lpage>931</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41477-021-00950-w</pub-id>, <pub-id pub-id-type="pmid">34226693</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Brameier</surname><given-names>M.</given-names></name> <name><surname>Krings</surname><given-names>A.</given-names></name> <name><surname>Maccallum</surname><given-names>R. M.</given-names></name></person-group> (<year>2007</year>). <article-title>NucPred&#x2014;predicting nuclear localization of proteins</article-title>. <source>Bioinformatics</source> <volume>23</volume>, <fpage>1159</fpage>&#x2013;<lpage>1160</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btm066</pub-id></mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bruce</surname><given-names>B. D.</given-names></name></person-group> (<year>2001</year>). <article-title>The paradox of plastid transit peptides: conservation of function despite divergence in primary structure</article-title>. <source>Biochimica et Biophysica Acta (BBA)</source> <volume>1541</volume>, <fpage>2</fpage>&#x2013;<lpage>21</lpage>. doi: <pub-id pub-id-type="doi">10.1016/s0167-4889(01)00149-5</pub-id>, <pub-id pub-id-type="pmid">11750659</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Caspari</surname><given-names>O. D.</given-names></name></person-group> (<year>2022</year>). <article-title>Transit peptides often require downstream unstructured sequence for efficient chloroplast import in <italic>Chlamydomonas reinhardtii</italic></article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>:<fpage>825797</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpls.2022.825797</pub-id></mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Choi</surname><given-names>H. I.</given-names></name> <name><surname>Hwang</surname><given-names>S.-W.</given-names></name> <name><surname>Kim</surname><given-names>J.</given-names></name> <name><surname>Park</surname><given-names>B.</given-names></name> <name><surname>Jin</surname><given-names>E.</given-names></name> <name><surname>Choi</surname><given-names>I.-G.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Augmented CO<sub>2</sub> tolerance by expressing a single H<sup>+</sup>-pump enables microalgal valorization of industrial flue gas</article-title>. <source>Nat. Commun.</source> <volume>12</volume>:<fpage>6049</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41467-021-26325-5</pub-id>, <pub-id pub-id-type="pmid">34663809</pub-id></mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dyo</surname><given-names>Y. M.</given-names></name> <name><surname>Purton</surname><given-names>S.</given-names></name></person-group> (<year>2018</year>). <article-title>The algal chloroplast as a synthetic biology platform for production of therapeutic proteins</article-title>. <source>Microbiology</source> <volume>164</volume>, <fpage>113</fpage>&#x2013;<lpage>121</lpage>. doi: <pub-id pub-id-type="doi">10.1099/mic.0.000599</pub-id>, <pub-id pub-id-type="pmid">29297850</pub-id></mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Einhaus</surname><given-names>A.</given-names></name> <name><surname>Baier</surname><given-names>T.</given-names></name> <name><surname>Kruse</surname><given-names>O.</given-names></name></person-group> (<year>2024</year>). <article-title>Molecular design of microalgae as sustainable cell factories</article-title>. <source>Trends Biotechnol.</source> <volume>42</volume>, <fpage>728</fpage>&#x2013;<lpage>738</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.tibtech.2023.11.010</pub-id></mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Elnaggar</surname><given-names>A.</given-names></name> <name><surname>Heinzinger</surname><given-names>M.</given-names></name> <name><surname>Dallago</surname><given-names>C.</given-names></name> <name><surname>Rehawi</surname><given-names>G.</given-names></name> <name><surname>Wang</surname><given-names>Y.</given-names></name> <name><surname>Jones</surname><given-names>L.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>ProtTrans: toward understanding the language of life through self-supervised learning</article-title>. <source>IEEE Trans. Pattern Anal. Mach. Intell.</source> <volume>44</volume>, <fpage>7112</fpage>&#x2013;<lpage>7127</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TPAMI.2021.3095381</pub-id>, <pub-id pub-id-type="pmid">34232869</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Emanuelsson</surname><given-names>O.</given-names></name> <name><surname>Nielsen</surname><given-names>H.</given-names></name> <name><surname>Von Heijne</surname><given-names>G.</given-names></name></person-group> (<year>2008</year>). <article-title>ChloroP, a neural network-based method for predicting chloroplast transit peptides and their cleavage sites</article-title>. <source>Protein Sci.</source> <volume>8</volume>, <fpage>978</fpage>&#x2013;<lpage>984</lpage>. doi: <pub-id pub-id-type="doi">10.1110/ps.8.5.978</pub-id></mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ferro</surname><given-names>M.</given-names></name> <name><surname>Brugi&#x00E8;re</surname><given-names>S.</given-names></name> <name><surname>Salvi</surname><given-names>D.</given-names></name> <name><surname>Seigneurin-Berny</surname><given-names>D.</given-names></name> <name><surname>Court</surname><given-names>M.</given-names></name> <name><surname>Moyet</surname><given-names>L.</given-names></name> <etal/></person-group>. (<year>2010</year>). <article-title>AT_CHLORO, a comprehensive chloroplast proteome database with subplastidial localization and curated information on envelope proteins</article-title>. <source>Mol. Cell. Proteomics</source> <volume>9</volume>, <fpage>1063</fpage>&#x2013;<lpage>1084</lpage>. doi: <pub-id pub-id-type="doi">10.1074/mcp.M900325-MCP200</pub-id></mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Franz&#x00E9;n</surname><given-names>L.-G.</given-names></name> <name><surname>Rochaix</surname><given-names>J.-D.</given-names></name> <name><surname>Von Heijne</surname><given-names>G.</given-names></name></person-group> (<year>2001</year>). <article-title>Chloroplast transit peptides from the green alga <italic>Chlamydomonas reinhardtii</italic> share features with both mitochondrial and higher plant chloroplast presequences</article-title>. <source>FEBS Lett.</source> <volume>260</volume>, <fpage>165</fpage>&#x2013;<lpage>168</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0014-5793(90)80094-y</pub-id></mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Friedberg</surname><given-names>I.</given-names></name> <name><surname>Wei</surname><given-names>Q.</given-names></name> <name><surname>Dunbrack</surname><given-names>R. L.</given-names></name></person-group> (<year>2013</year>). <article-title>The role of balanced training and testing data sets for binary classifiers in bioinformatics</article-title>. <source>PLoS One</source> <volume>8</volume>:<fpage>e67863</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0067863</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gavel</surname><given-names>Y.</given-names></name> <name><surname>Von Heijne</surname><given-names>G.</given-names></name></person-group> (<year>2001</year>). <article-title>A conserved cleavage-site motif in chloroplast transit peptides</article-title>. <source>FEBS Lett.</source> <volume>261</volume>, <fpage>455</fpage>&#x2013;<lpage>458</lpage>. doi: <pub-id pub-id-type="doi">10.1016/0014-5793(90)80614-o</pub-id></mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Gould</surname><given-names>S. B.</given-names></name> <name><surname>Magiera</surname><given-names>J.</given-names></name> <name><surname>Garc&#x00ED;a Garc&#x00ED;a</surname><given-names>C.</given-names></name> <name><surname>Raval</surname><given-names>P. K.</given-names></name></person-group> (<year>2024</year>). <article-title>Reliability of plastid and mitochondrial localisation prediction declines rapidly with the evolutionary distance to the training set increasing</article-title>. <source>PLoS Comput. Biol.</source> <volume>20</volume>:<fpage>e1012575</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pcbi.1012575</pub-id>, <pub-id pub-id-type="pmid">39527633</pub-id></mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>He</surname><given-names>S.</given-names></name> <name><surname>Crans</surname><given-names>V. L.</given-names></name> <name><surname>Jonikas</surname><given-names>M. C.</given-names></name></person-group> (<year>2023</year>). <article-title>The pyrenoid: the eukaryotic CO<sub>2</sub>-concentrating organelle</article-title>. <source>Plant Cell</source> <volume>35</volume>, <fpage>3236</fpage>&#x2013;<lpage>3259</lpage>. doi: <pub-id pub-id-type="doi">10.1093/plcell/koad157</pub-id>, <pub-id pub-id-type="pmid">37279536</pub-id></mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kang</surname><given-names>B.-C.</given-names></name> <name><surname>Bae</surname><given-names>S.-J.</given-names></name> <name><surname>Lee</surname><given-names>S.</given-names></name> <name><surname>Lee</surname><given-names>J. S.</given-names></name> <name><surname>Kim</surname><given-names>A.</given-names></name> <name><surname>Lee</surname><given-names>H.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>Chloroplast and mitochondrial DNA editing in plants</article-title>. <source>Nat. Plants</source> <volume>7</volume>, <fpage>899</fpage>&#x2013;<lpage>905</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41477-021-00943-9</pub-id>, <pub-id pub-id-type="pmid">34211132</pub-id></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kong</surname><given-names>G.</given-names></name> <name><surname>Fan</surname><given-names>Y.</given-names></name> <name><surname>Wang</surname><given-names>J.</given-names></name> <name><surname>Yang</surname><given-names>Z.</given-names></name></person-group> (<year>2024a</year>). &#x201C;<article-title>Messenger RNA subcellular localization prediction via large language models and attention mechanisms</article-title>&#x201D; in (ed.) <person-group person-group-type="editor"><name><surname>Zhou</surname><given-names>M.</given-names></name></person-group> <source>2024 IEEE international conference on systems, man, and cybernetics (SMC)</source> (<publisher-loc>New Jersey</publisher-loc>: <publisher-name>IEEE</publisher-name>).</mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Kong</surname><given-names>G.</given-names></name> <name><surname>Wang</surname><given-names>J.</given-names></name> <name><surname>Fan</surname><given-names>Y.</given-names></name> <name><surname>Wang</surname><given-names>J.</given-names></name></person-group> (<year>2024b</year>). &#x201C;<article-title>SLP-T5: a new method for predicting protein sub-chloroplast localization based on text-to-text transfer transformer</article-title>&#x201D; in (eds.) <person-group person-group-type="editor"><name><surname>Cannataro</surname><given-names>M.</given-names></name> <name><surname>Zheng</surname><given-names>H.</given-names></name> <name><surname>Gao</surname><given-names>L.</given-names></name> <name><surname>Cheng</surname><given-names>J.</given-names></name> <name><surname>de Miranda</surname><given-names>J. L.</given-names></name> <name><surname>Zumpano</surname><given-names>E.</given-names></name> <etal/></person-group>. <source>2024 IEEE international conference on bioinformatics and biomedicine (BIBM)</source> (<publisher-loc>Lisbon</publisher-loc>: <publisher-name>BIBM</publisher-name>).</mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kong</surname><given-names>J. N.</given-names></name> <name><surname>Hardin</surname><given-names>K.</given-names></name> <name><surname>Dinkins</surname><given-names>M.</given-names></name> <name><surname>Wang</surname><given-names>G.</given-names></name> <name><surname>He</surname><given-names>Q.</given-names></name> <name><surname>Mujadzic</surname><given-names>T.</given-names></name> <etal/></person-group>. (<year>2015</year>). <article-title>Regulation of <italic>Chlamydomonas</italic> flagella and ependymal cell motile cilia by ceramide-mediated translocation of GSK3</article-title>. <source>Mol. Biol. Cell</source> <volume>26</volume>, <fpage>4451</fpage>&#x2013;<lpage>4465</lpage>. doi: <pub-id pub-id-type="doi">10.1091/mbc.E15-06-0371</pub-id></mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kunze</surname><given-names>M.</given-names></name> <name><surname>Berger</surname><given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>The similarity between <italic>N</italic>-terminal targeting signals for protein import into different organelles and its evolutionary relevance</article-title>. <source>Front. Physiol.</source> <volume>6</volume>:<fpage>259</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fphys.2015.00259</pub-id>, <pub-id pub-id-type="pmid">26441678</pub-id></mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname><given-names>D. W.</given-names></name> <name><surname>Hwang</surname><given-names>I.</given-names></name></person-group> (<year>2018</year>). <article-title>Evolution and design principles of the diverse chloroplast transit peptides</article-title>. <source>Mol. Cells</source> <volume>41</volume>, <fpage>161</fpage>&#x2013;<lpage>167</lpage>. doi: <pub-id pub-id-type="doi">10.14348/molcells.2018.0033</pub-id>, <pub-id pub-id-type="pmid">29487274</pub-id></mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lee</surname><given-names>H.</given-names></name> <name><surname>Lee</surname><given-names>S.</given-names></name> <name><surname>Lee</surname><given-names>I.</given-names></name> <name><surname>Nam</surname><given-names>H.</given-names></name></person-group> (<year>2022</year>). <article-title>AMP-BERT: prediction of antimicrobial peptide function based on a BERT model</article-title>. <source>Protein Sci.</source> <volume>32</volume>:<fpage>e4529</fpage>. doi: <pub-id pub-id-type="doi">10.1002/pro.4529</pub-id></mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Leister</surname><given-names>D.</given-names></name></person-group> (<year>2003</year>). <article-title>Chloroplast research in the genomic age</article-title>. <source>Trends Genet.</source> <volume>19</volume>, <fpage>47</fpage>&#x2013;<lpage>56</lpage>. doi: <pub-id pub-id-type="doi">10.1016/s0168-9525(02)00003-3</pub-id></mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>X.</given-names></name> <name><surname>Patena</surname><given-names>W.</given-names></name> <name><surname>Fauser</surname><given-names>F.</given-names></name> <name><surname>Jinkerson</surname><given-names>R. E.</given-names></name> <name><surname>Saroussi</surname><given-names>S.</given-names></name> <name><surname>Meyer</surname><given-names>M. T.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>A genome-wide algal mutant library and functional screen identifies genes required for eukaryotic photosynthesis</article-title>. <source>Nat. Genet.</source> <volume>51</volume>, <fpage>627</fpage>&#x2013;<lpage>635</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41588-019-0370-6</pub-id>, <pub-id pub-id-type="pmid">30886426</pub-id></mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Mackinder</surname><given-names>L. C. M.</given-names></name> <name><surname>Chen</surname><given-names>C.</given-names></name> <name><surname>Leib</surname><given-names>R. D.</given-names></name> <name><surname>Patena</surname><given-names>W.</given-names></name> <name><surname>Blum</surname><given-names>S. R.</given-names></name> <name><surname>Rodman</surname><given-names>M.</given-names></name> <etal/></person-group>. (<year>2017</year>). <article-title>A spatial interactome reveals the protein organization of the algal CO<sub>2</sub>-concentrating mechanism</article-title>. <source>Cell</source> <volume>171</volume>, <fpage>133</fpage>&#x2013;<lpage>147</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cell.2017.08.044</pub-id></mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Merchant</surname><given-names>S. S.</given-names></name> <name><surname>Prochnik</surname><given-names>S. E.</given-names></name> <name><surname>Vallon</surname><given-names>O.</given-names></name> <name><surname>Harris</surname><given-names>E. H.</given-names></name> <name><surname>Karpowicz</surname><given-names>S. J.</given-names></name> <name><surname>Witman</surname><given-names>G. B.</given-names></name> <etal/></person-group>. (<year>2007</year>). <article-title>The <italic>Chlamydomonas</italic> genome reveals the evolution of key animal and plant functions</article-title>. <source>Science</source> <volume>318</volume>, <fpage>245</fpage>&#x2013;<lpage>250</lpage>. doi: <pub-id pub-id-type="doi">10.1126/science.1143609</pub-id></mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Pawlicki</surname><given-names>M.</given-names></name> <name><surname>Chora&#x015B;</surname><given-names>M.</given-names></name> <name><surname>Kozik</surname><given-names>R.</given-names></name> <name><surname>Ho&#x0142;ubowicz</surname><given-names>W.</given-names></name></person-group> (<year>2020</year>). &#x201C;<article-title>On the impact of network data balancing in cybersecurity applications</article-title>&#x201D; in (eds.) <person-group person-group-type="editor"><name><surname>Krzhizhanovskaya</surname><given-names>V. V.</given-names></name> <name><surname>Z&#x00E1;vodszky</surname><given-names>G.</given-names></name> <name><surname>Lees</surname><given-names>M. H.</given-names></name> <name><surname>Dongarra</surname><given-names>J. J.</given-names></name> <name><surname>Sloot</surname><given-names>P. M. A.</given-names></name> <name><surname>Brissos</surname><given-names>S.</given-names></name> <etal/></person-group>. <source>Computational science&#x2014;ICCS 2020</source> (<publisher-loc>Amsterdam</publisher-loc>: <publisher-name>ICCS</publisher-name>), <fpage>196</fpage>&#x2013;<lpage>210</lpage>.</mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Ramundo</surname><given-names>S.</given-names></name> <name><surname>Asakura</surname><given-names>Y.</given-names></name> <name><surname>Salom&#x00E9;</surname><given-names>P. A.</given-names></name> <name><surname>Strenkert</surname><given-names>D.</given-names></name> <name><surname>Boone</surname><given-names>M.</given-names></name> <name><surname>Mackinder</surname><given-names>L. C. M.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Coexpressed subunits of dual genetic origin define a conserved supercomplex mediating essential protein import into chloroplasts</article-title>. <source>Proc. Natl. Acad. Sci. USA</source> <volume>117</volume>, <fpage>32739</fpage>&#x2013;<lpage>32749</lpage>. doi: <pub-id pub-id-type="doi">10.1073/pnas.2014294117</pub-id>, <pub-id pub-id-type="pmid">33273113</pub-id></mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Reiser</surname><given-names>L.</given-names></name> <name><surname>Subramaniam</surname><given-names>S.</given-names></name> <name><surname>Li</surname><given-names>D.</given-names></name> <name><surname>Huala</surname><given-names>E.</given-names></name></person-group> (<year>2017</year>). <article-title>Using the <italic>Arabidopsis</italic> information resource (TAIR) to find information about <italic>Arabidopsis</italic> genes</article-title>. <source>Curr. Protoc. Bioinform.</source> <volume>60</volume>, <fpage>1.11.1</fpage>&#x2013;<lpage>1.11.45</lpage>. doi: <pub-id pub-id-type="doi">10.1002/cpbi.36</pub-id></mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Sanaboyana</surname><given-names>V. R.</given-names></name> <name><surname>Elcock</surname><given-names>A. H.</given-names></name></person-group> (<year>2024</year>). <article-title>Improving signal and transit peptide predictions using AlphaFold2-predicted protein structures</article-title>. <source>J. Mol. Biol.</source> <volume>436</volume>:<fpage>168393</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jmb.2023.168393</pub-id>, <pub-id pub-id-type="pmid">38065275</pub-id></mixed-citation></ref>
<ref id="ref36"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Savojardo</surname><given-names>C.</given-names></name> <name><surname>Bruciaferri</surname><given-names>N.</given-names></name> <name><surname>Tartari</surname><given-names>G.</given-names></name> <name><surname>Martelli</surname><given-names>P. L.</given-names></name> <name><surname>Casadio</surname><given-names>R.</given-names></name> <name><surname>Cowen</surname><given-names>L.</given-names></name></person-group> (<year>2020</year>). <article-title>DeepMito: accurate prediction of protein sub-mitochondrial localization using convolutional neural networks</article-title>. <source>Bioinformatics</source> <volume>36</volume>, <fpage>56</fpage>&#x2013;<lpage>64</lpage>. doi: <pub-id pub-id-type="doi">10.1093/bioinformatics/btz512</pub-id></mixed-citation></ref>
<ref id="ref37"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Schein</surname><given-names>A. I.</given-names></name></person-group> (<year>2001</year>). <article-title>Chloroplast transit peptide prediction: a peek inside the black box</article-title>. <source>Nucleic Acids Res.</source> <volume>29</volume>:<fpage>e82</fpage>. doi: <pub-id pub-id-type="doi">10.1093/nar/29.16.e82</pub-id></mixed-citation></ref>
<ref id="ref38"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Surridge</surname><given-names>C.</given-names></name></person-group> (<year>2022</year>). <article-title>TOC&#x2013;TIC imports</article-title>. <source>Nat. Plants</source> <volume>8</volume>, <fpage>1333</fpage>&#x2013;<lpage>1333</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41477-022-01319-3</pub-id></mixed-citation></ref>
<ref id="ref39"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tardif</surname><given-names>M.</given-names></name> <name><surname>Atteia</surname><given-names>A.</given-names></name> <name><surname>Specht</surname><given-names>M.</given-names></name> <name><surname>Cogne</surname><given-names>G.</given-names></name> <name><surname>Rolland</surname><given-names>N.</given-names></name> <name><surname>Brugi&#x00E8;re</surname><given-names>S.</given-names></name> <etal/></person-group>. (<year>2012</year>). <article-title>PredAlgo: a new subcellular localization prediction tool dedicated to green algae</article-title>. <source>Mol. Biol. Evol.</source> <volume>29</volume>, <fpage>3625</fpage>&#x2013;<lpage>3639</lpage>. doi: <pub-id pub-id-type="doi">10.1093/molbev/mss178</pub-id>, <pub-id pub-id-type="pmid">22826458</pub-id></mixed-citation></ref>
<ref id="ref40"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Terashima</surname><given-names>M.</given-names></name> <name><surname>Specht</surname><given-names>M.</given-names></name> <name><surname>Naumann</surname><given-names>B.</given-names></name> <name><surname>Hippler</surname><given-names>M.</given-names></name></person-group> (<year>2010</year>). <article-title>Characterizing the anaerobic response of <italic>Chlamydomonas reinhardtii</italic> by quantitative proteomics</article-title>. <source>Mol. Cell. Proteomics</source> <volume>9</volume>, <fpage>1514</fpage>&#x2013;<lpage>1532</lpage>. doi: <pub-id pub-id-type="doi">10.1074/mcp.M900421-MCP200</pub-id></mixed-citation></ref>
<ref id="ref41"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Teufel</surname><given-names>F.</given-names></name> <name><surname>Almagro Armenteros</surname><given-names>J. J.</given-names></name> <name><surname>Johansen</surname><given-names>A. R.</given-names></name> <name><surname>G&#x00ED;slason</surname><given-names>M. H.</given-names></name> <name><surname>Pihl</surname><given-names>S. I.</given-names></name> <name><surname>Tsirigos</surname><given-names>K. D.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>SignalP 6.0 predicts all five types of signal peptides using protein language models</article-title>. <source>Nat. Biotechnol.</source> <volume>40</volume>, <fpage>1023</fpage>&#x2013;<lpage>1025</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41587-021-01156-3</pub-id>, <pub-id pub-id-type="pmid">34980915</pub-id></mixed-citation></ref>
<ref id="ref42"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>L.</given-names></name> <name><surname>Patena</surname><given-names>W.</given-names></name> <name><surname>Van Baalen</surname><given-names>K. A.</given-names></name> <name><surname>Xie</surname><given-names>Y.</given-names></name> <name><surname>Singer</surname><given-names>E. R.</given-names></name> <name><surname>Gavrilenko</surname><given-names>S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>A chloroplast protein atlas reveals punctate structures and spatial organization of biosynthetic pathways</article-title>. <source>Cell</source> <volume>186</volume>, <fpage>3499</fpage>&#x2013;<lpage>3518</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.cell.2023.06.008</pub-id></mixed-citation></ref>
<ref id="ref43"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yue</surname><given-names>Y.</given-names></name> <name><surname>Ye</surname><given-names>C.</given-names></name> <name><surname>Peng</surname><given-names>P.-Y.</given-names></name> <name><surname>Zhai</surname><given-names>H.-X.</given-names></name> <name><surname>Ahmad</surname><given-names>I.</given-names></name> <name><surname>Xia</surname><given-names>C.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>A deep learning framework for identifying essential proteins based on multiple biological information</article-title>. <source>BMC Bioinformatics</source> <volume>23</volume>:<fpage>318</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s12859-022-04868-8</pub-id>, <pub-id pub-id-type="pmid">35927611</pub-id></mixed-citation></ref>
<ref id="ref44"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhan</surname><given-names>Y.</given-names></name> <name><surname>Marchand</surname><given-names>C. H.</given-names></name> <name><surname>Maes</surname><given-names>A.</given-names></name> <name><surname>Mauries</surname><given-names>A.</given-names></name> <name><surname>Sun</surname><given-names>Y.</given-names></name> <name><surname>Dhaliwal</surname><given-names>J. S.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>Pyrenoid functions revealed by proteomics in <italic>Chlamydomonas reinhardtii</italic></article-title>. <source>PLoS One</source> <volume>13</volume>:<fpage>e0185039</fpage>. doi: <pub-id pub-id-type="doi">10.1371/journal.pone.0185039</pub-id>, <pub-id pub-id-type="pmid">29481573</pub-id></mixed-citation></ref>
<ref id="ref45"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname><given-names>L.</given-names></name> <name><surname>Poschmann</surname><given-names>G.</given-names></name> <name><surname>Waldera-Lupa</surname><given-names>D.</given-names></name> <name><surname>Rafiee</surname><given-names>N.</given-names></name> <name><surname>Kollmann</surname><given-names>M.</given-names></name> <name><surname>St&#x00FC;hler</surname><given-names>K.</given-names></name></person-group> (<year>2019</year>). <article-title>OutCyte: a novel tool for predicting unconventional protein secretion</article-title>. <source>Sci. Rep.</source> <volume>9</volume>:<fpage>19448</fpage>. doi: <pub-id pub-id-type="doi">10.1038/s41598-019-55351-z</pub-id>, <pub-id pub-id-type="pmid">31857603</pub-id></mixed-citation></ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0002"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/492362/overview">Neha Arora</ext-link>, Skidmore College, United States</p></fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0003"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/26501/overview">Michael Hippler</ext-link>, University of M&#x00FC;nster, Germany</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3209806/overview">Ge Kong</ext-link>, Beihang University, China</p></fn>
</fn-group>
<glossary>
<def-list>
<title>Glossary</title>
<def-item>
<term>AA</term>
<def>
<p>Amino acid</p>
</def>
</def-item>
<def-item>
<term>AI</term>
<def>
<p>Artificial intelligence</p>
</def>
</def-item>
<def-item>
<term>AUC</term>
<def>
<p>Area under the ROC curve</p>
</def>
</def-item>
<def-item>
<term>CO<sub>2</sub></term>
<def>
<p>Carbon dioxide</p>
</def>
</def-item>
<def-item>
<term>CP</term>
<def>
<p>Chloroplast protein</p>
</def>
</def-item>
<def-item>
<term>cTP</term>
<def>
<p>Chloroplast transit peptide</p>
</def>
</def-item>
<def-item>
<term>DNA</term>
<def>
<p>Deoxyribonucleic acid</p>
</def>
</def-item>
<def-item>
<term>FN</term>
<def>
<p>False negative</p>
</def>
</def-item>
<def-item>
<term>FP</term>
<def>
<p>False positive</p>
</def>
</def-item>
<def-item>
<term>FPR</term>
<def>
<p>False positive rate</p>
</def>
</def-item>
<def-item>
<term>mTP</term>
<def>
<p>Mitochondrial targeting peptide</p>
</def>
</def-item>
<def-item>
<term>ProtBERT-BFD</term>
<def>
<p>Protein-based bidirectional encoder representations from transformers&#x2014;big fantastic database</p>
</def>
</def-item>
<def-item>
<term>RNN-BiLSTM</term>
<def>
<p>Recurrent neural network&#x2014;bidirectional long short-term memory</p>
</def>
</def-item>
<def-item>
<term>ROC</term>
<def>
<p>Receiver operating characteristic</p>
</def>
</def-item>
<def-item>
<term>SOTA</term>
<def>
<p>State-of-the-art</p>
</def>
</def-item>
<def-item>
<term>TAIR</term>
<def>
<p>The Arabidopsis Information Resource</p>
</def>
</def-item>
<def-item>
<term>TN</term>
<def>
<p>True negative</p>
</def>
</def-item>
<def-item>
<term>TOC-TIC</term>
<def>
<p>Translocon at the outer chloroplast membrane&#x2014;translocon at the inner chloroplast membrane</p>
</def>
</def-item>
<def-item>
<term>TP</term>
<def>
<p>True positive</p>
</def>
</def-item>
<def-item>
<term>TPR</term>
<def>
<p>True positive rate</p>
</def>
</def-item>
</def-list>
</glossary>
</back>
</article>