<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1730047</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Fine-grained few-shot class-incremental identification of medicinal plants via frequency-aware contrastive learning</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Tan</surname><given-names>Chaoqun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3009377/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Qin</surname><given-names>Zhonghan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Tang</surname><given-names>Zihan</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Huang</surname><given-names>Yongliang</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Li</surname><given-names>Ke</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project-administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Intelligent Medicine, Chengdu University of Traditional Chinese Medicine</institution>, <city>Chengdu</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>National Key Laboratory of Fundamental Science on Synthetic Vision, School of Computer Science, Sichuan University</institution>, <city>Chengdu</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>School of Economics, Southwestern University of Finance and Economics</institution>, <city>Chengdu</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff4"><label>4</label><institution>Department of Pharmacy, Hospital of Chengdu University of Traditional Chinese Medicine</institution>, <city>Chengdu</city>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Ke Li, <email xlink:href="mailto:likescu@scu.edu.cn">likescu@scu.edu.cn</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-13">
<day>13</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1730047</elocation-id>
<history>
<date date-type="received">
<day>22</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>06</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Tan, Qin, Tang, Huang and Li.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Tan, Qin, Tang, Huang and Li</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-13">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Developing robust algorithmic tools for accurately identifying diverse medicinal plant species is critical for advancing precision medicine. Although deep learning methods have shown considerable promise, they generally require large-scale annotated datasets, which are often difficult to acquire given the vast taxonomic diversity and limited labeled samples available for many plant species. To address this, we propose a novel Frequency-Aware Guided Domain Enhancement Contrastive Learning (FGDE) framework, designed to incrementally learn new categories from few annotated examples while alleviating catastrophic forgetting and overfitting. Our approach integrates high- and low-frequency components to refine feature representations, using multi-frequency fusion to preserve detail-enhanced information. Contrastive learning is further employed to strengthen multi-semantic aggregation and extract discriminative features across both visual and label domains. Additionally, we introduce a multi-objective loss function to enhance semantic compactness within base classes and improve separation among incremental classes. Extensive experiments demonstrate that FGDE significantly outperforms state-of-the-art methods on our collected dataset and two public benchmarks. These results underscore the potential of our model to support practical applications in intelligent plant identification and precision agriculture.</p>
</abstract>
<kwd-group>
<kwd>contrastive learning</kwd>
<kwd>fine-grained few-shot incremental learning</kwd>
<kwd>frequency-aware</kwd>
<kwd>identification</kwd>
<kwd>medicinal plant</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This study was funded by the National Natural Science Foundation of China (No. 82405033), Natural Science Foundation of Sichuan Province (No. 2026NSFSC1837), China Postdoctoral Science Foundation (No. 2025MD774046), Sichuan Provincial Department of Human Resources and Social Security-Postdoctoral Research Special Foundation (No. TB2025094) and the Research Promotion Plan for Xinglin Scholars in Chengdu University of Traditional Chinese Medicine (No. BSZ2024030) and (No. QJRC2024007).</funding-statement>
</funding-group>
<counts>
<fig-count count="12"/>
<table-count count="6"/>
<equation-count count="28"/>
<ref-count count="48"/>
<page-count count="18"/>
<word-count count="8682"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Sustainable and Intelligent Phytoprotection</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Medicinal plants, renowned for their therapeutic properties and historical significance, play a pivotal role in the clinical practice of traditional medicine (<xref ref-type="bibr" rid="B31">Sun et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B45">Zang et&#xa0;al., 2025</xref>). Consequently, they have garnered significant attention from both traditional healers and modern medical practitioners (<xref ref-type="bibr" rid="B39">Wang et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B3">Armijos et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B6">Chen et&#xa0;al., 2025</xref>). However, confusion among different varieties has been reported to affect quality and commercial value, raising increasing public concern (<xref ref-type="bibr" rid="B42">Xiao et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B46">Zhang et&#xa0;al., 2022</xref>; <xref ref-type="bibr" rid="B36">Vani et&#xa0;al., 2025</xref>). Therefore, accurate authentication of medicinal plant species is critical for practical applications. Conventionally, detecting active ingredients such as organic acids and flavonoids serves as the gold standard for identifying medicinal plant varieties (<xref ref-type="bibr" rid="B41">Wu et&#xa0;al., 2025</xref>). While these laboratory-based methods offer high precision, they are often time-consuming, costly, and reliant on specialized equipment (<xref ref-type="bibr" rid="B9">Fitzgerald et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B43">Xiao et&#xa0;al., 2025</xref>). Alternatively, intelligent sensory technologies combined with chemometric methods have gained traction, yet they remain constrained by specific instrumentation requirements.</p>
<p>With recent advancements in Deep Learning (DL), computer vision has emerged as a promising, non-destructive, and rapid solution for plant identification, demonstrating remarkable success in medical image classification (<xref ref-type="bibr" rid="B24">Pandey and Jain, 2022</xref>; <xref ref-type="bibr" rid="B16">Huang et&#xa0;al., 2025</xref>; <xref ref-type="bibr" rid="B38">Wang et&#xa0;al., 2025</xref>). The efficacy of DL-based approaches in automating taxonomy is widely acknowledged (<xref ref-type="bibr" rid="B4">Attri et&#xa0;al., 2023</xref>). However, these data-driven models typically rely on large-scale annotated datasets to learn robust feature representations (<xref ref-type="bibr" rid="B37">Wang et&#xa0;al., 2021</xref>). In the context of medicinal plants, the sheer diversity of species renders the construction of comprehensive, large-scale annotated datasets impractical. Furthermore, acquiring images across a broad spectrum of varieties presents significant challenges due to the inherent difficulties in sample collection (<xref ref-type="bibr" rid="B20">LeCun et&#xa0;al., 2015</xref>; <xref ref-type="bibr" rid="B33">Tan et&#xa0;al., 2024</xref>). Consequently, how can we design a model capable of effectively learning feature representations from limited annotated data? Developing a system that can rapidly adapt to new concepts using only a few annotated samples would be highly beneficial for the advancement of the field.</p>
<p>Few-Shot Learning (FSL) (<xref ref-type="bibr" rid="B8">Fei-Fei et&#xa0;al., 2006</xref>; <xref ref-type="bibr" rid="B10">Gao et&#xa0;al., 2023</xref>) aims to enable image classification models to adapt to new tasks using scarce annotated samples. These frameworks typically involve a training phase for model adaptability and an adaptation phase for new tasks (<xref ref-type="bibr" rid="B7">Dvornik et&#xa0;al., 2019</xref>). Several studies have successfully applied FSL to plant analysis, such as leaf classification (<xref ref-type="bibr" rid="B2">Arg&#xfc;eso et&#xa0;al., 2020</xref>), plant detection (<xref ref-type="bibr" rid="B28">Rezaei et&#xa0;al., 2024</xref>), and hyperspectral categorization (<xref ref-type="bibr" rid="B5">Cai et&#xa0;al., 2023</xref>). However, standard FSL methods are prone to catastrophic forgetting, where adapting to new tasks degrades performance on previous ones. To mitigate this, Few-Shot Class-Incremental Learning (FSCIL) (<xref ref-type="bibr" rid="B34">Tao et&#xa0;al., 2020</xref>) was introduced, utilizing techniques like neural gas networks (<xref ref-type="bibr" rid="B22">Martinetz and Schulten, 1991</xref>; <xref ref-type="bibr" rid="B25">Prudent and Ennaji, 2005</xref>) to dynamically model feature space topology. Despite this progress, mainstream approaches (<xref ref-type="bibr" rid="B1">Ahmed et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B11">Han et&#xa0;al., 2024</xref>) often employ a frozen backbone pre-trained with cross-entropy loss. This strategy frequently fails to effectively separate class margins, leading to poor generalization (<xref ref-type="bibr" rid="B26">Raichur et&#xa0;al., 2024</xref>; <xref ref-type="bibr" rid="B48">Zhou et&#xa0;al., 2024</xref>). Moreover, the data often presents fine-grained features: minimal distinction between different species (low inter-class variance) and significant variation within the same species (high intra-class variance). Such ambiguity hinders the model&#x2019;s ability to discriminate between new and old classes, resulting in false classifications.</p>
<p>Most existing techniques in fine-grained classification primarily focus on extracting image edge signals or high-frequency features (<xref ref-type="bibr" rid="B30">Song et&#xa0;al., 2023</xref>). While these detailed features are generally effective in revealing subtle inter-class differences, it remains essential to further sharpen the distinction between fine-grained classes and to achieve clearer clustering of novel and old data, even with limited samples.</p>
<p>Motivated by these challenges, this paper proposes a novel Frequency-Aware Guided Domain Enhancement Contrastive Learning Model (FGDE). This framework constructs discriminative features by integrating high- and low-frequency components and leverages the class-clustering capability of contrastive learning. The result is a feature distribution characterized by improved intra-class compactness and inter-class separability. As illustrated in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>, the detailed features are refined by incorporating high-frequency components to enhance domain-specific representations (<xref ref-type="bibr" rid="B21">Li et&#xa0;al., 2023</xref>). The proposed method is described in the third section, and the experimental results and analysis are shown in the fourth section. The main contributions of this paper are summarized as follows:</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Illustration of our FGDE. Different classes are marked in different colors. Our proposed network extracts the high-frequency and low-frequency features using the Discrete Cosine Transform (DCT). Enhanced features improve the clustering performance of the model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g001.tif">
<alt-text content-type="machine-generated">Diagram showing three versions of an image: original, low-frequency, and high-frequency components, processed through a model. The output includes base classes and incremental sequences represented by color-coded dots.</alt-text>
</graphic></fig>
<list list-type="order">
<list-item>
<p>A novel Frequency-Aware Guided Domain Enhancement Contrastive Learning Model (FGDE) is proposed to strengthen the fine-grained semantic extension of base classes and the separation of subsequent classes. It achieves detail-enhanced feature representation by integrating multi-frequency components, thereby refining domain-specific distinctions.</p></list-item>
<list-item>
<p>We exploit high-frequency and low-frequency components to enrich the original features and unearth class-discriminative information in both the visual and label domains. This enhances multi-semantic aggregation awareness, facilitating more precise differentiation of fine-grained images.</p>
<list-item>
<p>We introduce a multi-objective loss combining contrastive, cross-entropy, and feature augmentation terms. This mechanism minimizes intra-class variance while maximizing inter-class separation, significantly enhancing the model&#x2019;s discriminative power and generalization capabilities.</p>
<list-item>
<p>We demonstrate robust performance on our collected dataset and two public benchmarks, outperforming previous state-of-the-art methods. Furthermore, we perform a thorough analysis to evaluate the contribution of each component.</p>
</list>
</sec>
<sec id="s2">
<label>2</label>
<title>Data collection and preprocessing</title>
<sec id="s2_1">
<label>2.1</label>
<title>Sample preparation</title>
<p>We collected 28 different specimens and their derived products, all of which were sourced from the Lotus Pond Chinese Medicinal Plant Market in Chengdu, China. These samples were authenticated by experts from the Chengdu Institute of Food and Drug Control (Chengdu, China). The dried samples were obtained from the original intact specimens. Post-collection, they were stored under standard cold-storage conditions.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Data acquisition</title>
<p>A self-developed high-resolution data acquisition device equipped with a Canon EOS 60D camera, shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2A</bold></xref>, was used to acquire the images. The device is composed of a box, a lighting system, and an image acquisition system, which together provide stable and consistent environmental conditions. The image acquisition process is illustrated in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref>.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>From image acquisition to detection results. <bold>(A)</bold> Image Acquisition, <bold>(B)</bold> Image Data, <bold>(C)</bold> Image Detection.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g002.tif">
<alt-text content-type="machine-generated">A three-panel process image. Panel A shows a camera setup above a lightbox for imaging objects. Panel B displays numerous images acquired by A. Panel C shows the cropped results.</alt-text>
</graphic></fig>
<p>All images are captured using a 35 mm CMOS sensor with a resolution of 5120&#xd7;3840, as shown in <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2B</bold></xref>. Images are annotated and cropped to obtain the target, see <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2C</bold></xref>. We remove incomplete, blurry, and otherwise inappropriate images. Our collected dataset is shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>. Because the training dataset can be highly unbalanced, for instance across the easily confused classes within our dataset, we address this issue by balancing each class through data augmentation. Specifically, we augment the data to ensure a uniform distribution of 250 samples in each class, as sketched below.</p>
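<p>As an illustration only, this balancing step can be sketched as follows; the helper names (<monospace>augment_once</monospace>, <monospace>TARGET_PER_CLASS</monospace>) are ours, and the concrete augmentation operators are assumptions rather than part of the released pipeline.</p>
<code language="python">import random
from collections import defaultdict

TARGET_PER_CLASS = 250  # uniform per-class size used in our dataset

def balance_by_augmentation(samples, augment_once):
    """samples: list of (image, label) pairs; augment_once: callable applying one
    random transform (e.g. flip, rotation, color jitter) to an image."""
    by_class = defaultdict(list)
    for img, lbl in samples:
        by_class[lbl].append(img)

    balanced = []
    for lbl, imgs in by_class.items():
        pool = list(imgs)
        # keep augmenting randomly chosen source images until the class reaches the target size
        while len(pool) &lt; TARGET_PER_CLASS:
            pool.append(augment_once(random.choice(imgs)))
        balanced.extend((img, lbl) for img in pool[:TARGET_PER_CLASS])
    return balanced</code>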
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Random samples from the dataset, which consists of 28 different CHMs and their processed products. Namely <bold>(A)</bold> chaoshanzha <bold>(B)</bold> honghuajiao <bold>(C)</bold> jiaoshanzha <bold>(D)</bold> hanyuanhuajiao <bold>(E)</bold> shanzhatan <bold>(F)</bold> qingjiao <bold>(G)</bold> sichuanhuajiao <bold>(H)</bold> tengjiao <bold>(I)</bold> jiangbanxia <bold>(J)</bold> lubei <bold>(K)</bold> songbei <bold>(L)</bold> shengbanxia.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g003.tif">
<alt-text content-type="machine-generated">Twelve panels labeled A to L display different biological cross-sections and seeds. Each panel shows a series of four images, highlighting variations in texture, color, and shape. Panels A and B depict ring-like cross-sections; C and D show seed pods; E and F display round, fruit-like shapes; G to L present various seeds and seeds within fruits, each with unique surface details and colors ranging from pale to dark.</alt-text>
</graphic></fig>
<p>Different processed products of <italic>Shanzha</italic> include <italic>Chaoshanzha</italic>, <italic>Jiaoshanzha</italic>, and <italic>Shanzhatan</italic>. Similarly, various processing methods are applied to <italic>Banxia</italic> to obtain different products, including <italic>Jiangbanxia</italic>, <italic>Fabanxia</italic>, <italic>Qingbanxia</italic>, and <italic>Jingbanxia</italic>. However, <italic>Shuibanxia</italic> is often used as a counterfeit of <italic>Qingbanxia</italic>. Additionally, <italic>Jiangnanxing</italic>, a processed product derived from Tiger&#x2019;s Paw Southern Star, is commonly passed off as <italic>Jiangbanxia</italic> in the commercial market. <italic>Lubeimu</italic>, <italic>Qingbeimu</italic>, and <italic>Songbeimu</italic> are the forms of <italic>Chuanbeimu</italic> most commonly found in circulation. Based on the image properties, all data are processed with object detection to remove redundant pixels that contain no information.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methods</title>
<sec id="s3_1">
<label>3.1</label>
<title>Problem definition</title>
<p>Continuous incremental sessions are the key characteristic of FSCIL. In this paper, the first session learns a generalizable representation, after which multiple few-shot incremental sessions are executed. The set <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msubsup><mml:mi>D</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> is the training data from session <inline-formula>
<mml:math display="inline" id="im2"><mml:mi>t</mml:mi></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> are the <inline-formula>
<mml:math display="inline" id="im5"><mml:mi>i</mml:mi></mml:math></inline-formula>-th image and corresponding label respectively. The training images are expressed as <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mi>s</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msup><mml:mi>D</mml:mi><mml:mn>0</mml:mn></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>D</mml:mi><mml:mn>1</mml:mn></mml:msup><mml:mo>&#x2026;</mml:mo><mml:mo>.</mml:mo><mml:msup><mml:mi>D</mml:mi><mml:mi>N</mml:mi></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. For the initial sequence <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msup><mml:mtext>D</mml:mtext><mml:mn>0</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula>, the image domain contains <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:msup><mml:mi>C</mml:mi><mml:mn>0</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula> classes, and the label domain is <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msup><mml:mi>L</mml:mi><mml:mn>0</mml:mn></mml:msup></mml:mrow></mml:math></inline-formula>. For subsequent incremental sequences, the label domain has no overlap, the rest contained in new classes are invisible in base data. When the <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:msubsup><mml:mi>D</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mi>i</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula> are trained, and the model is tested in <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:msubsup><mml:mtext>D</mml:mtext><mml:mrow><mml:mtext>test</mml:mtext></mml:mrow><mml:mtext>i</mml:mtext></mml:msubsup></mml:mrow></mml:math></inline-formula>, which contains all encountered classes <inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>&#x222a;</mml:mo><mml:mtext>&#x2009;</mml:mtext><mml:msub><mml:mi>C</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>&#x222a;</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mo>&#x2026;</mml:mo><mml:mo>.</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> in the <inline-formula>
<mml:math display="inline" id="im13"><mml:mi>t</mml:mi></mml:math></inline-formula>-th subsequent. In FSCIL, the initial sequence is with many samples, and the model only has access to a few samples in the following subsequent. Specifically, the incremental data are always organized in N-way K-shot format, N is the class, and K represents the training images of each class.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Overview</title>
<p>The architecture of our proposed FGDE model is illustrated in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>. In the first phase, multiple predefined augmentations are applied to enrich the fine-grained images. Subsequently, the Discrete Cosine Transform (DCT) is employed to extract multi-frequency features, which are fused with the original images to construct high-frequency and low-frequency enhanced representations. Simultaneously, label representations are expanded to encapsulate the semantic consistency of the images. The visual patches and these expanded labels interact to ensure cross-modal alignment and refine the embedding space, thereby improving the separability of base classes. In the second phase, generated contrastive learning pairs are utilized to enhance multi-semantic aggregation and mine class-discriminative information. Here, semantic granularity is enriched via contrastive learning. We jointly optimize contrastive, feature augmentation, and cross-entropy losses to minimize intra-class variance and maximize inter-class distance. During the third phase, the model adapts to new classes using limited few-shot samples. A similarity metric is employed to assign incoming samples to their respective prototypes, ensuring robust generalization and stability while mitigating catastrophic forgetting.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The overall pipeline of our FGDE framework consists of three phases. Phase 1 emphasizes learning richer representations of the original space for both the image and label domains through multiple predefined transformations. Phase 2 involves leveraging contrastive learning to distinguish between positive and negative sample pairs. Phase 3 focuses on training the limited new classes to mitigate catastrophic forgetting and reduce overfitting.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g004.tif">
<alt-text content-type="machine-generated">The overall pipeline of our FGDE framework consists of three phases.</alt-text>
</graphic></fig>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Frequency-aware guided multi-semantic feature enhancement</title>
<p>Data scarcity in base classes restricts the diversity of learned semantic features, leading to poor generalization and unclear class boundaries. To enhance feature robustness, we apply targeted visual transformations, focusing on color and shape as suggested by previous studies. Specifically, we employ random cropping to expand the fine-grained feature space. Given an image <inline-formula>
<mml:math display="inline" id="im14"><mml:mi>X</mml:mi></mml:math></inline-formula>, the cropping dimensions are defined as <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x223c;</mml:mo><mml:mtext>Rand</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im16"><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x223c;</mml:mo><mml:mtext>Rand</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. Given a rand point (<inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>r</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im18"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>r</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>), where <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mi>w</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im20"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mi>h</mml:mi><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. Thus, the point (<inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>f</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im22"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>f</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>) of the lower right corner of the final cropping area are computed in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref>:</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>f</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>w</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>f</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>r</mml:mi></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:math>
</disp-formula>
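<p>Equation 1 corresponds to sampling the crop size and a random top-left corner, then deriving the bottom-right corner; a minimal sketch is given below (the parameter names are ours):</p>
<code language="python">import random

def random_crop_box(w, h, w_min, w_max, h_min, h_max):
    """Sample a crop of random size inside a w x h image and return its corners (Equation 1)."""
    w_crop = random.randint(w_min, w_max)
    h_crop = random.randint(h_min, h_max)
    x_r = random.randint(0, w - w_crop)   # random top-left corner
    y_r = random.randint(0, h - h_crop)
    x_f = x_r + w_crop - 1                # bottom-right corner, Equation 1
    y_f = y_r + h_crop - 1
    return (x_r, y_r), (x_f, y_f)</code>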
<p>Random cropping of varying sizes is employed to capture local information, enhancing both local feature understanding and fine-grained semantic perception. To further enrich class-aware semantics, we introduce a transformation set <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:mi>&#x2131;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>T</mml:mi><mml:mi>r</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, consisting of color jittering (<inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>) and random rotation (<inline-formula>
<mml:math display="inline" id="im25"><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mi>r</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>). The processed RGB images are then transformed into the frequency domain using the 2D DCT, which expresses pixel data via a linear combination of cosine basis functions. Leveraging the superior energy compaction of DCT over the complex-valued DFT (<xref ref-type="bibr" rid="B12">He et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B16">Huang et&#xa0;al., 2025</xref>), each channel of the input image <inline-formula>
<mml:math display="inline" id="im26"><mml:mi>X</mml:mi></mml:math></inline-formula> is converted to the frequency spectrum <inline-formula>
<mml:math display="inline" id="im27"><mml:mrow><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>:</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>b</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>W</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mi>cos</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>&#x3c0;</mml:mi><mml:mi>h</mml:mi></mml:mrow><mml:mi>H</mml:mi></mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>+</mml:mo><mml:mn>0.5</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mrow></mml:mstyle><mml:mi>cos</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>&#x3c0;</mml:mi><mml:mi>w</mml:mi></mml:mrow><mml:mi>W</mml:mi></mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>b</mml:mi><mml:mo>+</mml:mo><mml:mn>0.5</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im28"><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mi>H</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mi>w</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mi>W</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> represent the horizontal and vertical frequency indices. The normalization coefficients <inline-formula>
<mml:math display="inline" id="im29"><mml:mrow><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im30"><mml:mrow><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mi>w</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> are defined as in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>:</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:msqrt><mml:mrow><mml:mn>1</mml:mn><mml:mo stretchy="false">/</mml:mo><mml:mi>N</mml:mi></mml:mrow></mml:msqrt><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>if&#xa0;</mml:mtext><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:msqrt><mml:mrow><mml:mn>2</mml:mn><mml:mo stretchy="false">/</mml:mo><mml:mi>N</mml:mi></mml:mrow></mml:msqrt><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext>otherwise</mml:mtext></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow><mml:mo>,</mml:mo><mml:mtext>where&#xa0;</mml:mtext><mml:mi>N</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>H</mml:mi><mml:mo>,</mml:mo><mml:mi>W</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>&#xa0;</mml:mo></mml:mrow></mml:math>
</disp-formula>
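<p>Equations 2 and 3 together define the orthonormal (type-II) 2D DCT. A NumPy sketch that builds the 1D basis with the coefficients of Equation 3 and applies it separably is shown below; an equivalent result can be obtained with <monospace>scipy.fft.dctn(x, norm="ortho")</monospace>.</p>
<code language="python">import numpy as np

def dct_basis(n):
    """Orthonormal DCT-II basis matrix using the alpha coefficients of Equation 3."""
    k = np.arange(n)[:, None]               # frequency index
    a = np.arange(n)[None, :]               # spatial index
    basis = np.cos(np.pi * k * (a + 0.5) / n)
    alpha = np.full((n, 1), np.sqrt(2.0 / n))
    alpha[0, 0] = np.sqrt(1.0 / n)
    return alpha * basis

def dct2(channel):
    """2D DCT of one image channel (Equation 2): P = C_H @ X @ C_W^T."""
    h, w = channel.shape
    return dct_basis(h) @ channel @ dct_basis(w).T</code>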
<p>In the resulting spectrum <inline-formula>
<mml:math display="inline" id="im31"><mml:mrow><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, low-frequency components are concentrated near the origin <inline-formula>
<mml:math display="inline" id="im32"><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, while high-frequency components are distributed in the peripheral regions. Then, we apply a binary mask <inline-formula>
<mml:math display="inline" id="im33"><mml:mi>M</mml:mi></mml:math></inline-formula> to separate the spectrum into low-frequency and high-frequency components. We define a cut-off threshold <inline-formula>
<mml:math display="inline" id="im34"><mml:mi>&#x3c4;</mml:mi></mml:math></inline-formula> based on the Manhattan distance in the frequency domain. The mask <inline-formula>
<mml:math display="inline" id="im35"><mml:mi>M</mml:mi></mml:math></inline-formula> is defined as in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>:</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>M</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mtable><mml:mtr><mml:mtd><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext mathvariant="italic">if&#xa0;&#xa0;</mml:mtext><mml:mi>h</mml:mi><mml:mo>+</mml:mo><mml:mi>w</mml:mi><mml:mo>&#x2264;</mml:mo><mml:mi>&#x3c4;</mml:mi></mml:mrow></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:mtd><mml:mtd><mml:mrow><mml:mtext mathvariant="italic">otherwise</mml:mtext></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>Subsequently, the low-frequency spectrum <inline-formula>
<mml:math display="inline" id="im36"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> and high-frequency spectrum <inline-formula>
<mml:math display="inline" id="im37"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula> are derived via the Hadamard product (<inline-formula>
<mml:math display="inline" id="im38"><mml:mo>&#x2299;</mml:mo></mml:math></inline-formula>): <inline-formula>
<mml:math display="inline" id="im39"><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x2299;</mml:mo><mml:mi>M</mml:mi><mml:mo>,</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mi>i</mml:mi><mml:mi>g</mml:mi><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:msup><mml:mo>&#x2299;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn mathvariant="bold">1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>M</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. Finally, we project the masked spectra back to the spatial domain using the 2D Inverse DCT (IDCT) in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>:</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:msubsup><mml:mover accent="true"><mml:mi>I</mml:mi><mml:mo>&#x2dc;</mml:mo></mml:mover><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mrow><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>b</mml:mi><mml:mo>=</mml:mo><mml:mn>0</mml:mn></mml:mrow><mml:mrow><mml:mi>W</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup></mml:mstyle><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:msub><mml:mi>&#x3b1;</mml:mi><mml:mi>w</mml:mi></mml:msub><mml:msubsup><mml:mi>P</mml:mi><mml:mrow><mml:mi>h</mml:mi><mml:mo>,</mml:mo><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>d</mml:mi></mml:mrow></mml:msubsup><mml:mi>cos</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>&#x3c0;</mml:mi><mml:mi>h</mml:mi></mml:mrow><mml:mi>H</mml:mi></mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>+</mml:mo><mml:mn>0.5</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle><mml:mi>cos</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>&#x3c0;</mml:mi><mml:mi>w</mml:mi></mml:mrow><mml:mi>W</mml:mi></mml:mfrac><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>b</mml:mi><mml:mo>+</mml:mo><mml:mn>0.5</mml:mn></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
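<p>A sketch of the Manhattan-distance mask and the low/high split of Equations 4 and 5 follows; the cut-off <monospace>tau</monospace> is a hyperparameter, and SciPy&#x2019;s orthonormal DCT/IDCT is used so that the snippet stays self-contained.</p>
<code language="python">import numpy as np
from scipy.fft import dctn, idctn

def frequency_split(channel, tau):
    """Split one image channel into low- and high-frequency images (Equations 4 and 5)."""
    h, w = channel.shape
    spectrum = dctn(channel, norm="ortho")          # orthonormal 2D DCT (Equation 2)
    hh, ww = np.meshgrid(np.arange(h), np.arange(w), indexing="ij")
    mask = (hh + ww &lt;= tau).astype(spectrum.dtype)  # Equation 4: Manhattan-distance cut-off
    p_low = spectrum * mask                         # Hadamard product with M
    p_high = spectrum * (1.0 - mask)                # Hadamard product with (1 - M)
    return idctn(p_low, norm="ortho"), idctn(p_high, norm="ortho")  # Equation 5 (IDCT)</code>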
<p>These components are then fed into the encoder to obtain feature maps for the original image <inline-formula>
<mml:math display="inline" id="im40"><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mi>X</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, low-frequency <inline-formula>
<mml:math display="inline" id="im41"><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mi>l</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and high-frequency <inline-formula>
<mml:math display="inline" id="im42"><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> feature maps. This enables the extraction of discriminative details from high-frequency components and structural context from low-frequency components. The feature extraction is defined as in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>:</p>
<disp-formula>
<mml:math display="block" id="M6"><mml:mrow><mml:msubsup><mml:mi>I</mml:mi><mml:mi>l</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mi>l</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:mi>X</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:msubsup><mml:mi>I</mml:mi><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>I</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>X</mml:mi><mml:mo>+</mml:mo><mml:mi>X</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#xa0;</mml:mo></mml:mrow></mml:math>
</disp-formula>
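<p>Equation 6 can be read as a residual modulation of the original image by each frequency map before encoding. A PyTorch-style sketch is given below, where <monospace>encoder</monospace> stands for f<sub>&#x3b8;</sub> and the element-wise product is our reading of the &#xd7; operator:</p>
<code language="python">import torch

def frequency_enhanced_features(encoder, x, i_low, i_high):
    """Equation 6: modulate the original image X with each frequency map, then encode.
    All tensors are assumed to share the shape (B, C, H, W)."""
    feat_low = encoder(i_low * x + x)    # I'_l = f_theta(I_l * X + X)
    feat_high = encoder(i_high * x + x)  # I'_h = f_theta(I_h * X + X)
    return feat_low, feat_high</code>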
<p>Utilizing enhanced discriminative feature maps as prior knowledge augments the model&#x2019;s ability to capture critical information and adapt to incremental data. Specifically, by aligning samples with class prototypes via the high-frequency features <inline-formula>
<mml:math display="inline" id="im43"><mml:mrow><mml:msubsup><mml:mi>I</mml:mi><mml:mi>h</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula>, we encode fine-grained details that effectively sharpen decision boundaries and enhance model performance. Then, the embedded image is computed by <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>:</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mi>I</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mi>D</mml:mi><mml:mi>C</mml:mi><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:msub><mml:mtext>T</mml:mtext><mml:mtext>r</mml:mtext></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:msub><mml:mtext>T</mml:mtext><mml:mtext>c</mml:mtext></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>0</mml:mn></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:msub><mml:mi>C</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>X</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>I</mml:mi><mml:mi>l</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im44"><mml:mrow><mml:msub><mml:mi>C</mml:mi><mml:mn>0</mml:mn></mml:msub></mml:mrow></mml:math></inline-formula> denotes the set of encountered classes. This extension enriches the available semantics and fills otherwise unallocated regions of the image embedding space. It also provides semantic knowledge that encourages the model to learn diverse semantics for better generalization.</p>
<p>For the label domain, the predefined transformations can generate multiple augmented image-label pairs (<inline-formula>
<mml:math display="inline" id="im45"><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mi>y</mml:mi></mml:mrow></mml:math></inline-formula>), where <inline-formula>
<mml:math display="inline" id="im46"><mml:mrow><mml:mi>&#x2131;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>. <inline-formula>
<mml:math display="inline" id="im47"><mml:mi>N</mml:mi></mml:math></inline-formula> is the number of transformed extension space. <inline-formula>
<mml:math display="inline" id="im48"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> are the generated extension images, and the corresponding labels is <inline-formula>
<mml:math display="inline" id="im49"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>y</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>N</mml:mi><mml:mo>+</mml:mo><mml:mi>n</mml:mi></mml:mrow></mml:math></inline-formula>. Thus, the label space is extended with the fine-grained class-aware embedding derived from the original space. The association between the image domain and the label domain can effectively provide richer semantic details to improve the accuracy. Likewise, the training within the embedding space <inline-formula>
<mml:math display="inline" id="im50"><mml:mi>&#x2131;</mml:mi></mml:math></inline-formula> can be expressed by <xref ref-type="disp-formula" rid="eq8">Equation 8</xref>:</p>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mo>;</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
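<p>The extended-label objective of Equation 8 relabels each transformed view with y_n = y &#xd7; N + n and averages the cross-entropy over the N views. A PyTorch sketch follows; it assumes the classifier head outputs |C_0| &#xd7; N logits, which is our reading rather than a quoted implementation detail.</p>
<code language="python">import torch
import torch.nn.functional as F

def extended_label_loss(model, x, y, transform_list):
    """Equation 8: each transform n gets its own extended label y_n = y * N + n."""
    n_t = len(transform_list)
    losses = []
    for n, t in enumerate(transform_list):
        x_n = t(x)                        # generated extension image
        y_n = y * n_t + n                 # extended fine-grained label
        losses.append(F.cross_entropy(model(x_n), y_n))
    return torch.stack(losses).mean()</code>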
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Detail-enhanced discriminative feature representation</title>
<p>Although effective for coarse classification, existing methods are limited in handling fine-grained data. We therefore propose an embedding-based supervised contrastive learning strategy using the MoCo (<xref ref-type="bibr" rid="B12">He et&#xa0;al., 2020</xref>) framework. This method optimizes feature distances by clustering positive pairs and separating negative ones. Given an instance <inline-formula>
<mml:math display="inline" id="im51"><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, we generate a query view <inline-formula>
<mml:math display="inline" id="im52"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>q</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Aug</mml:mtext></mml:mrow><mml:mi>q</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> and a key view <inline-formula>
<mml:math display="inline" id="im53"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mtext>Aug</mml:mtext></mml:mrow><mml:mi>k</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> via data augmentation. A shared encoder <inline-formula>
<mml:math display="inline" id="im54"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>q</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, comprising a feature extractor and a classifier, is then employed to extract the corresponding features. As shown in <xref ref-type="disp-formula" rid="eq9">Equation 9</xref>:</p>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>q</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mi>&#x3c9;</mml:mi><mml:mi>T</mml:mi></mml:msup><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im55"><mml:mrow><mml:msup><mml:mi>&#x3c9;</mml:mi><mml:mi>T</mml:mi></mml:msup><mml:mi>&#x3f5;</mml:mi><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mtext>d</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msub><mml:mtext>C</mml:mtext><mml:mn>0</mml:mn></mml:msub></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is the weight value, and <inline-formula>
<mml:math display="inline" id="im56"><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mi>&#x3f5;</mml:mi><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mtext>d</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is the feature function. Among them, the query encoder <inline-formula>
<mml:math display="inline" id="im57"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>q</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> are encoded through gradient descent, while the key encoder <inline-formula>
<mml:math display="inline" id="im58"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> are encoded by a progressing encoder, driven by a momentum update with the <inline-formula>
<mml:math display="inline" id="im59"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>q</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>. The queue of key embedding is maintained to store the feature vector.</p>
<p>In the label domain, a label queue maintains labels corresponding to the feature queue, facilitating the differentiation of positive and negative pairs. This queue preserves an identical length to the feature queue. Subsequently, the contrastive loss is computed to drive the model to capture discriminative fine-grained features. This optimization effectively minimizes intra-class distance while maximizing inter-class variation, thereby fostering deep interaction between the visual and label domains.</p>
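<p>A minimal sketch of the momentum update of the key encoder and of the paired feature/label queues is given below; the momentum coefficient and queue handling are assumptions for illustration rather than our released implementation.</p>
<code language="python">
# Sketch of the MoCo-style momentum update and the paired feature/label queues.
import torch

@torch.no_grad()
def momentum_update(encoder_q, encoder_k, m=0.999):
    # The key encoder slowly tracks the query encoder (momentum coefficient m is assumed).
    for p_q, p_k in zip(encoder_q.parameters(), encoder_k.parameters()):
        p_k.data.mul_(m).add_(p_q.data, alpha=1.0 - m)

@torch.no_grad()
def enqueue(feat_queue, label_queue, keys, labels, ptr, queue_size):
    # The label queue is kept at the same length as the feature queue so that
    # positive and negative pairs can later be separated by label.
    batch = keys.shape[0]
    idx = torch.arange(ptr, ptr + batch) % queue_size
    feat_queue[idx] = keys
    label_queue[idx] = labels
    return (ptr + batch) % queue_size
</code>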
<sec id="s3_2_2_1">
<label>3.2.2.1</label>
<title>Inter-class variation</title>
<p>Typically, the aggregated representation of a class is denoted by its prototype <inline-formula>
<mml:math display="inline" id="im60"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, and the prototypes of different classes should lie far from each other. The <inline-formula>
<mml:math display="inline" id="im61"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is expressed by <xref ref-type="disp-formula" rid="eq10">Equation 10</xref>:</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:msubsup><mml:mrow><mml:msub><mml:mi>X</mml:mi><mml:mi>a</mml:mi></mml:msub></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im62"><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the number of classes <inline-formula>
<mml:math display="inline" id="im63"><mml:mi>j</mml:mi></mml:math></inline-formula>. <inline-formula>
<mml:math display="inline" id="im64"><mml:mrow><mml:msub><mml:mtext>X</mml:mtext><mml:mtext>a</mml:mtext></mml:msub></mml:mrow></mml:math></inline-formula> is the feature vector of the <inline-formula>
<mml:math display="inline" id="im65"><mml:mi>a</mml:mi></mml:math></inline-formula>-th sample. Thus, denoted two prototypes <inline-formula>
<mml:math display="inline" id="im66"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im67"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> for class <inline-formula>
<mml:math display="inline" id="im68"><mml:mtext>j</mml:mtext></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im69"><mml:mtext>k</mml:mtext></mml:math></inline-formula> in base session, the Euclidean distance of inter-class variation is calculated by <xref ref-type="disp-formula" rid="eq11">Equation 11</xref>:</p>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M12"><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mrow><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:mi>k</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msqrt><mml:mrow><mml:mstyle displaystyle="true"><mml:msub><mml:mo>&#x2211;</mml:mo><mml:mi>d</mml:mi></mml:msub><mml:mrow><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mi>P</mml:mi><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mi>k</mml:mi><mml:mi>i</mml:mi></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:mstyle></mml:mrow></mml:msqrt></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im70"><mml:mi>d</mml:mi></mml:math></inline-formula> is the dimension of the feature vector and <inline-formula>
<mml:math display="inline" id="im71"><mml:mi>i</mml:mi></mml:math></inline-formula> indexes the <inline-formula>
<mml:math display="inline" id="im72"><mml:mi>i</mml:mi></mml:math></inline-formula>-th dimension. For the subsequent incremental sequences, the novel classes are likewise handled by computing the distance between their prototypes and the samples.</p>
</sec>
<sec id="s3_2_2_2">
<label>3.2.2.2</label>
<title>Intra-class distances</title>
<p>The analysis of intra-class distances involves computing the Euclidean distances between the samples and prototype <inline-formula>
<mml:math display="inline" id="im73"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> within the same class <inline-formula>
<mml:math display="inline" id="im74"><mml:mi>j</mml:mi></mml:math></inline-formula>, and then determining the average value. For the testing sample, the intra-class distances are computed by <xref ref-type="disp-formula" rid="eq12">Equation 12</xref>:</p>
<disp-formula id="eq12"><label>(12)</label>
<mml:math display="block" id="M13"><mml:mrow><mml:msubsup><mml:mi>d</mml:mi><mml:mi>j</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>a</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:msubsup><mml:mrow><mml:msqrt><mml:mrow><mml:mstyle displaystyle="true"><mml:msub><mml:mo>&#x2211;</mml:mo><mml:mi>d</mml:mi></mml:msub><mml:mrow><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mi>x</mml:mi><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:msubsup><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:msup></mml:mrow></mml:mstyle></mml:mrow></mml:msqrt></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im75"><mml:mrow><mml:msubsup><mml:mi>x</mml:mi><mml:mi>j</mml:mi><mml:mi>i</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula> is the feature vector of a sample. The smaller the intra-class distance, the more tightly the samples within the same class cluster. This enhances the distinct separation of local information in the feature space and is crucial for accurate fine-grained identification.</p>
</sec>
<sec id="s3_2_2_3">
<label>3.2.2.3</label>
<title>Augmentation feature analysis</title>
<p>The model should also attend to global features of the multi-transformation, imbalanced fine-grained data. To improve the generalization of class separation, we construct a global augmentation set that serves as the query view and optimizes the feature space by learning the general features of different classes. Likewise, the image-label pairs <inline-formula>
<mml:math display="inline" id="im76"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>m</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> for global augmentation can be processed by <xref ref-type="disp-formula" rid="eq13">Equation 13</xref>:</p>
<disp-formula id="eq13"><label>(13)</label>
<mml:math display="block" id="M14"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>A</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mo>;</mml:mo><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>M</mml:mi></mml:mfrac><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>M</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>l</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mo>&#xa0;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>m</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>With the augmentation feature analysis, the model can better concentrate on detail-enhanced information to distinguish imbalanced fine-grained images and to optimize the feature space.</p>
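<p>Under the assumption that each globally augmented view keeps the original class label, the global augmentation loss of <xref ref-type="disp-formula" rid="eq13">Equation 13</xref> can be sketched as follows.</p>
<code language="python">
# Sketch of the global augmentation loss (Equation 13); we assume here that the globally
# augmented views retain the original class label of the image.
import torch
import torch.nn.functional as F

def global_augmentation_loss(model, x, y, global_augs):
    losses = [F.cross_entropy(model(aug(x)), y) for aug in global_augs]
    return torch.stack(losses).mean()  # average over the M augmented views
</code>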
</sec>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Incremental class inference</title>
<p>For the incremental sequences, the backbone network is frozen, and the classifier is extended by computing the novel class prototypes. The acquired novel class information thus enables the classifier to be extended alongside the prototypes of the basic classes and the extended-augmentation classes, as shown in <xref ref-type="disp-formula" rid="eq14">Equation 14</xref>:</p>
<disp-formula id="eq14"><label>(14)</label>
<mml:math display="block" id="M15"><mml:mrow><mml:msubsup><mml:mi>W</mml:mi><mml:mi>N</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>11</mml:mn></mml:mrow><mml:mn>0</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow><mml:mn>0</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mn>0</mml:mn></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>&#x222a;</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>11</mml:mn></mml:mrow><mml:mn>1</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow><mml:mn>1</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mn>1</mml:mn></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>&#x2026;</mml:mo><mml:mo>&#x222a;</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>11</mml:mn></mml:mrow><mml:mi>t</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>12</mml:mn></mml:mrow><mml:mi>t</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mi>t</mml:mi></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im77"><mml:mi>b</mml:mi></mml:math></inline-formula> is the number of basic classes, <inline-formula>
<mml:math display="inline" id="im78"><mml:mi>N</mml:mi></mml:math></inline-formula> is the number of transformed extension space, and <inline-formula>
<mml:math display="inline" id="im79"><mml:mi>t</mml:mi></mml:math></inline-formula> is the number of incremental sequences. The prototypes <inline-formula>
<mml:math display="inline" id="im80"><mml:mrow><mml:msub><mml:mtext>W</mml:mtext><mml:mtext>n</mml:mtext></mml:msub></mml:mrow></mml:math></inline-formula> represents the focus of global and local fine-grained semantics from the original classes, shown in <xref ref-type="disp-formula" rid="eq15">Equation 15</xref>:</p>
<disp-formula id="eq15"><label>(15)</label>
<mml:math display="block" id="M16"><mml:mrow><mml:msub><mml:mi>W</mml:mi><mml:mi>n</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mrow><mml:mn>0</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>n</mml:mi></mml:mrow><mml:mn>0</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mn>0</mml:mn></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>&#x222a;</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mrow><mml:mn>1</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>2</mml:mn><mml:mi>n</mml:mi></mml:mrow><mml:mn>1</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mn>1</mml:mn></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>&#x2026;</mml:mo><mml:mo>&#x222a;</mml:mo><mml:mtext>&#xa0;</mml:mtext><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mrow><mml:mi>t</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mi>n</mml:mi></mml:mrow><mml:mi>t</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mi>t</mml:mi></mml:msubsup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>Subsequently, the classifier is updated with the novel classes&#x2019; prototypes combined with the original class prototypes. This helps push the novel samples away from the distributions of the old classes and benefits the generalization of novel class-aware semantic information. The FC layer of the model is updated by contrasting novel query samples with the slowly evolving key embeddings of base classes from the feature queue. Finally, the cosine similarity between the test embedding and all prototypes is computed to obtain the inference result for the test image, as formulated in <xref ref-type="disp-formula" rid="eq16">Equations 16</xref> and <xref ref-type="disp-formula" rid="eq17">17</xref>:</p>
<disp-formula id="eq16"><label>(16)</label>
<mml:math display="block" id="M17"><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi><mml:mo>=</mml:mo><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mi>g</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mrow><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>m</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msubsup><mml:mi>w</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mi>t</mml:mi></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq17"><label>(17)</label>
<mml:math display="block" id="M18"><mml:mrow><mml:msubsup><mml:mi>&#x3c9;</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mi>t</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msubsup><mml:mi>n</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mi>t</mml:mi></mml:msubsup></mml:mrow></mml:msubsup><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo stretchy="false">/</mml:mo><mml:msubsup><mml:mi>n</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mi>t</mml:mi></mml:msubsup></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<p>By adapting the classifier to the novel classes while keeping the backbone frozen, the model maximally preserves previously acquired knowledge.</p>
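<p>The incremental inference step can be summarized by the following sketch: novel class prototypes are appended to the frozen classifier and predictions are obtained by cosine similarity, mirroring <xref ref-type="disp-formula" rid="eq14">Equations 14</xref>&#x2013;<xref ref-type="disp-formula" rid="eq17">17</xref>; names and shapes are illustrative assumptions.</p>
<code language="python">
# Sketch: extend the classifier with novel class prototypes and predict by cosine similarity.
import torch
import torch.nn.functional as F

@torch.no_grad()
def extend_classifier(weights, novel_feats, novel_labels):
    # Each novel class prototype (Equation 17) is appended to the existing classifier (Equation 14).
    for c in torch.unique(novel_labels).tolist():
        proto = novel_feats[novel_labels == c].mean(dim=0, keepdim=True)
        weights = torch.cat([weights, proto], dim=0)
    return weights

@torch.no_grad()
def predict(embedding, weights):
    # Cosine similarity between the test embedding and all prototypes (Equation 16).
    sims = F.cosine_similarity(embedding.unsqueeze(0), weights, dim=1)
    return sims.argmax().item()
</code>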
</sec>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Loss function</title>
<p>In this paper, the loss functions consist of three parts: cross-entropy loss (<xref ref-type="disp-formula" rid="eq18">Equation 18</xref>), contrastive loss (<xref ref-type="disp-formula" rid="eq19">Equation 19</xref>), and feature augmentation loss (<xref ref-type="disp-formula" rid="eq20">Equation 20</xref>). The model leverages the abundant data of the basic classes to obtain multi-semantic aggregated information from fine-grained images by optimizing the per-sample loss while simultaneously maximizing inter-class margins.</p>
<p>For the extension with frequency-aware guided alterations, the anchor image <inline-formula>
<mml:math display="inline" id="im81"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is supervised by the cross-entropy loss, which aligns the class predictions with their targets. It is computed by <xref ref-type="disp-formula" rid="eq18">Equation 18</xref>:</p>
<disp-formula id="eq18"><label>(18)</label>
<mml:math display="block" id="M19"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi><mml:mi>b</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mi>log</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<p>To generate the query view <inline-formula>
<mml:math display="inline" id="im82"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>q</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and the key view <inline-formula>
<mml:math display="inline" id="im83"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, we apply specific data augmentations. For a specific anchor index <inline-formula>
<mml:math display="inline" id="im84"><mml:mi>i</mml:mi></mml:math></inline-formula>, we define the set of all indices in the current batch (or memory queue) as <inline-formula>
<mml:math display="inline" id="im85"><mml:mrow><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. We strictly categorize these indices into two subsets: Positive Set: This set contains the indices of samples that share the same class label as the anchor <inline-formula>
<mml:math display="inline" id="im86"><mml:mi>i</mml:mi></mml:math></inline-formula>: <inline-formula>
<mml:math display="inline" id="im87"><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>j</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2223;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>&#x2260;</mml:mo><mml:mi>i</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. Negative Set: This set contains the indices of samples from all other classes: <inline-formula>
<mml:math display="inline" id="im88"><mml:mrow><mml:msub><mml:mi>K</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>A</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>i</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2223;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo>&#x2260;</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. And we adopt the InfoNCE loss (<xref ref-type="bibr" rid="B1">Ahmed et&#xa0;al., 2024</xref>) as our contrastive objective. The objective is to maximize the similarity between the anchor and its positive peers while minimizing the similarity with negative samples. The loss for anchor <inline-formula>
<mml:math display="inline" id="im89"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is formulated as <xref ref-type="disp-formula" rid="eq19">Equation 19</xref>:</p>
<disp-formula id="eq19"><label>(19)</label>
<mml:math display="block" id="M20"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:msub><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mi>&#x3f5;</mml:mi><mml:msub><mml:mi>Q</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mi>log</mml:mi></mml:mrow></mml:mstyle><mml:mfrac><mml:mrow><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2299;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mo stretchy="false">/</mml:mo><mml:mi>&#x3c4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:msub><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mi>&#x404;</mml:mi><mml:msub><mml:mi>K</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:msub><mml:mi>e</mml:mi><mml:mi>x</mml:mi><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2299;</mml:mo><mml:msub><mml:mi>x</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy="false">/</mml:mo><mml:mi>&#x3c4;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im90"><mml:mo>&#x2299;</mml:mo></mml:math></inline-formula> denotes the dot product, <inline-formula>
<mml:math display="inline" id="im91"><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> is the cardinality of the positive set, and <inline-formula>
<mml:math display="inline" id="im92"><mml:mi>&#x3c4;</mml:mi></mml:math></inline-formula> is the temperature parameter (set to 16). The denominator sums over all contrastive samples to strictly regulate the embedding space. <inline-formula>
<mml:math display="inline" id="im93"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the element of positive set <inline-formula>
<mml:math display="inline" id="im94"><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im95"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is the element of negative set <inline-formula>
<mml:math display="inline" id="im96"><mml:mrow><mml:msub><mml:mi>K</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>. It aims to pull <inline-formula>
<mml:math display="inline" id="im97"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> closer to <inline-formula>
<mml:math display="inline" id="im98"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, and push <inline-formula>
<mml:math display="inline" id="im99"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> further to <inline-formula>
<mml:math display="inline" id="im100"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>. In this paper, to complement contrastive loss, global feature augmentation loss is computed for sample <inline-formula>
<mml:math display="inline" id="im101"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> to improve the generalization of class separation. Denote the prototype for each class as <inline-formula>
<mml:math display="inline" id="im102"><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>j</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, the <inline-formula>
<mml:math display="inline" id="im103"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>A</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is expressed by <xref ref-type="disp-formula" rid="eq20">Equation 20</xref>:</p>
<disp-formula id="eq20"><label>(20)</label>
<mml:math display="block" id="M21"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>A</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>R</mml:mi></mml:mfrac><mml:mstyle displaystyle="true"><mml:msubsup><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi><mml:mi>R</mml:mi></mml:msubsup><mml:mrow><mml:msub><mml:mi>P</mml:mi><mml:mi>j</mml:mi></mml:msub><mml:mi>log</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mrow></mml:mstyle></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im104"><mml:mi>R</mml:mi></mml:math></inline-formula> is the number of training images. And the overall training objective can be concluded as <xref ref-type="disp-formula" rid="eq21">Equation 21</xref>:</p>
<disp-formula id="eq21"><label>(21)</label>
<mml:math display="block" id="M22"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>A</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math>
</disp-formula>
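<p>The three loss terms can be combined as in the following sketch; the supervised InfoNCE term follows <xref ref-type="disp-formula" rid="eq19">Equation 19</xref> with positives and negatives separated by the label queue, and the temperature follows the setting stated above. All names are illustrative assumptions, and the embeddings are assumed to be prepared upstream.</p>
<code language="python">
# Sketch of the contrastive loss (Equation 19) and the overall objective (Equation 21).
import torch
import torch.nn.functional as F

def contrastive_loss(q, queue_feats, queue_labels, y, tau=16.0):
    # Dot-product similarity over the queue; positives share the anchor's label,
    # negatives come from all other classes (embeddings assumed normalized upstream).
    logits = torch.matmul(q, queue_feats.t()) / tau
    pos_mask = (queue_labels.unsqueeze(0) == y.unsqueeze(1)).float()
    neg_mask = 1.0 - pos_mask
    denom = (logits.exp() * neg_mask).sum(dim=1, keepdim=True).clamp(min=1e-12)
    log_prob = logits - denom.log()      # log of exp(sim/tau) over the negative set
    loss = -(log_prob * pos_mask).sum(dim=1) / pos_mask.sum(dim=1).clamp(min=1.0)
    return loss.mean()

def total_loss(l_ce, l_cl, l_al):
    # Overall objective of Equation 21: unweighted sum of the three terms.
    return l_ce + l_cl + l_al
</code>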
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experimental results and discussion</title>
<sec id="s4_1">
<label>4.1</label>
<title>Dataset</title>
<p>To validate the generalization of our method, additional experiments are conducted on two publicly available herb datasets, with comparisons made against other methods.</p>
<p>The two herb datasets are the Chinese Medicine dataset (<xref ref-type="bibr" rid="B35">Thella and Ulagamuthalvi, 2021</xref>) and Medicinal Leaf (<xref ref-type="bibr" rid="B18">Huang and Xu, 2023</xref>), and excerpts of these datasets are illustrated in <xref ref-type="fig" rid="f5"><bold>Figures&#xa0;5</bold></xref> and <xref ref-type="fig" rid="f6"><bold>6</bold></xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>A sample of the Chinese Medicine dataset. The Chinese Medicine dataset comprises 20 different types of Chinese medicinal plants, with a total of 3,000 images.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g005.tif">
<alt-text content-type="machine-generated">The part of the Chinese Medicine dataset. The Chinese Medicine dataset comprises 20 different types of Chinese medicinal plants, comprising a total of 3000 images.</alt-text>
</graphic></fig>
<fig id="f6" position="float">
<label>Figure&#xa0;6</label>
<caption>
<p>A sample of the Medicinal Leaf dataset. The Medicinal Leaf dataset contains 100 types of herbal plants, with a total of 10,000 images.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g006.tif">
<alt-text content-type="machine-generated">The part of Medicinal Leaf dataset. The Medicinal Leaf dataset contains 100 types of herbal plants, comprising a total of 10000 images.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Implementation details</title>
<p>The model is optimized by SGD with a momentum of 0.9. The initial learning rate is 0.1, and the learning rate decay strategy is StepLR. The batch size is set to 16, and the final model is obtained after 100 epochs in the basic learning phase. For the incremental learning phase, we fine-tune the pre-trained model, and the novel query samples are compared with the key embeddings obtained from basic training. The model updates the classifier over 10 epochs to mitigate overfitting. The code is implemented in PyTorch 2.2.1 with Python 3.11. The model is trained on a PC (equipped with an Intel i7 processor) and a graphics processing unit (NVIDIA 4090, 24 GB memory).</p>
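<p>The training configuration described above can be assembled as in the following sketch; the StepLR step size and decay factor are not reported here and are therefore placeholders.</p>
<code language="python">
# Sketch of the optimization setup of Section 4.2 (placeholder StepLR parameters).
from torch.optim import SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader

def build_training(model, train_set):
    optimizer = SGD(model.parameters(), lr=0.1, momentum=0.9)
    scheduler = StepLR(optimizer, step_size=30, gamma=0.1)   # assumed decay schedule
    loader = DataLoader(train_set, batch_size=16, shuffle=True)
    return optimizer, scheduler, loader

BASE_EPOCHS = 100         # basic learning phase
INCREMENTAL_EPOCHS = 10   # classifier update per incremental session
</code>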
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Performance metrics</title>
<p>We use Accuracy, <italic>Precision</italic>, <italic>Recall</italic>, <italic>Specificity</italic>, and <italic>F</italic>1 <italic>Score</italic> as evaluation metrics, as shown in <xref ref-type="disp-formula" rid="eq22">Equations 22</xref>&#x2013;<xref ref-type="disp-formula" rid="eq27">27</xref>. Furthermore, the harmonic mean (HM) (<xref ref-type="bibr" rid="B19">Kim et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B44">Yang et&#xa0;al., 2023</xref>) is used to balance the inherent biases between basic classes and incremental classes.</p>
<disp-formula id="eq22"><label>(22)</label>
<mml:math display="block" id="M23"><mml:mrow><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq23"><label>(23)</label>
<mml:math display="block" id="M24"><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>P</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq24"><label>(24)</label>
<mml:math display="block" id="M25"><mml:mrow><mml:mtext>Recall</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>F</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq25"><label>(25)</label>
<mml:math display="block" id="M26"><mml:mrow><mml:mtext>Specificity</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow><mml:mrow><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mo>+</mml:mo><mml:mi>T</mml:mi><mml:mi>N</mml:mi></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq26"><label>(26)</label>
<mml:math display="block" id="M27"><mml:mrow><mml:mtext>F</mml:mtext><mml:mn>1</mml:mn><mml:mtext>&#xa0;Score</mml:mtext><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#xd7;</mml:mo><mml:mtext>Precision</mml:mtext><mml:mo>&#xd7;</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>+</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq27"><label>(27)</label>
<mml:math display="block" id="M28"><mml:mrow><mml:mi>H</mml:mi><mml:mi>M</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>where TN represents the number of True Negatives, and TP denotes the number of True Positives. FN indicates the number of False Negatives, and FP is the number of False Positives. <inline-formula>
<mml:math display="inline" id="im105"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the accuracy of base classes, <inline-formula>
<mml:math display="inline" id="im106"><mml:mrow><mml:msub><mml:mi>A</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mi>n</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is the top-1 accuracy of incremental classes.</p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Performance of identification results</title>
<sec id="s4_4_1">
<label>4.4.1</label>
<title>Performance of model identification</title>
<p>We split our dataset into a training set and a testing set. Specifically, 75% of the data is used for training, and the remaining 25% is used for testing. The basic training phase is composed of 16 classes with 200 samples each, followed by 4 incremental sequences, each containing 3 classes with 5 samples per class. The experimental results of each class are detailed in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>The experimental results of different varieties by our model.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Class</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">Specificity</th>
<th valign="middle" align="center">F1_ score</th>
<th valign="middle" align="center">Class</th>
<th valign="middle" align="center">Precision</th>
<th valign="middle" align="center">Recall</th>
<th valign="middle" align="center">Specificity</th>
<th valign="middle" align="center">F1_ score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">A</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">O</td>
<td valign="middle" align="center">97.000</td>
<td valign="middle" align="center">636.0</td>
<td valign="middle" align="center">99.900</td>
<td valign="middle" align="center">76.800</td>
</tr>
<tr>
<td valign="middle" align="center">B</td>
<td valign="middle" align="center">82.400</td>
<td valign="middle" align="center">82.400</td>
<td valign="middle" align="center">99.200</td>
<td valign="middle" align="center">82.400</td>
<td valign="middle" align="center">P</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">20.000</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">33.300</td>
</tr>
<tr>
<td valign="middle" align="center">C</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">99.600</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">99.800</td>
<td valign="middle" align="center">Q</td>
<td valign="middle" align="center">57.900</td>
<td valign="middle" align="center">62.000</td>
<td valign="middle" align="center">98.300</td>
<td valign="middle" align="center">59.900</td>
</tr>
<tr>
<td valign="middle" align="center">D</td>
<td valign="middle" align="center">90.200</td>
<td valign="middle" align="center">96.000</td>
<td valign="middle" align="center">99.500</td>
<td valign="middle" align="center">93.000</td>
<td valign="middle" align="center">R</td>
<td valign="middle" align="center">45.800</td>
<td valign="middle" align="center">84.500</td>
<td valign="middle" align="center">96.200</td>
<td valign="middle" align="center">59.400</td>
</tr>
<tr>
<td valign="middle" align="center">E</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">S</td>
<td valign="middle" align="center">88.900</td>
<td valign="middle" align="center">76.000</td>
<td valign="middle" align="center">99.600</td>
<td valign="middle" align="center">81.900</td>
</tr>
<tr>
<td valign="middle" align="center">F</td>
<td valign="middle" align="center">95.800</td>
<td valign="middle" align="center">99.200</td>
<td valign="middle" align="center">99.800</td>
<td valign="middle" align="center">97.500</td>
<td valign="middle" align="center">T</td>
<td valign="middle" align="center">30.300</td>
<td valign="middle" align="center">62.000</td>
<td valign="middle" align="center">96.000</td>
<td valign="middle" align="center">40.700</td>
</tr>
<tr>
<td valign="middle" align="center">G</td>
<td valign="middle" align="center">80.900</td>
<td valign="middle" align="center">82.800</td>
<td valign="middle" align="center">99.100</td>
<td valign="middle" align="center">81.800</td>
<td valign="middle" align="center">U</td>
<td valign="middle" align="center">87.500</td>
<td valign="middle" align="center">70.000</td>
<td valign="middle" align="center">99.700</td>
<td valign="middle" align="center">77.800</td>
</tr>
<tr>
<td valign="middle" align="center">H</td>
<td valign="middle" align="center">98.600</td>
<td valign="middle" align="center">87.600</td>
<td valign="middle" align="center">99.900</td>
<td valign="middle" align="center">92.800</td>
<td valign="middle" align="center">V</td>
<td valign="middle" align="center">45.000</td>
<td valign="middle" align="center">30.000</td>
<td valign="middle" align="center">99.000</td>
<td valign="middle" align="center">36.000</td>
</tr>
<tr>
<td valign="middle" align="center">I</td>
<td valign="middle" align="center">95.800</td>
<td valign="middle" align="center">92.000</td>
<td valign="middle" align="center">99.800</td>
<td valign="middle" align="center">93.900</td>
<td valign="middle" align="center">W</td>
<td valign="middle" align="center">30.000</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">95.700</td>
<td valign="middle" align="center">46.200</td>
</tr>
<tr>
<td valign="middle" align="center">J</td>
<td valign="middle" align="center">99.500</td>
<td valign="middle" align="center">80.800</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">89.200</td>
<td valign="middle" align="center">X</td>
<td valign="middle" align="center">32.300</td>
<td valign="middle" align="center">62.000</td>
<td valign="middle" align="center">97.600</td>
<td valign="middle" align="center">42.500</td>
</tr>
<tr>
<td valign="middle" align="center">K</td>
<td valign="middle" align="center">84.800</td>
<td valign="middle" align="center">94.000</td>
<td valign="middle" align="center">99.200</td>
<td valign="middle" align="center">89.200</td>
<td valign="middle" align="center">Z</td>
<td valign="middle" align="center">66.700</td>
<td valign="middle" align="center">2.000</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">3.900</td>
</tr>
<tr>
<td valign="middle" align="center">L</td>
<td valign="middle" align="center">86.000</td>
<td valign="middle" align="center">93.200</td>
<td valign="middle" align="center">99.300</td>
<td valign="middle" align="center">89.500</td>
<td valign="middle" align="center">Z1</td>
<td valign="middle" align="center">72.200</td>
<td valign="middle" align="center">52.000</td>
<td valign="middle" align="center">99.800</td>
<td valign="middle" align="center">60.500</td>
</tr>
<tr>
<td valign="middle" align="center">M</td>
<td valign="middle" align="center">80.700</td>
<td valign="middle" align="center">87.200</td>
<td valign="middle" align="center">99.000</td>
<td valign="middle" align="center">83.800</td>
<td valign="middle" align="center">Z2</td>
<td valign="middle" align="center">91.700</td>
<td valign="middle" align="center">44.000</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">59.500</td>
</tr>
<tr>
<td valign="middle" align="center">N</td>
<td valign="middle" align="center">93.900</td>
<td valign="middle" align="center">49.200</td>
<td valign="middle" align="center">99.800</td>
<td valign="middle" align="center">64.600</td>
<td valign="middle" align="center">Z3</td>
<td valign="middle" align="center">91.300</td>
<td valign="middle" align="center">42.000</td>
<td valign="middle" align="center">100.0</td>
<td valign="middle" align="center">57.500</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>As shown in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>, the identification performance is notably strong for the base classes. In particular, the model demonstrates high accuracy and robustness in classifying classes A, E, C, F, D, H, and I. In contrast, the F1-scores for the incremental classes predominantly range between 0.3 and 0.6, highlighting the model&#x2019;s limited effectiveness in learning new classes. For instance, classes R and Q exhibit high recall but low precision; this can be attributed to significant inter-class feature similarity, which leads to misclassification. To further evaluate our model&#x2019;s performance, we calculated the confusion matrix, and the experimental results are depicted in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7</bold></xref>.</p>
<fig id="f7" position="float">
<label>Figure&#xa0;7</label>
<caption>
<p>The experimental results of the confusion matrix. <bold>(A)</bold> is ours, <bold>(B)</bold> is the result without high-frequency and low-frequency enhanced images. Significant contrast areas are marked with red and green.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g007.tif">
<alt-text content-type="machine-generated">Two confusion matrices, labeled (A) and (B), showing true vs. predicted labels for a classification task. Both matrices have a color gradient from light to dark blue, representing low to high values. Red rectangles in (A) and green rectangles in (B) highlight areas of significant values, indicating patterns of confusion in predictions. Numerical values range up to two hundred fifty. Axes are labeled as True Labels and Predicted Labels, with a color scale bar on the right indicating the value intensity.</alt-text>
</graphic></fig>
<p>While the identification results for base classes are comparable across methods, the improvements in incremental classes are more pronounced. In the confusion matrices, a brighter diagonal indicates higher identification accuracy, with significant contrast areas highlighted in red and green. As shown in <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7A</bold></xref>, for base class identification, the marked areas demonstrate that FGDE achieves superior performance with fewer misclassifications compared to other methods. When comparing <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7A</bold></xref> with <xref ref-type="fig" rid="f7"><bold>Figure&#xa0;7B</bold></xref>, our model demonstrates distinct advantages over the baseline lacking high-frequency and low-frequency enhancement, particularly in the highlighted regions. Mechanistically, low-frequency components capture global structural features, while high-frequency components extract local fine-grained details, thereby enriching the image representation. Conversely, while the method without frequency-aware extension maintains a visible diagonal for base classes, it performs poorly on novel classes. In contrast, our method exhibits robust performance, indicating that it effectively adapts to novel classes without disrupting previous decision boundaries.</p>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>Different losses of model identification</title>
<p>To better evaluate the performance of the model and enhance the explanation of training, the loss and accuracy results of our model are shown in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>.</p>
<fig id="f8" position="float">
<label>Figure&#xa0;8</label>
<caption>
<p>The loss and accuracy results of our model. <bold>(A)</bold> is the curve of different loss changes, and the curve for training accuracy. <bold>(B)</bold> is the curve of the testing loss and the curve of the testing accuracy of our model.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g008.tif">
<alt-text content-type="machine-generated">Four line graphs show training and testing metrics over 100 epochs. Upper left: training loss with ce_loss, fa_loss, and cl_loss lines decreasing. Upper right: training accuracy increases reaching about 0.95. Lower left: testing loss decreases with fluctuations. Lower right: testing accuracy increases steadily, reaching around 0.9.</alt-text>
</graphic></fig>
<p>The loss curves and their convergence trends are illustrated in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8</bold></xref>. Throughout the training process, the loss consistently decreased while accuracy improved, eventually leading to model convergence. Specifically, the trajectories of the multi-objective losses are detailed in <xref ref-type="fig" rid="f8"><bold>Figure&#xa0;8A</bold></xref>. The loss exhibits a steady decline until stabilizing at approximately 80 epochs, with the model achieving peak accuracy at epoch 88. Evaluations on the testing set confirm the model&#x2019;s robust classification performance.</p>
<p>Simultaneously, as illustrated in <xref ref-type="fig" rid="f9"><bold>Figure&#xa0;9</bold></xref>, while the CE loss baseline provides marginal class separation, our proposed method demonstrates superior capability in distinguishing base classes and integrating novel classes with minimal feature overlap.</p>
<fig id="f9" position="float">
<label>Figure&#xa0;9</label>
<caption>
<p>The testing loss and accuracy results of our model and with only CE loss.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g009.tif">
<alt-text content-type="machine-generated">Two line graphs compare testing loss and accuracy over 100 epochs. The left graph shows testing loss, while the right shows testing accuracy. &#x201c;Our&#x201d; method, in blue, initially shows higher loss but stabilizes lower than the &#x201c;CE&#x201d; method, in orange. For accuracy, &#x201c;Our&#x201d; method consistently achieves higher values compared to &#x201c;CE&#x201d;.</alt-text>
</graphic></fig>
</sec>
<sec id="s4_4_3">
<label>4.4.3</label>
<title>Visualization of class separation</title>
<p>To verify the effectiveness of our model, we visualize the identification results using a scatter plot, as shown in <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10</bold></xref>. The horizontal axis represents the True Labels, while the vertical axis denotes the Predicted Labels. In this visualization, points aligned closely with the diagonal indicate accurate classification performance. Conversely, points deviating from the diagonal represent misclassifications, where the magnitude of deviation highlights the discrepancy between the predicted and ground truth labels.</p>
<fig id="f10" position="float">
<label>Figure&#xa0;10</label>
<caption>
<p>The scatter plot of different classes. <bold>(A)</bold> is our model, and <bold>(B)</bold> is without high-frequency and low-frequency enhanced images. The horizontal axis represents the true labels (True Labels), and the vertical axis represents the predicted labels (Predictions). The color of each point represents a different class, and each color uniquely corresponds to a class.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g010.tif">
<alt-text content-type="machine-generated">Scatter plots labeled (A) and (B) show predictions versus true labels for 28 classes with a color-coded legend. Both plots demonstrate a positive correlation, with scattered points in various colors representing different classes. Plot (A) has a tighter cluster of points compared to plot (B), which exhibits a wider spread, particularly among higher classes. Each class maintains a distinct color, enabling identification across the plots.</alt-text>
</graphic></fig>
<p>In <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10A</bold></xref>, the points are predominantly clustered along the diagonal, indicating that the model achieves robust overall classification performance. In contrast, while <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10B</bold></xref> exhibits some alignment with the diagonal, a significantly larger number of points deviate from it. This dispersion is particularly pronounced in specific categories, such as classes 7 and 8, indicating higher misclassification rates. For the newly added classes, <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10A</bold></xref> maintains relatively high accuracy despite occasional errors. Conversely, <xref ref-type="fig" rid="f10"><bold>Figure&#xa0;10B</bold></xref> reveals a marked decline in performance for these later classes, evidenced by a substantial increase in misclassified points and greater deviations from the diagonal. In summary, our method leverages a frequency-based separation strategy: low-frequency components extract fundamental structural features, while high-frequency components capture fine-grained details. This approach enhances inter-class separability and minimizes the interference of new classes on existing representations. Consequently, our method demonstrates significant advantages, exhibiting superior accuracy and robustness.</p>
</sec>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Comparison with state of the arts</title>
<sec id="s4_5_1">
<label>4.5.1</label>
<title>Our dataset</title>
<p>To evaluate the accuracy of our model, we compare it with state-of-the-art FSCIL models. The experimental results are shown in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>The results of the comparison of the state-of-the-art FSCIL models.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Methods</th>
<th valign="middle" rowspan="2" align="center">Base Accuracy</th>
<th valign="middle" colspan="4" align="center">Accuracy in each sequence (%)</th>
<th valign="middle" rowspan="2" align="center">HM (%)</th>
<th valign="middle" rowspan="2" align="center">Improve (%)</th>
</tr>
<tr>
<th valign="middle" align="center">1</th>
<th valign="middle" align="center">2</th>
<th valign="middle" align="center">3</th>
<th valign="middle" align="center">4</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">iCaRL (<xref ref-type="bibr" rid="B27">Rebuffi et&#xa0;al., 2017</xref>)</td>
<td valign="middle" align="center">70.542</td>
<td valign="middle" align="center">67.490</td>
<td valign="middle" align="center">60.371</td>
<td valign="middle" align="center">56.423</td>
<td valign="middle" align="center">50.812</td>
<td valign="middle" align="center">58.774</td>
<td valign="middle" align="center">+27.825</td>
</tr>
<tr>
<td valign="middle" align="center">FSLL (<xref ref-type="bibr" rid="B23">Mazumder et&#xa0;al., 2021</xref>)</td>
<td valign="middle" align="center">91.421</td>
<td valign="middle" align="center">87.310</td>
<td valign="middle" align="center">84.037</td>
<td valign="middle" align="center">77.034</td>
<td valign="middle" align="center">69.532</td>
<td valign="middle" align="center">75.023</td>
<td valign="middle" align="center">+11.576</td>
</tr>
<tr>
<td valign="middle" align="center">C-FSCIL (<xref ref-type="bibr" rid="B14">Hersche et&#xa0;al., 2022</xref>)</td>
<td valign="middle" align="center">92.040</td>
<td valign="middle" align="center">85.403</td>
<td valign="middle" align="center">82.760</td>
<td valign="middle" align="center">75.351</td>
<td valign="middle" align="center">70.130</td>
<td valign="middle" align="center">78.411</td>
<td valign="middle" align="center">+8.188</td>
</tr>
<tr>
<td valign="middle" align="center">FACT (<xref ref-type="bibr" rid="B47">Zhou et&#xa0;al., 2022</xref>)</td>
<td valign="middle" align="center">94.051</td>
<td valign="middle" align="center">87.494</td>
<td valign="middle" align="center">83.057</td>
<td valign="middle" align="center">76.157</td>
<td valign="middle" align="center">70.426</td>
<td valign="middle" align="center">79.284</td>
<td valign="middle" align="center">+7.315</td>
</tr>
<tr>
<td valign="middle" align="center">SAVC (<xref ref-type="bibr" rid="B30">Song et&#xa0;al., 2023</xref>)</td>
<td valign="middle" align="center">94.081</td>
<td valign="middle" align="center">90.014</td>
<td valign="middle" align="center">82.729</td>
<td valign="middle" align="center">79.747</td>
<td valign="middle" align="center">75.071</td>
<td valign="middle" align="center">84.328</td>
<td valign="middle" align="center">+2.271</td>
</tr>
<tr>
<td valign="middle" align="center">Wang (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2024</xref>)</td>
<td valign="middle" align="center">94.132</td>
<td valign="middle" align="center">88.093</td>
<td valign="middle" align="center">83.282</td>
<td valign="middle" align="center">76.421</td>
<td valign="middle" align="center">73.579</td>
<td valign="middle" align="center">83.101</td>
<td valign="middle" align="center">+3.498</td>
</tr>
<tr>
<td valign="middle" align="center"><bold>Our</bold></td>
<td valign="middle" align="center"><bold>95.000</bold></td>
<td valign="middle" align="center"><bold>91.701</bold></td>
<td valign="middle" align="center"><bold>84.873</bold></td>
<td valign="middle" align="center"><bold>82.516</bold></td>
<td valign="middle" align="center"><bold>78.906</bold></td>
<td valign="middle" align="center">86.599</td>
<td valign="middle" align="center"/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best result among all methods.</p>
</table-wrap-foot>
</table-wrap>
<p>Our proposed method achieves a peak accuracy of 95.000% on the base classes, surpassing all existing baselines. Across subsequent incremental sessions, it maintains robust performance, recording accuracies of 91.701% in the first session and 78.906% in the fourth. Notably, catastrophic forgetting is mitigated markedly more effectively than in competing approaches.</p>
<p>While most methods suffer performance degradation as new classes are introduced, distinct patterns emerge. iCaRL exhibits the weakest performance, starting with a base accuracy of 70.542% and declining rapidly, resulting in the lowest Harmonic Mean (HM) of 58.774%. Although FSLL, C-FSCIL, and FACT achieve respectable base accuracies (91.421%, 92.040%, and 94.051%, respectively), they experience sharp drops in later sessions, yielding HM scores below 80%. SAVC and Wang&#x2019;s method demonstrate relatively stronger resilience, with HMs of 84.328% and 83.101%, respectively. Nevertheless, our approach consistently outperforms these leading methods across all sessions, securing the highest HM of 86.599%.</p>
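<p>For reference, the HM metric balances accuracy on the base classes against accuracy on the newly learned classes, so a method cannot obtain a high score by sacrificing one for the other. A minimal sketch of the standard two-term formulation used in the FSCIL literature is given below; the exact accuracies combined in our tables follow the experimental protocol described above, so this snippet is illustrative rather than a reproduction of our evaluation code.</p>
<code language="python">def harmonic_mean(acc_base, acc_new):
    """Harmonic mean of base-class and novel-class accuracy (both in %).

    The result is dominated by the weaker of the two terms, which is why it
    is preferred over a plain average when measuring forgetting."""
    if acc_base + acc_new == 0:
        return 0.0
    return 2.0 * acc_base * acc_new / (acc_base + acc_new)
</code>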
<p>This superior performance is attributed to our frequency decomposition strategy. By separating images into low-frequency components (capturing global structural features) and high-frequency components (preserving fine-grained details), we generate a detail-enhanced discriminative representation. This mechanism bolsters both base class separability and novel class generalization. In summary, our model retains base class knowledge while adapting to new sequences with minimal degradation, marking a significant advancement over state-of-the-art methods in solving the FSCIL challenge.</p>
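<p>As an illustration of the decomposition described above, the following minimal sketch separates a grayscale image into low- and high-frequency components with a circular mask in the Fourier domain. The cut-off radius <monospace>r</monospace> and the use of an FFT-based ideal filter are assumptions for illustration; the filter actually used in our implementation may differ.</p>
<code language="python">import numpy as np

def frequency_split(img, r=16):
    """Split a grayscale image (H x W array) into a low-frequency component
    (global structure) and a high-frequency component (fine details)."""
    f = np.fft.fftshift(np.fft.fft2(img))
    h, w = img.shape
    yy, xx = np.ogrid[:h, :w]
    dist = np.sqrt((yy - h / 2.0) ** 2 + (xx - w / 2.0) ** 2)
    mask = np.less_equal(dist, r)          # True inside the low-frequency disc
    low = np.fft.ifft2(np.fft.ifftshift(f * mask)).real
    high = np.fft.ifft2(np.fft.ifftshift(f * (1 - mask))).real
    return low, high
</code>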
</sec>
<sec id="s4_5_2">
<label>4.5.2</label>
<title>Visualization of class activation maps</title>
<p>Class Activation Maps (CAMs) are essential for interpreting model decisions by highlighting influential image regions. For each instance, the original image is shown with its corresponding CAMs in <xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref>. The color intensity represents the activation level, signifying the importance of each region in the model&#x2019;s classification result.</p>
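<p>For completeness, a minimal sketch of how such a map can be computed for a convolutional classifier is given below. It follows the classic CAM formulation, weighting the final convolutional feature maps by the classifier weights of the target class; the array names and shapes are illustrative and do not correspond to identifiers in our released code.</p>
<code language="python">import numpy as np

def class_activation_map(feature_maps, fc_weights, class_idx):
    """Classic CAM: class-weighted sum of the last convolutional feature maps.

    feature_maps : array of shape (C, H, W) from the final conv layer
    fc_weights   : array of shape (num_classes, C) from the linear classifier
    class_idx    : index of the class whose evidence is visualized
    """
    cam = np.tensordot(fc_weights[class_idx], feature_maps, axes=1)  # (H, W)
    cam = np.maximum(cam, 0.0)              # keep positive evidence only
    cam = cam / (cam.max() + 1e-8)          # normalize to [0, 1] for display
    return cam
</code>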
<fig id="f11" position="float">
<label>Figure&#xa0;11</label>
<caption>
<p>Visualization of class activation maps for each class. One example per class is randomly selected. The first row shows the original image, the second row the heatmap of Wang&#x2019;s (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2024</xref>) method, and the last row ours. The results are organized into <bold>(A, B)</bold>: <bold>(A)</bold> shows the base classes and <bold>(B)</bold> the incremental classes.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g011.tif">
<alt-text content-type="machine-generated">Twelve sets of images are arranged in two rows labeled (A) and (B), each with three rows: &#x201c;Img,&#x201d; &#x201c;Wang,&#x201d; and &#x201c;Our.&#x201d; The top row shows original images of objects like huajiao or chuanbeimu. The middle row displays the &#x201c;Wang&#x201d; method, and the bottom row shows the &#x201c;Our&#x201d; method, both using color maps to visualize data or differences on the objects. Each set numbered one to twelve demonstrates a comparison among the three rows, highlighting visual variations across different methods.</alt-text>
</graphic></fig>
<p>From <xref ref-type="fig" rid="f11"><bold>Figure&#xa0;11</bold></xref>, a systematic comparison across both base and incremental classes reveals a consistent pattern of superior performance by our method. Our approach localizes the target objects significantly more accurately, with activations that adhere tightly to object boundaries while effectively suppressing background noise. For instance, in items (1), (4), (8), and (10), the method of (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2024</xref>) produces diffuse activations that often spill into the background or focus on restricted, peripheral regions. In contrast, our method generates heatmaps that are precisely centered on the targets, covering their salient regions more effectively. Furthermore, our model consistently yields more comprehensive activation maps that encompass the entire object, suggesting the acquisition of a holistic representation. This is particularly evident in items (2), (5), (7), and (11). Whereas the method of (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2024</xref>) tends to fixate on local textures or edges, our model captures the full semantic structure of the object. This holistic understanding is crucial for robust classification, rendering the model less susceptible to variations in orientation or partial occlusion. Moreover, the heatmaps generated by our method exhibit a more concentrated focus on the objects&#x2019; discriminative regions. In contrast, the activations of the Wang (<xref ref-type="bibr" rid="B40">Wang et&#xa0;al., 2024</xref>) model appear scattered and less intense, as evident in examples (5), (9), and (12). Our model, conversely, produces strong, focused activations localized on the core features of the objects. This indicates that our approach more effectively identifies key predictive features and is less prone to relying on spurious image correlations. These qualitative results strongly support our hypothesis that the proposed methodology facilitates learning more robust and interpretable feature representations. By generating more complete and accurately localized heatmaps, our model demonstrates a deeper semantic understanding of the image content and enhanced mitigation of catastrophic forgetting.</p>
</sec>
<sec id="s4_5_3">
<label>4.5.3</label>
<title>Chinese medicine dataset</title>
<p>We compare our method with other state-of-the-art methods on the public Chinese Medicine and Medicinal Leaf datasets, and the comparison results are shown in <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref>. In <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12A</bold></xref>, there are 8 base classes, each subsequent incremental session contains 4 classes, and the N-way is set to 3. Our model achieves the highest accuracy among the compared mainstream methods. In <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12B</bold></xref>, the model is initialized with 60 base classes, followed by incremental sessions of 8 classes each, with 3-shot samples per class.</p>
<fig id="f12" position="float">
<label>Figure&#xa0;12</label>
<caption>
<p>Comparison with the state-of-the-art on two public datasets: <bold>(A)</bold> Chinese Medicine and <bold>(B)</bold> Medicinal Leaf. Error bars indicate the standard deviation and are used to visualize the performance variance and stability of each method.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1730047-g012.tif">
<alt-text content-type="machine-generated">Two line graphs compare accuracy over several stages for different methods. Graph A shows accuracy from stages zero to four for methods iCaRL, FSLL, FACT, C-FSCIL, SAVC, Wang, and Our. Graph B extends stages to eight, displaying similar trends. Overall, accuracy tends to decline across stages, with iCaRL generally showing the lowest accuracy compared to other methods.</alt-text>
</graphic></fig>
<p>As observed in <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref>, iCaRL exhibits the most precipitous performance decline, suggesting that its replay-based strategy lacks the stability required for FSCIL tasks and struggles to mitigate interference from novel classes. FSLL attains high initial accuracy but suffers a sharp drop in later stages, indicating that despite early effectiveness, its long-term generalization capability is limited. FACT and C-FSCIL display a more gradual decay, reflecting stronger knowledge retention, although performance degradation persists. Conversely, SAVC and the method by Wang et&#xa0;al. maintain relatively stable performance, particularly in later sessions. Furthermore, to evaluate the statistical reliability of our results, we add error bars representing the standard deviation in <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12</bold></xref>. Our method&#x2019;s error bars remain consistently compact across the incremental sessions in both panels (A) and (B). This low variance indicates that our FGDE model is highly robust to initialization differences and data sampling fluctuations, maintaining stable performance even as the number of classes increases. In contrast, baseline methods such as iCaRL and FSLL exhibit larger error bars in several sessions (e.g., Sessions 0 and 3 in <xref ref-type="fig" rid="f12"><bold>Figure&#xa0;12B</bold></xref>), suggesting higher instability. Notably, our method outperforms all competing approaches, achieving consistent improvement and minimal degradation. By leveraging fine-grained feature comparison and multi-semantic discrimination, our approach significantly enhances adaptability to novel categories while effectively mitigating catastrophic forgetting.</p>
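<p>The error bars themselves are obtained in the usual way: each method is evaluated over repeated runs, and the per-session mean and standard deviation are plotted. A minimal sketch is given below, assuming <monospace>acc_runs</monospace> is an array of shape (num_runs, num_sessions) holding the accuracies of one method; the function and variable names are illustrative.</p>
<code language="python">import numpy as np
import matplotlib.pyplot as plt

def plot_sessions_with_errorbars(acc_runs, label):
    """Plot mean accuracy per session with one standard deviation as error bars.

    acc_runs : array of shape (num_runs, num_sessions), accuracy in %
    """
    mean = acc_runs.mean(axis=0)
    std = acc_runs.std(axis=0)
    sessions = np.arange(acc_runs.shape[1])
    plt.errorbar(sessions, mean, yerr=std, marker="o", capsize=3, label=label)
    plt.xlabel("Session")
    plt.ylabel("Accuracy (%)")
    plt.legend()
</code>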
</sec>
</sec>
<sec id="s4_6">
<label>4.6</label>
<title>Horizontal comparison and performance trade-offs</title>
<sec id="s4_6_1" sec-type="results">
<label>4.6.1</label>
<title>Results of different backbones</title>
<p>To select an appropriate feature-extraction backbone for our model, several backbones are compared to assess their impact on accuracy and efficiency. We also evaluate the model with a hybrid transformer structure, MBConv (<xref ref-type="bibr" rid="B15">Howard et&#xa0;al., 2017</xref>).</p>
<p>Using running time as a key feasibility metric, we compared various backbones in <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>. ResNet20 and ResNet50 showed lower accuracy and longer running times than ResNet18. VGG-16&#x2019;s plainer architecture limited its feature extraction, reducing base class accuracy, while EfficientNet and DenseNet121 proved computationally expensive due to their complexity. Results from MBConv further highlighted the limitations of hybrid transformer structures in this setting. Although VGG-19 achieved a 1.322% higher HM, its slower running time made it less viable. Thus, ResNet18 was selected as offering the best overall trade-off between accuracy and efficiency.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>The results of the comparison of the different backbones.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Backbone</th>
<th valign="middle" rowspan="2" align="center">Base accuracy</th>
<th valign="middle" colspan="4" align="center">Accuracy in each sequence (%)</th>
<th valign="middle" rowspan="2" align="center">HM (%)</th>
<th valign="middle" rowspan="2" align="center">Improve (%)</th>
<th valign="middle" rowspan="2" align="center">Time (min)</th>
</tr>
<tr>
<th valign="middle" align="center">1</th>
<th valign="middle" align="center">2</th>
<th valign="middle" align="center">3</th>
<th valign="middle" align="center">4</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">VGG-16 (<xref ref-type="bibr" rid="B29">Simonyan and Zisserman, 2014</xref>)</td>
<td valign="middle" align="center">70.250</td>
<td valign="middle" align="center">67.365</td>
<td valign="middle" align="center">64.439</td>
<td valign="middle" align="center">62.742</td>
<td valign="middle" align="center">60.474</td>
<td valign="middle" align="center">63.755</td>
<td valign="middle" align="center">+22.844</td>
<td valign="middle" align="center">251</td>
</tr>
<tr>
<td valign="middle" align="center">VGG-19 (<xref ref-type="bibr" rid="B29">Simonyan and Zisserman, 2014</xref>)</td>
<td valign="middle" align="center">90.865</td>
<td valign="middle" align="center">89.423</td>
<td valign="middle" align="center"><bold>88.702</bold></td>
<td valign="middle" align="center"><bold>86.899</bold></td>
<td valign="middle" align="center"><bold>86.659</bold></td>
<td valign="middle" align="center"><bold>87.921</bold></td>
<td valign="middle" align="center">-1.322</td>
<td valign="middle" align="center">292</td>
</tr>
<tr>
<td valign="middle" align="center">ResNet20 (<xref ref-type="bibr" rid="B13">He et&#xa0;al., 2016</xref>)</td>
<td valign="middle" align="center">86.875</td>
<td valign="middle" align="center">71.875</td>
<td valign="middle" align="center">72.132</td>
<td valign="middle" align="center">68.275</td>
<td valign="middle" align="center">65.767</td>
<td valign="middle" align="center">69.512</td>
<td valign="middle" align="center">+17.087</td>
<td valign="middle" align="center">153</td>
</tr>
<tr>
<td valign="middle" align="center">ResNet50 (<xref ref-type="bibr" rid="B13">He et&#xa0;al., 2016</xref>)</td>
<td valign="middle" align="center">93.510</td>
<td valign="middle" align="center">86.096</td>
<td valign="middle" align="center">82.841</td>
<td valign="middle" align="center">74.361</td>
<td valign="middle" align="center">72.879</td>
<td valign="middle" align="center">79.044</td>
<td valign="middle" align="center">+7.555</td>
<td valign="middle" align="center">198</td>
</tr>
<tr>
<td valign="middle" align="center">EfficientNet-B0 (<xref ref-type="bibr" rid="B32">Tan, 2019</xref>)</td>
<td valign="middle" align="center">90.625</td>
<td valign="middle" align="center">84.352</td>
<td valign="middle" align="center">81.944</td>
<td valign="middle" align="center">76.236</td>
<td valign="middle" align="center">73.377</td>
<td valign="middle" align="center">78.977</td>
<td valign="middle" align="center">+7.622</td>
<td valign="middle" align="center">231</td>
</tr>
<tr>
<td valign="middle" align="center">DenseNet121 (<xref ref-type="bibr" rid="B17">Huang et&#xa0;al., 2017</xref>)</td>
<td valign="middle" align="center">90.745</td>
<td valign="middle" align="center">79.973</td>
<td valign="middle" align="center">76.389</td>
<td valign="middle" align="center">71.213</td>
<td valign="middle" align="center">70.911</td>
<td valign="middle" align="center">74.622</td>
<td valign="middle" align="center">+11.977</td>
<td valign="middle" align="center">375</td>
</tr>
<tr>
<td valign="middle" align="center">MBConv (<xref ref-type="bibr" rid="B15">Howard et&#xa0;al., 2017</xref>)</td>
<td valign="middle" align="center">88.094</td>
<td valign="middle" align="center">84.057</td>
<td valign="middle" align="center">78.157</td>
<td valign="middle" align="center">75.426</td>
<td valign="middle" align="center">70.361</td>
<td valign="middle" align="center">77.000</td>
<td valign="middle" align="center">+9.599</td>
<td valign="middle" align="center">652</td>
</tr>
<tr>
<td valign="middle" align="center"><bold>ResNet18</bold> (<xref ref-type="bibr" rid="B13">He et&#xa0;al., 2016</xref>)</td>
<td valign="middle" align="center"><bold>95.000</bold></td>
<td valign="middle" align="center"><bold>91.701</bold></td>
<td valign="middle" align="center">84.873</td>
<td valign="middle" align="center">82.516</td>
<td valign="middle" align="center">78.906</td>
<td valign="middle" align="center">86.599</td>
<td valign="middle" align="center"/>
<td valign="middle" align="center"><bold>131</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best result among all backbones.</p>
</table-wrap-foot>
</table-wrap>
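<p>As an illustration of how the running-time column can be obtained when screening candidate backbones, the sketch below times the average forward pass of several torchvision models on random inputs. The candidate list, batch size, and input resolution are assumptions for illustration and do not reproduce the exact measurement protocol used for <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref>.</p>
<code language="python">import time
import torch
import torchvision.models as models

def time_backbone(model, n_batches=20, batch_size=32, size=128):
    """Average forward-pass time (seconds per batch) on random inputs."""
    model.eval()
    x = torch.randn(batch_size, 3, size, size)
    with torch.no_grad():
        start = time.time()
        for _ in range(n_batches):
            model(x)
    return (time.time() - start) / n_batches

candidates = {"resnet18": models.resnet18(), "resnet50": models.resnet50(),
              "vgg16": models.vgg16(), "densenet121": models.densenet121()}
for name, net in candidates.items():
    print(name, round(time_backbone(net), 4))
</code>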
</sec>
<sec id="s4_6_2" sec-type="results">
<label>4.6.2</label>
<title>Results of different number of classes</title>
<p>To evaluate the learning and generalization ability of our model on incremental classes, experiments with different numbers of base classes are performed. This paper compares 12 base classes with 16 base classes. With 12 base classes, each subsequent incremental session contains 4 classes and N-way is set to 4; with 16 base classes, each session contains 3 classes and N-way is set to 3. The experimental results are shown in <xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref>.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>The results of the comparison of the different numbers of classes.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Base classes</th>
<th valign="middle" rowspan="2" align="center">Base accuracy</th>
<th valign="middle" colspan="4" align="center">Accuracy in each sequence (%)</th>
<th valign="middle" rowspan="2" align="center">HM (%)</th>
<th valign="middle" rowspan="2" align="center">Improve (%)</th>
</tr>
<tr>
<th valign="middle" align="center">1</th>
<th valign="middle" align="center">2</th>
<th valign="middle" align="center">3</th>
<th valign="middle" align="center">4</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">12</td>
<td valign="middle" align="center"><bold>96.875</bold></td>
<td valign="middle" align="center">81.971</td>
<td valign="middle" align="center">74.375</td>
<td valign="middle" align="center">74.041</td>
<td valign="middle" align="center">65.98</td>
<td valign="middle" align="center">74.092</td>
<td valign="middle" align="center">+12.507</td>
</tr>
<tr>
<td valign="middle" align="center">16</td>
<td valign="middle" align="center">95.000</td>
<td valign="middle" align="center"><bold>92.509</bold></td>
<td valign="middle" align="center"><bold>86.071</bold></td>
<td valign="middle" align="center"><bold>80.234</bold></td>
<td valign="middle" align="center"><bold>79.072</bold></td>
<td valign="middle" align="center"><bold>86.599</bold></td>
<td valign="middle" align="center"/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best result among all settings.</p>
</table-wrap-foot>
</table-wrap>
<p>With 12 base classes, the base accuracy is 1.875% higher than with 16 base classes. However, the 12-base-class setting performs 10.380% worse than the 16-base-class setting across the incremental sequences. These results demonstrate that a greater number of base classes enhances the ability of the model to learn and capture fine-grained features more effectively. Accordingly, we set the number of base classes to 16 and N-way to 3.</p>
</sec>
<sec id="s4_6_3" sec-type="results">
<label>4.6.3</label>
<title>Results of different sizes of cropping</title>
<p>In the original settings, images are initially cropped to enrich the fine-grained feature space and increase image diversity. To evaluate the impact of cropping size on model performance, our experiments examine several cropping sizes. The comparison results are shown in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>The results of the comparison of the different sizes of cropping.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Crop size</th>
<th valign="middle" rowspan="2" align="center">Base accuracy</th>
<th valign="middle" colspan="4" align="center">Accuracy in each sequence (%)</th>
<th valign="middle" rowspan="2" align="center">HM (%)</th>
<th valign="middle" rowspan="2" align="center">Improve (%)</th>
</tr>
<tr>
<th valign="middle" align="center">1</th>
<th valign="middle" align="center">2</th>
<th valign="middle" align="center">3</th>
<th valign="middle" align="center">4</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center"><bold>128</bold></td>
<td valign="middle" align="center"><bold>95.000</bold></td>
<td valign="middle" align="center"><bold>92.509</bold></td>
<td valign="middle" align="center"><bold>86.071</bold></td>
<td valign="middle" align="center"><bold>80.234</bold></td>
<td valign="middle" align="center"><bold>79.072</bold></td>
<td valign="middle" align="center"><bold>86.599</bold></td>
<td valign="middle" align="center"/>
</tr>
<tr>
<td valign="middle" align="center">96</td>
<td valign="middle" align="center">92.909</td>
<td valign="middle" align="center">85.849</td>
<td valign="middle" align="center">81.771</td>
<td valign="middle" align="center">75.184</td>
<td valign="middle" align="center">71.956</td>
<td valign="middle" align="center">78.690</td>
<td valign="middle" align="center">+7.909</td>
</tr>
<tr>
<td valign="middle" align="center">64</td>
<td valign="middle" align="center">90.625</td>
<td valign="middle" align="center">84.352</td>
<td valign="middle" align="center">81.944</td>
<td valign="middle" align="center">76.236</td>
<td valign="middle" align="center">73.377</td>
<td valign="middle" align="center">78.977</td>
<td valign="middle" align="center">+7.622</td>
</tr>
<tr>
<td valign="middle" align="center">32</td>
<td valign="middle" align="center">90.745</td>
<td valign="middle" align="center">79.973</td>
<td valign="middle" align="center">76.389</td>
<td valign="middle" align="center">71.213</td>
<td valign="middle" align="center">70.911</td>
<td valign="middle" align="center">74.622</td>
<td valign="middle" align="center">+11.977</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best result among all crop sizes.</p>
</table-wrap-foot>
</table-wrap>
<p>From <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>, it is evident that crop size significantly affects feature diversity. The 128-crop size achieves the highest discrimination, surpassing the 96-crop and 32-crop settings by 7.909% and 11.980%. Additionally, the similar HM scores for crop sizes 96 and 64 suggest a consistent feature distribution at these scales. Given these results, we fixed the crop size at 128 to ensure optimal model performance.</p>
</sec>
<sec id="s4_6_4" sec-type="results">
<label>4.6.4</label>
<title>Results of different numbers of base sequences</title>
<p>To verify the effectiveness of our network for few-shot images, we also compared the impact of different numbers of base sequences on model performance. To ensure fairness, the remaining parameters remain unchanged in the comparison. The experimental results are shown in <xref ref-type="table" rid="T6"><bold>Table&#xa0;6</bold></xref>.</p>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>The results of the comparison for the different numbers of base sequences.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" rowspan="2" align="center">Number</th>
<th valign="middle" rowspan="2" align="center">Base accuracy</th>
<th valign="middle" colspan="4" align="center">Accuracy in each sequence (%)</th>
<th valign="middle" rowspan="2" align="center">HM (%)</th>
<th valign="middle" rowspan="2" align="center">Improve (%)</th>
</tr>
<tr>
<th valign="middle" align="center">1</th>
<th valign="middle" align="center">2</th>
<th valign="middle" align="center">3</th>
<th valign="middle" align="center">4</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="center">300</td>
<td valign="middle" align="center"><bold>95.312</bold></td>
<td valign="middle" align="center">88.916</td>
<td valign="middle" align="center">85.822</td>
<td valign="middle" align="center">76.565</td>
<td valign="middle" align="center">72.423</td>
<td valign="middle" align="center">80.932</td>
<td valign="middle" align="center">+5.667</td>
</tr>
<tr>
<td valign="middle" align="center">250</td>
<td valign="middle" align="center"><bold>95.533</bold></td>
<td valign="middle" align="center">86.809</td>
<td valign="middle" align="center">82.350</td>
<td valign="middle" align="center">72.891</td>
<td valign="middle" align="center">70.201</td>
<td valign="middle" align="center">78.063</td>
<td valign="middle" align="center">+8.536</td>
</tr>
<tr>
<td valign="middle" align="center">200</td>
<td valign="middle" align="center">95.000</td>
<td valign="middle" align="center"><bold>92.509</bold></td>
<td valign="middle" align="center"><bold>86.071</bold></td>
<td valign="middle" align="center"><bold>80.234</bold></td>
<td valign="middle" align="center"><bold>79.072</bold></td>
<td valign="middle" align="center"><bold>86.599</bold></td>
<td valign="middle" align="center"/>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold values indicate the best result among all settings.</p>
</table-wrap-foot>
</table-wrap>
<p>The results indicate that the accuracy on the base classes remains relatively stable regardless of the number of base sequences. However, for the identification of incremental classes, the configuration with 200 base sequences yields an HM 8.536% higher than that with 250 base sequences and 5.667% higher than that with 300. These findings suggest that 200 base sequences offer the optimal balance between feature diversity and model generalization.</p>
</sec>
</sec>
</sec>
<sec id="s5" sec-type="conclusions">
<label>5</label>
<title>Conclusion</title>
<p>In this paper, we proposed FGDE to address the challenge of FSCIL within the context of fine-grained images. This method innovatively combines low-frequency and high-frequency components with dual-domain contrastive learning to enhance feature discriminability. Unlike existing methods that struggle with subtle inter-class differences, our approach effectively sharpens decision boundaries while maintaining the stability of base classes. Extensive experiments on both proprietary and public TCM datasets demonstrate that FGDE outperforms state-of-the-art methods, offering a robust solution for balancing plasticity and stability. In future work, we aim to further refine identification performance for highly fine-grained categories. Promising directions include integrating diffusion models to address data imbalance via high-fidelity sample generation and employing Graph Convolutional Networks (GCNs) to capture neighborhood structures, thereby further alleviating catastrophic forgetting.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>CT: Formal Analysis, Investigation, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. ZQ: Methodology, Software, Writing &#x2013; review &amp; editing. ZT: Resources, Software, Writing &#x2013; review &amp; editing. YH: Funding acquisition, Methodology, Supervision, Writing &#x2013; review &amp; editing. KL: Funding acquisition, Project administration, Resources, Supervision, Writing &#x2013; review &amp; editing.</p></sec>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Ahmed</surname> <given-names>N.</given-names></name>
<name><surname>Kukleva</surname> <given-names>A.</given-names></name>
<name><surname>Schiele</surname> <given-names>B.</given-names></name>
</person-group> (<year>2024</year>). &#x201c;
<article-title>OrCo: towards better generalization via orthogonality and contrast for few-shot class-incremental learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>. (CVPR), (<publisher-loc>Washington</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>28762</fpage>&#x2013;<lpage>28771</lpage>.
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Arg&#xfc;eso</surname> <given-names>D.</given-names></name>
<name><surname>Picon</surname> <given-names>A.</given-names></name>
<name><surname>Irusta</surname> <given-names>U.</given-names></name>
<name><surname>Medela</surname> <given-names>A.</given-names></name>
<name><surname>San-Emeterio</surname> <given-names>M. G.</given-names></name>
<name><surname>Bereciartua</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). 
<article-title>Few-Shot Learning approach for plant disease classification using images taken in the field</article-title>. <source>Comput. Electron. Agr.</source> <volume>175</volume>, <fpage>105542</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2020.105542</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Armijos</surname> <given-names>C.</given-names></name>
<name><surname>Ram&#xed;rez</surname> <given-names>J.</given-names></name>
<name><surname>Vidari</surname> <given-names>G.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Poorly investigated Ecuadorian medicinal plants</article-title>. <source>Plants</source> <volume>11</volume>, <fpage>1590</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/plants11121590</pub-id>, PMID: <pub-id pub-id-type="pmid">35736741</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Attri</surname> <given-names>I.</given-names></name>
<name><surname>Awasthi</surname> <given-names>L. K.</given-names></name>
<name><surname>Sharma</surname> <given-names>T. P.</given-names></name>
<name><surname>Rathee</surname> <given-names>P.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>A review of deep learning techniques used in agriculture</article-title>. <source>Ecol. Inform.</source> <volume>77</volume>, <fpage>102217</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2023.102217</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cai</surname> <given-names>Z.</given-names></name>
<name><surname>He</surname> <given-names>M.</given-names></name>
<name><surname>Li</surname> <given-names>C.</given-names></name>
<name><surname>Qi</surname> <given-names>H.</given-names></name>
<name><surname>Bai</surname> <given-names>R.</given-names></name>
<name><surname>Yang</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). 
<article-title>Identification of chrysanthemum using hyperspectral imaging based on few-shot class incremental learning</article-title>. <source>Comput. Electron. Agr.</source> <volume>215</volume>, <fpage>108371</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2023.108371</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>G.</given-names></name>
<name><surname>Xia</surname> <given-names>Z.</given-names></name>
<name><surname>Ma</surname> <given-names>X.</given-names></name>
<name><surname>Jiang</surname> <given-names>Y.</given-names></name>
<name><surname>He</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>MobileNet-GDR: A lightweight algorithm for grape leaf disease identification based on improved mobileNetV4-small</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>, <elocation-id>1702071</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1702071</pub-id>, PMID: <pub-id pub-id-type="pmid">41281328</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Dvornik</surname> <given-names>N.</given-names></name>
<name><surname>Schmid</surname> <given-names>C.</given-names></name>
<name><surname>Mairal</surname> <given-names>J.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>Diversity with cooperation: Ensemble methods for few-shot classification</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF international conference on computer vision</conf-name>. (CVPR), (<publisher-loc>Long Beach, CA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>3723</fpage>&#x2013;<lpage>3731</lpage>.
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Fei-Fei</surname> <given-names>L.</given-names></name>
<name><surname>Fergus</surname> <given-names>R.</given-names></name>
<name><surname>Perona</surname> <given-names>P.</given-names></name>
</person-group> (<year>2006</year>). 
<article-title>One-shot learning of object categories</article-title>. <source>IEEE T. Pattern Anal.</source> <volume>28</volume>, <fpage>594</fpage>&#x2013;<lpage>611</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TPAMI.2006.79</pub-id>, PMID: <pub-id pub-id-type="pmid">16566508</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Fitzgerald</surname> <given-names>M.</given-names></name>
<name><surname>Heinrich</surname> <given-names>M.</given-names></name>
<name><surname>Booker</surname> <given-names>A.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Medicinal plant analysis: A historical and regional discussion of emergent complex techniques</article-title>. <source>Front. Pharmacol.</source> <volume>10</volume>, <elocation-id>1480</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fphar.2019.01480</pub-id>, PMID: <pub-id pub-id-type="pmid">31998121</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gao</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>H.</given-names></name>
<name><surname>Fu</surname> <given-names>W.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Few-shot learning for image-based bridge damage detection</article-title>. <source>Eng. Appl. Artif. Intel.</source> <volume>126</volume>, <fpage>107078</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.engappai.2023.107078</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Han</surname> <given-names>W.</given-names></name>
<name><surname>Huang</surname> <given-names>K.</given-names></name>
<name><surname>Geng</surname> <given-names>J.</given-names></name>
<name><surname>Jiang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Semi-supervised few-shot class-incremental learning based on dynamic topology evolution</article-title>. <source>Eng. Appl. Artif. Intel.</source> <volume>133</volume>, <fpage>108528</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.engappai.2024.108528</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Fan</surname> <given-names>H.</given-names></name>
<name><surname>Wu</surname> <given-names>Y.</given-names></name>
<name><surname>Xie</surname> <given-names>S.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Momentum contrast for unsupervised visual representation learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. <fpage>9729</fpage>&#x2013;<lpage>9738</lpage>.
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Zhang</surname> <given-names>X.</given-names></name>
<name><surname>Ren</surname> <given-names>S.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>Deep residual learning for image recognition</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>. (CVPR), (<publisher-loc>Las Vegas</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>770</fpage>&#x2013;<lpage>778</lpage>.
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Hersche</surname> <given-names>M.</given-names></name>
<name><surname>Karunaratne</surname> <given-names>G.</given-names></name>
<name><surname>Cherubini</surname> <given-names>G.</given-names></name>
<name><surname>Benini</surname> <given-names>L.</given-names></name>
<name><surname>Sebastian</surname> <given-names>A.</given-names></name>
<name><surname>Rahimi</surname> <given-names>A.</given-names></name>
</person-group> (<year>2022</year>). &#x201c;
<article-title>Constrained few-shot class-incremental learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. (CVPR), (<publisher-loc>New Orleans</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>9057</fpage>&#x2013;<lpage>9067</lpage>.
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Howard</surname> <given-names>A. G.</given-names></name>
<name><surname>Zhu</surname> <given-names>M.</given-names></name>
<name><surname>Chen</surname> <given-names>B.</given-names></name>
<name><surname>Kalenichenko</surname> <given-names>D.</given-names></name>
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Weyand</surname> <given-names>T.</given-names></name>
<etal/>
</person-group>. (<year>2017</year>). &#x201c;
<article-title>MobileNets: efficient convolutional neural networks for mobile vision applications</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition</source>. <volume>26</volume>, <page-range>1&#x2013;9</page-range>. (CVPR), (<publisher-loc>Hawaii</publisher-loc>: 
<publisher-name>IEEE</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1704.04861</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Huang</surname> <given-names>H.</given-names></name>
<name><surname>Geng</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>L.</given-names></name>
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Liu</surname> <given-names>F.</given-names></name>
<name><surname>Peng</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Metabolic profiling and pharmacological evaluation of alkaloids in three Murraya species</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>, <elocation-id>1675533</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1675533</pub-id>, PMID: <pub-id pub-id-type="pmid">41169715</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Huang</surname> <given-names>G.</given-names></name>
<name><surname>Liu</surname> <given-names>Z.</given-names></name>
<name><surname>van der Maaten</surname> <given-names>L.</given-names></name>
<name><surname>Weinberger</surname> <given-names>K. Q.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Densely connected convolutional networks</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>.  (CVPR), (<publisher-loc>Hawaii</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>4700</fpage>&#x2013;<lpage>4708</lpage>.
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Huang</surname> <given-names>M. L.</given-names></name>
<name><surname>Xu</surname> <given-names>Y. X.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Image classification of Chinese medicinal flowers based on convolutional neural network</article-title>. <source>Math. Biosci. Eng.</source> <volume>20</volume>, <fpage>14978</fpage>&#x2013;<lpage>14994</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3934/mbe.2023671</pub-id>, PMID: <pub-id pub-id-type="pmid">37679168</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Kim</surname> <given-names>D. Y.</given-names></name>
<name><surname>Han</surname> <given-names>D. J.</given-names></name>
<name><surname>Seo</surname> <given-names>J.</given-names></name>
<name><surname>Moon</surname> <given-names>J.</given-names></name>
</person-group> (<year>2023</year>). &#x201c;
<article-title>Warping the space: Weight space rotation for class-incremental few-shot learning</article-title>,&#x201d; in <conf-name>The Eleventh International Conference on Learning Representations</conf-name>. (ICLR), (<publisher-loc>Kigali</publisher-loc>: 
<publisher-name>ICLR</publisher-name>). 
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>LeCun</surname> <given-names>Y.</given-names></name>
<name><surname>Bengio</surname> <given-names>Y.</given-names></name>
<name><surname>Hinton</surname> <given-names>G.</given-names></name>
</person-group> (<year>2015</year>). 
<article-title>Deep learning</article-title>. <source>Nature</source> <volume>521</volume>, <fpage>436</fpage>&#x2013;<lpage>444</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/nature14539</pub-id>, PMID: <pub-id pub-id-type="pmid">26017442</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>M.</given-names></name>
<name><surname>Wang</surname> <given-names>D.</given-names></name>
<name><surname>Liu</surname> <given-names>X.</given-names></name>
<name><surname>Zeng</surname> <given-names>Z.</given-names></name>
<name><surname>Lu</surname> <given-names>R.</given-names></name>
<name><surname>Chen</surname> <given-names>B.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;
<article-title>Patchct: Aligning patch set and label set with conditional transport for multi-label image classification</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF International Conference on Computer Vision</conf-name>. (CVPR), (<publisher-loc>Vancouver</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>15348</fpage>&#x2013;<lpage>15358</lpage>.
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Martinetz</surname> <given-names>T.</given-names></name>
<name><surname>Schulten</surname> <given-names>K.</given-names></name>
</person-group> (<year>1991</year>). 
<article-title>A &#x201c;neural-gas&#x201d; network learns topologies</article-title>. <source>Artificial Neural Networks</source>. <volume>1</volume>, <page-range>397&#x2013;402</page-range>.
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Mazumder</surname> <given-names>P.</given-names></name>
<name><surname>Singh</surname> <given-names>P.</given-names></name>
<name><surname>Rai</surname> <given-names>P.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Few-shot lifelong learning</article-title>,&#x201d; in <conf-name>In Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>. (AAAI), (
<publisher-name>AAAI Press</publisher-name>) <fpage>2337</fpage>&#x2013;<lpage>2345</lpage>.
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Pandey</surname> <given-names>A.</given-names></name>
<name><surname>Jain</surname> <given-names>K.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>A robust deep attention dense convolutional neural network for plant leaf disease identification and classification from smart phone captured real world images</article-title>. <source>Ecol. Inform.</source> <volume>70</volume>, <fpage>101725</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2022.101725</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Prudent</surname> <given-names>Y.</given-names></name>
<name><surname>Ennaji</surname> <given-names>A.</given-names></name>
</person-group> (<year>2005</year>). &#x201c;
<article-title>An incremental growing neural gas learns topologies</article-title>,&#x201d; in <conf-name>Proceedings. 2005 IEEE International Joint Conference on Neural Networks, 2005</conf-name>. (IJCNN), (<publisher-loc>Canada, QC</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>1211</fpage>&#x2013;<lpage>1216</lpage>.
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Raichur</surname> <given-names>N. L.</given-names></name>
<name><surname>Heublein</surname> <given-names>L.</given-names></name>
<name><surname>Feigl</surname> <given-names>T.</given-names></name>
<name><surname>R&#xfc;gamer</surname> <given-names>A.</given-names></name>
<name><surname>Mutschler</surname> <given-names>C.</given-names></name>
<name><surname>Ott</surname> <given-names>F.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Bayesian learning-driven prototypical contrastive loss for class-incremental learning</article-title>. <source>Transact. Mach. Learn. Res</source>. <volume>2025</volume>, <fpage>03</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2405.11067</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Rebuffi</surname> <given-names>S. A.</given-names></name>
<name><surname>Kolesnikov</surname> <given-names>A.</given-names></name>
<name><surname>Sperl</surname> <given-names>G.</given-names></name>
<name><surname>Lampert</surname> <given-names>C. H.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>iCaRL: Incremental classifier and representation learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on Computer Vision and Pattern Recognition</conf-name>. (CVPR), (<publisher-loc>Hawaii</publisher-loc>: 
<publisher-name>IEEE</publisher-name>) <fpage>2001</fpage>&#x2013;<lpage>2010</lpage>.
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Rezaei</surname> <given-names>M.</given-names></name>
<name><surname>Diepeveen</surname> <given-names>D.</given-names></name>
<name><surname>Laga</surname> <given-names>H.</given-names></name>
<name><surname>Jones</surname> <given-names>M. G.</given-names></name>
<name><surname>Sohel</surname> <given-names>F.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Plant disease recognition in a low data scenario using few-shot learning</article-title>. <source>Comput. Electron. Agr.</source> <volume>219</volume>, <fpage>108812</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.108812</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Simonyan</surname> <given-names>K.</given-names></name>
<name><surname>Zisserman</surname> <given-names>A.</given-names></name>
</person-group> (<year>2014</year>). 
<article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <source>Comput. Sci</source> arXiv preprint arXiv:1409.1556. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1409.1556</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Song</surname> <given-names>Z.</given-names></name>
<name><surname>Zhao</surname> <given-names>Y.</given-names></name>
<name><surname>Shi</surname> <given-names>Y.</given-names></name>
<name><surname>Peng</surname> <given-names>P.</given-names></name>
<name><surname>Yuan</surname> <given-names>L.</given-names></name>
<name><surname>Tian</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2023</year>). &#x201c;
<article-title>Learning with fantasy: Semantic-aware virtual contrastive constraint for few-shot class-incremental learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. (CVPR), (<publisher-loc>Vancouver</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>24183</fpage>&#x2013;<lpage>24192</lpage>.
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sun</surname> <given-names>M.</given-names></name>
<name><surname>Xu</surname> <given-names>S.</given-names></name>
<name><surname>Mei</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Gu</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>W.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>MicroRNAs in medicinal plants</article-title>. <source>Int. J. Mol. Sci.</source> <volume>23</volume>, <fpage>10477</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/ijms231810477</pub-id>, PMID: <pub-id pub-id-type="pmid">36142389</pub-id>
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tan</surname> <given-names>M.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Efficientnet: Rethinking model scaling for convolutional neural networks</article-title>.<source>International Conference on Machine Learning</source> arXiv preprint arXiv:1905.11946. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1905.11946</pub-id>
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Tan</surname> <given-names>C.</given-names></name>
<name><surname>Tian</surname> <given-names>L.</given-names></name>
<name><surname>Wu</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>K.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Rapid identification of medicinal plants via visual feature-based deep learning</article-title>. <source>Plant Methods</source> <volume>20</volume>, <fpage>81</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13007-024-01202-6</pub-id>, PMID: <pub-id pub-id-type="pmid">38822406</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Tao</surname> <given-names>X.</given-names></name>
<name><surname>Hong</surname> <given-names>X.</given-names></name>
<name><surname>Chang</surname> <given-names>X.</given-names></name>
<name><surname>Dong</surname> <given-names>S.</given-names></name>
<name><surname>Wei</surname> <given-names>X.</given-names></name>
<name><surname>Gong</surname> <given-names>Y.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Few-shot class-incremental learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>. (CVPR), (
<publisher-name>IEEE</publisher-name>), <fpage>12183</fpage>&#x2013;<lpage>12192</lpage>.
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Thella</surname> <given-names>P. K.</given-names></name>
<name><surname>Ulagamuthalvi</surname> <given-names>V.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>An efficient double labelling image segmentation model for leaf pixel extraction for medical plant detection</article-title>. <source>Ann. Romanian Soc. Cell Biol.</source> <volume>22</volume> (<issue>5</issue>), <fpage>2241</fpage>&#x2013;<lpage>2251</lpage>.
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Vani</surname> <given-names>K.</given-names> <suffix>Sree</suffix></name>
<name><surname>Sudharshanam</surname> <given-names>U.</given-names></name>
<name><surname>Mallela Venkata</surname> <given-names>N. K.</given-names></name>
<name><surname>Mandla</surname> <given-names>R.</given-names></name>
<name><surname>Sreenivas</surname> <given-names>A.</given-names></name>
<name><surname>Bedika</surname> <given-names>M.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Smart agriculture: A climate-driven approach to modelling and forecasting fall armyworm populations in maize using machine learning algorithms</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>, <elocation-id>1636412</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1636412</pub-id>, PMID: <pub-id pub-id-type="pmid">41245445</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>C.</given-names></name>
<name><surname>Liu</surname> <given-names>B.</given-names></name>
<name><surname>Liu</surname> <given-names>L.</given-names></name>
<name><surname>Zhu</surname> <given-names>Y.</given-names></name>
<name><surname>Hou</surname> <given-names>J.</given-names></name>
<name><surname>Liu</surname> <given-names>P.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). 
<article-title>A review of deep learning used in the hyperspectral image analysis for agriculture</article-title>. <source>Artif. Intell. Rev.</source> <volume>54</volume>, <fpage>5205</fpage>&#x2013;<lpage>5253</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10462-021-10018-y</pub-id>
</mixed-citation>
</ref>
<ref id="B38">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>X.</given-names></name>
<name><surname>Sun</surname> <given-names>J.</given-names></name>
<name><surname>Tian</surname> <given-names>P.</given-names></name>
<name><surname>Wu</surname> <given-names>M.</given-names></name>
<name><surname>Zhao</surname> <given-names>J.</given-names></name>
<name><surname>Chen</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Intelligent grading of sugarcane leaf disease severity by integrating physiological traits with the SSA-XGBoost algorithm</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>, <elocation-id>1698808</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1698808</pub-id>, PMID: <pub-id pub-id-type="pmid">41169726</pub-id>
</mixed-citation>
</ref>
<ref id="B39">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>W.</given-names></name>
<name><surname>Xu</surname> <given-names>J.</given-names></name>
<name><surname>Fang</surname> <given-names>H.</given-names></name>
<name><surname>Li</surname> <given-names>Z.</given-names></name>
<name><surname>Li</surname> <given-names>M.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Advances and challenges in medicinal plant breeding</article-title>. <source>Plant Sci.</source> <volume>298</volume>, <fpage>110573</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.plantsci.2020.110573</pub-id>, PMID: <pub-id pub-id-type="pmid">32771174</pub-id>
</mixed-citation>
</ref>
<ref id="B40">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>Q. W.</given-names></name>
<name><surname>Zhou</surname> <given-names>D. W.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y. K.</given-names></name>
<name><surname>Zhan</surname> <given-names>D. C.</given-names></name>
<name><surname>Ye</surname> <given-names>H. J.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Few-shot class-incremental learning via training-free prototype calibration</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>36</volume>, <fpage>15060</fpage>&#x2013;<lpage>15076</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2312.05229</pub-id>
</mixed-citation>
</ref>
<ref id="B41">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wu</surname> <given-names>E.</given-names></name>
<name><surname>Chen</surname> <given-names>Y.</given-names></name>
<name><surname>Ma</surname> <given-names>R.</given-names></name>
<name><surname>Zhao</surname> <given-names>X.</given-names></name>
</person-group> (<year>2025</year>). 
<article-title>A review of weed image identification based on deep few-shot learning</article-title>. <source>Comput. Electron. Agr.</source> <volume>237</volume>, <fpage>110675</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.110675</pub-id>
</mixed-citation>
</ref>
<ref id="B42">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xiao</surname> <given-names>Q.</given-names></name>
<name><surname>Mu</surname> <given-names>X.</given-names></name>
<name><surname>Liu</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>B.</given-names></name>
<name><surname>Liu</surname> <given-names>H.</given-names></name>
<name><surname>Zhang</surname> <given-names>B.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). 
<article-title>Plant metabolomics: a new strategy and tool for quality evaluation of Chinese medicinal materials</article-title>. <source>Chin. Med-UK</source> <volume>17</volume>, <fpage>45</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1186/s13020-022-00601-y</pub-id>, PMID: <pub-id pub-id-type="pmid">35395803</pub-id>
</mixed-citation>
</ref>
<ref id="B43">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Xiao</surname> <given-names>Y.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
<name><surname>Xiong</surname> <given-names>H.</given-names></name>
<name><surname>Xiao</surname> <given-names>F.</given-names></name>
<name><surname>Huang</surname> <given-names>R.</given-names></name>
<name><surname>Hong</surname> <given-names>L.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>A large-scale lychee image parallel classification algorithm based on spark and deep learning</article-title>. <source>Comput. Electron. Agr.</source> <volume>230</volume>, <fpage>109952</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2025.109952</pub-id>
</mixed-citation>
</ref>
<ref id="B44">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Yang</surname> <given-names>Y.</given-names></name>
<name><surname>Yuan</surname> <given-names>H.</given-names></name>
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Lin</surname> <given-names>Z.</given-names></name>
<name><surname>Torr</surname> <given-names>P.</given-names></name>
<name><surname>Tao</surname> <given-names>D.</given-names></name>
</person-group> (<year>2023</year>). &#x201c;
<article-title>Neural collapse inspired feature classifier alignment for few-shot class-incremental learning</article-title>,&#x201d; in <conf-name>International Conference on Learning Representations</conf-name>.  (ICLR), (<publisher-loc>Kigali</publisher-loc>: 
<publisher-name>ICLR</publisher-name>).
</mixed-citation>
</ref>
<ref id="B45">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zang</surname> <given-names>H.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Peng</surname> <given-names>Y.</given-names></name>
<name><surname>Han</surname> <given-names>S.</given-names></name>
<name><surname>Zhao</surname> <given-names>Q.</given-names></name>
<name><surname>Zhang</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>Automatic detection and counting of wheat seedling based on unmanned aerial vehicle images</article-title>. <source>Front. Plant Sci.</source> <volume>16</volume>, <elocation-id>1665672</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2025.1665672</pub-id>, PMID: <pub-id pub-id-type="pmid">41195154</pub-id>
</mixed-citation>
</ref>
<ref id="B46">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>M.</given-names></name>
<name><surname>Shi</surname> <given-names>Z.</given-names></name>
<name><surname>Zhang</surname> <given-names>S.</given-names></name>
<name><surname>Gao</surname> <given-names>J.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>A database on mycorrhizal traits of Chinese medicinal plants</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>, <elocation-id>840343</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2022.840343</pub-id>, PMID: <pub-id pub-id-type="pmid">35300014</pub-id>
</mixed-citation>
</ref>
<ref id="B47">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Zhou</surname> <given-names>D. W.</given-names></name>
<name><surname>Wang</surname> <given-names>F. Y.</given-names></name>
<name><surname>Ye</surname> <given-names>H. J.</given-names></name>
<name><surname>Ma</surname> <given-names>L.</given-names></name>
<name><surname>Pu</surname> <given-names>S.</given-names></name>
<name><surname>Zhan</surname> <given-names>D. C.</given-names></name>
</person-group> (<year>2022</year>). &#x201c;
<article-title>Forward compatible few-shot class-incremental learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>.  (CVPR), (<publisher-loc>New Orleans</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>9046</fpage>&#x2013;<lpage>9056</lpage>.
</mixed-citation>
</ref>
<ref id="B48">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhou</surname> <given-names>Y.</given-names></name>
<name><surname>Zhu</surname> <given-names>H.</given-names></name>
<name><surname>Xu</surname> <given-names>C.</given-names></name>
<name><surname>Zhang</surname> <given-names>R.</given-names></name>
<name><surname>Hua</surname> <given-names>G.</given-names></name>
<name><surname>Yang</surname> <given-names>W.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Class-incremental novel category discovery in remote sensing image scene classification via contrastive learning</article-title>. <source>IEEE J. Sel. Top. Appl. Earth Obs. Remote Sens.</source> <volume>17</volume>, <fpage>9214</fpage>&#x2013;<lpage>9225</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JSTARS.2024.3391512</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3152000">Sathishkumar Samiappan</ext-link>, The University of Tennessee, United States</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/276802">Panagiotis Madesis</ext-link>, University of Thessaly, Greece</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3261105">Long Chen</ext-link>, Chinese Academy of Forestry, China</p></fn>
</fn-group>
</back>
</article>