<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Mar. Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Marine Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Mar. Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-7745</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmars.2025.1729254</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Self-supervised transfer learning for few-shot classification on marine plankton images</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Zhong</surname><given-names>Xuxiang</given-names></name>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" equal-contrib="yes">
<name><surname>Lin</surname><given-names>Yingzhen</given-names></name>
<xref ref-type="author-notes" rid="fn003"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3205804/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Feng</surname><given-names>Zhenghui</given-names></name>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3248142/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Ling</surname><given-names>Feng</given-names></name>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
</contrib-group>
<aff id="aff1"><institution>School of Science, Faculty of Frontier Sciences, Harbin Institute of Technology (Shenzhen)</institution>, <city>Shenzhen</city>,&#xa0;<country country="CN">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Zhenghui Feng, <email xlink:href="mailto:fengzhenghui@hit.edu.cn">fengzhenghui@hit.edu.cn</email></corresp>
<fn fn-type="equal" id="fn003">
<label>&#x2020;</label>
<p>These authors have contributed equally to this work and share first authorship</p></fn>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-01-22">
<day>22</day>
<month>01</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1729254</elocation-id>
<history>
<date date-type="received">
<day>21</day>
<month>10</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>24</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>08</day>
<month>12</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Zhong, Lin, Feng and Ling.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Zhong, Lin, Feng and Ling</copyright-holder>
<license>
<ali:license_ref start_date="2026-01-22">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>The classification of marine plankton images is of great significance in ecological studies and environmental monitoring. In practical applications, plankton image classification faces several challenges, including sample imbalance, distinguishing between-class and within-class differences, and recognizing fine-grained features. To address these issues, we propose a few-shot self-supervised transfer learning (FSTL) framework. In FSTL, we design a new loss function that incorporates both supervised and self-supervised learning. The core of FSTL is a hybrid learning objective that integrates self-supervised contrastive learning for robust feature representation. Operating within a transfer learning paradigm, FSTL effectively adapts knowledge from head-classes to boost the few-shot classification performance on tail-classes. We applied FSTL to two datasets, in which plankton images were collected from Daya Bay and provided by the Woods Hole Oceanographic Institution (WHOI), respectively. The experimental results demonstrated that our method showed better adaptability in the classification of plankton images. The findings of this study not only apply to the classification of plankton images but also offer the potential for classifying small-sample categories within long-tailed datasets.</p>
</abstract>
<kwd-group>
<kwd>few-shot learning</kwd>
<kwd>image classification</kwd>
<kwd>long-tail distribution</kwd>
<kwd>marine plankton</kwd>
<kwd>self-supervised learning</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This work is supported by the Basic Research Fund in Shenzhen Natural Science Foundation (No. JCYJ20240813104924033), and Guangdong Basic and Applied Basic Research Foundation (No. 2023A1515010884).</funding-statement>
</funding-group>
<counts>
<fig-count count="5"/>
<table-count count="12"/>
<equation-count count="11"/>
<ref-count count="37"/>
<page-count count="13"/>
<word-count count="8256"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Ocean Observation</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Over the past three decades, <italic>in situ</italic> monitoring has generated vast amounts of data for marine plankton observation. <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref> shows images of four categories of marine plankton collected using this technique (<xref ref-type="bibr" rid="B18">Li et&#xa0;al., 2021</xref>). This surge in data has facilitated the application of machine learning and deep learning methods to the automated analysis of plankton image datasets (<xref ref-type="bibr" rid="B4">Benfield et&#xa0;al., 2007</xref>; <xref ref-type="bibr" rid="B23">MacLeod et&#xa0;al., 2010</xref>; <xref ref-type="bibr" rid="B14">Irisson et&#xa0;al., 2022</xref>). With advancements in convolutional neural networks (CNNs), deep learning methods have significantly enhanced feature extraction and image classification capabilities, thereby achieving classification tasks across various plankton image datasets (<xref ref-type="bibr" rid="B17">Lee et&#xa0;al., 2016</xref>; <xref ref-type="bibr" rid="B19">Li and Cui, 2016</xref>; <xref ref-type="bibr" rid="B26">Pedraza et&#xa0;al., 2017</xref>; <xref ref-type="bibr" rid="B5">Bochinski et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B22">Luo et&#xa0;al., 2018</xref>; <xref ref-type="bibr" rid="B21">Lumini and Nanni, 2019</xref>; <xref ref-type="bibr" rid="B13">Henrichs et&#xa0;al., 2021</xref>). However, their performance significantly degrades in real-world scenarios characterized by severe class imbalance and data scarcity. This is precisely the challenge of plankton image analysis.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>Marine plankton images of four categories from DYB dataset. <bold>(a)</bold> Ostracoda; <bold>(b)</bold> Calanoid Type B; <bold>(c)</bold> Gammarids Type A; <bold>(d)</bold> Polychaeta Type D.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1729254-g001.tif">
<alt-text content-type="machine-generated">Four microscopic images of marine organisms against a black background. (a) Shows a small, oval-shaped organism with a bright center. (b) Depicts a translucent organism with antennae. (c) Features an elongated organism with visible segmentation and red eye spots. (d) Displays a slender, segmented organism with a slightly blurred outline.</alt-text>
</graphic></fig>
<p>The automated classification of plankton species faces intrinsic challenges because of morphological convergence, where phylogenetically distinct organisms exhibit similar global shapes. As depicted in <xref ref-type="fig" rid="f1"><bold>Figures&#xa0;1a, b</bold></xref>, the specimens share elliptical silhouettes but diverge in microstructural patterns. An analogous case occurs in <xref ref-type="fig" rid="f1"><bold>Figures&#xa0;1c, d</bold></xref>, both of which are elongated in shape. This necessitates models capable of capturing fine-grained morphological signatures beyond coarse shape attributes. Furthermore, intra-class morphological plasticity driven by imaging-angle variations amplifies the challenge. Identical species may present drastically different projections under oblique versus axial views, which demands the simultaneous modeling of inter-class separability and intra-class invariance.</p>
<p>Because of natural plankton distribution patterns and limitations in monitoring technology, some categories are frequently observed and well-represented (referred to as dominant or head class), whereas others contain only limited samples (known as disadvantaged or tail class). <xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref> presents the sample sizes of different classes in the DYB dataset (<xref ref-type="bibr" rid="B18">Li et&#xa0;al., 2021</xref>), and clearly demonstrates a long-tailed distribution and data imbalance. Although CNNs possess powerful feature extraction capabilities, they are inherently biased toward head classes, leading to poor recognition of rare but ecologically important species. Standard transfer learning, which fine-tunes models pre-trained on large-scale datasets, often fails because the source and target domains are too distinct, and the model&#x2019;s bias toward head classes persists. Similarly, class re-balancing strategies (e.g., re-sampling, loss re-weighting) are ineffective at the extreme &#x201c;tail&#x201d; of the distribution. These strategies require a minimum number of tail class samples to be effective, yet the core problem here is the sheer lack of such samples&#x2014;many rare plankton categories have only a handful of instances, making re-balancing infeasible and often leading to overfitting. Motivated by these challenges, we introduce few-shot learning as a solution for this kind of data.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>The long-tailed distribution of the DYB-plankton dataset.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1729254-g002.tif">
<alt-text content-type="machine-generated">Bar chart showing the number of samples for various categories. The y-axis is logarithmic, labeled &#x201c;Number of Samples,&#x201d; ranging from ten to ten thousand. The x-axis lists different categories such as particles, bubbles, and plankton. The legend on the right shows color codes for various particle types, including filamentous Type A and B, bluish, molts, translucent flocs, yellowish flocs, yellowish rods, bubbles, and plankton.</alt-text>
</graphic></fig>
<p>Samples from tail classes are not only scarce but also often sparse, appearing infrequently and offering limited examples for model training. This sparsity obstructs effective feature extraction, which ultimately constrains the generalization capability. In this paper, the Few-shot Self-supervised Transfer Learning (FSTL) framework is designed to overcome these limitations by explicitly transferring robust feature representations from data-rich head classes to enable accurate classification of data-scarce tail classes. First, we train the model on head-class data, and subsequently fine-tune it on tail-class data, with the goal of developing a classification model that performs effectively on tail-class data. To address these issues, we propose a novel loss function for FSTL and investigate the selection of hyperparameters. The main contribution of this paper can be summarized as follows: (1) We design a novel FSTL framework to achieve few-shot classification on marine plankton images. (2) Through a carefully designed toy experiment, we further demonstrate the robustness of FSTL in handling low-quality plankton images. The results confirm that our proposed loss function enhances classification accuracy, particularly for the tail class with limited samples, highlighting the method&#x2019;s capability to address practical challenges in marine image analysis.</p>
<p>The rest of this paper is organized into four sections. Section 2 describes related work. Section 3 details the proposed FSTL framework and its new loss function. Section 4 evaluates the method by comparing it with baselines on two main datasets and a toy example. Section 5 concludes with a discussion.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related works</title>
<sec id="s2_1">
<label>2.1</label>
<title>Few-shot classification</title>
<p><xref ref-type="bibr" rid="B35">Wang et&#xa0;al. (2020)</xref> identified the core challenge of few-shot learning as the disparity between the small sample size and the complexity of the data. Two common approaches for addressing this challenge are data augmentation and model architecture redesign.</p>
<p>For data augmentation, <xref ref-type="bibr" rid="B24">Miller et&#xa0;al. (2000)</xref> leveraged similarities between existing categories to learn geometric transformations, thus enriching the small sample size dataset. <xref ref-type="bibr" rid="B16">Kwitt et&#xa0;al. (2016)</xref> developed a set of independent attribute strength regressors based on scene images with fine-grained annotations. Additionally, generative adversarial networks (GANs) (<xref ref-type="bibr" rid="B9">Gao et&#xa0;al., 2018</xref>) and autoencoders (<xref ref-type="bibr" rid="B28">Schwartz et&#xa0;al., 2018</xref>) are also commonly used in data augmentation.</p>
<p>Few-shot learning models, such as model-agnostic meta-learning (MAML) (<xref ref-type="bibr" rid="B8">Finn et&#xa0;al., 2017</xref>), first-order meta-learning (Reptile) (<xref ref-type="bibr" rid="B25">Nichol and Schulman, 2018</xref>), and Prototypical Network (<xref ref-type="bibr" rid="B30">Snell et&#xa0;al., 2017</xref>), have been widely applied and achieved promising results. <xref ref-type="bibr" rid="B32">Sun et&#xa0;al. (2019)</xref> combined meta-learning and transfer learning to achieve meta-transfer learning, and proposed a more advanced version for few-shot fine-grained classification (<xref ref-type="bibr" rid="B33">Sun et&#xa0;al., 2020</xref>). Another promising method for small-sample image classification is self-supervised learning, which can learn useful feature representations from data without relying on labels. Related methods can be divided into self-supervised generative learning and self-supervised contrastive learning (SCL) (<xref ref-type="bibr" rid="B2">Albelwi, 2022</xref>; <xref ref-type="bibr" rid="B6">Chen et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B20">Lim et&#xa0;al., 2023</xref>; <xref ref-type="bibr" rid="B31">Su et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B12">He et&#xa0;al., 2020</xref>). However, the methods mentioned above were not tailored for marine plankton images, overlooking their distinctive characteristics, particularly the fine-grained nature of the classification task.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Long-tailed learning</title>
<p>Recently, several effective long-tailed learning strategies have emerged: (<xref ref-type="bibr" rid="B15">Kang et&#xa0;al., 2020</xref>) decoupled representation and classifier learning; (<xref ref-type="bibr" rid="B29">Shi et&#xa0;al., 2023</xref>) revealed the limitations of re-sampling under contextual bias and introduced context-shift augmentation; (<xref ref-type="bibr" rid="B1">Aditya et&#xa0;al., 2021</xref>) boosted tail-class performance via logit adjustment; and (<xref ref-type="bibr" rid="B7">Cui et&#xa0;al., 2019</xref>) designed a class-balanced loss based on effective sample numbers. The aforementioned methods all operate under the premise that the training set itself has a long-tailed distribution. In contrast, in this paper, we aim to address the few-shot problem in the tail classes not by directly training on the tail data, but by training on the head classes and leveraging a transfer learning idea. While recent transfer-learning works (<xref ref-type="bibr" rid="B37">You et&#xa0;al., 2020</xref>; <xref ref-type="bibr" rid="B3">Atanov et&#xa0;al., 2022</xref>) also explore head-to-tail knowledge transfer by re-using the source classifier, they are not designed to handle fine-grained details or low-clarity images. Our method therefore differs in both objective and technical focus, extending transfer learning to the challenging scenario of plankton classification with very few samples.</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Plankton image classification</title>
<p>In addition to sample imbalance, classification methods also need to address issues such as distinguishing between-class and within-class differences, as well as recognizing fine-grained features. Additionally, because of the difficulty in obtaining plankton images, the available datasets are limited, and related research is still in its early stages. <xref ref-type="bibr" rid="B17">Lee et&#xa0;al. (2016)</xref> proposed a fine-grained plankton classification method based on CNNs. <xref ref-type="bibr" rid="B36">Wang et&#xa0;al. (2017)</xref> introduced a model based on GANs, which mitigates class imbalance in plankton image datasets by generating samples for few-shot categories. <xref ref-type="bibr" rid="B27">Schr&#xf6;der et&#xa0;al. (2018)</xref> applied weight imprinting techniques to enable neural networks to recognize the tail class without the need for retraining. <xref ref-type="bibr" rid="B33">Sun et&#xa0;al. (2020)</xref> proposed a feature fusion model that uses a focal region localization mechanism to identify perceptually similar areas between objects and extract discriminative features. <xref ref-type="bibr" rid="B11">Guo et&#xa0;al. (2023)</xref> addressed distribution differences between few-shot and non-few-shot classes, adopting a cross-domain few-shot classification approach.</p>
<p>Although existing methods have achieved certain results in few-shot plankton image classification, they have not incorporated self-supervised learning. This is particularly important for fine-grained tasks like plankton classification, which is a key motivation and contribution of our adoption of self-supervised contrastive learning in this work.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Methodology</title>
<sec id="s3_1">
<label>3.1</label>
<title>Problem formulation</title>
<p>The proposed method is in a framework of few-shot classification using transfer learning. As shown in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>, a few-shot classification task involves two datasets: the training set <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula> and test set <inline-formula>
<mml:math display="inline" id="im2"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>t</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>, where <inline-formula>
<mml:math display="inline" id="im3"><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im4"><mml:mrow><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> represent the sample and its corresponding label, respectively. When dealing with long-tailed datasets, <italic>D<sub>train</sub></italic> corresponds to the head-class with sufficient samples, whereas <italic>D<sub>test</sub></italic> corresponds to the tail-class with sparse samples. <italic>N</italic> and <italic>N<sub>t</sub></italic> represent the sample sizes of these two datasets, respectively, and <italic>N</italic> &#x226b; <italic>N<sub>t</sub></italic>. <italic>C</italic> categories are chosen from <italic>D<sub>test</sub></italic> and <italic>S</italic> samples are selected from each category to form <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>p</mml:mi><mml:mi>p</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>t</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>s</mml:mi></mml:msub></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>. Then, the remaining samples of the <italic>C</italic> chosen categories become <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>q</mml:mi><mml:mi>u</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mi>q</mml:mi></mml:msub></mml:mrow></mml:msubsup></mml:mrow></mml:math></inline-formula>. Therefore, <italic>D<sub>support</sub></italic> and <italic>D<sub>query</sub></italic> share the same label space, which is disjoint from the label space of <italic>D<sub>train</sub></italic>. Such a few-shot classification task can be referred to as a <italic>C</italic>-way <italic>S</italic>-shot task.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Framework of few-shot learning.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1729254-g003.tif">
<alt-text content-type="machine-generated">Diagram illustrating a machine learning model process. The top left shows a dataset divided into training and test sets. The model receives input from the training set and outputs to a test set, which further connects to support and query sets. Arrows indicate processes of testing, fine-tuning, and data flow between components.</alt-text>
</graphic></fig>
<p>Few-shot classification aims to learn a transferable model <italic>M</italic><sub>1</sub> on <italic>D<sub>train</sub></italic>, which can be further fine-tuned using <italic>D<sub>support</sub></italic> to obtain model <italic>M</italic><sub>2</sub>. Finally, we test the few-shot classification accuracy of model <italic>M</italic><sub>2</sub> on the query set <italic>D<sub>query</sub></italic>.</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Self-supervised learning framework of FSTL</title>
<p>FSTL improves the original loss function of SCL (<xref ref-type="bibr" rid="B20">Lim et&#xa0;al., 2023</xref>) and obtains a lightweight loss function that performs better in extracting features of plankton images at a fine-grained level and reflecting differences between classes.</p>
<p>As shown in <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref>, the core of FSTL is a self-supervised feature extractor <italic>f<sub>c</sub></italic>, which is built on ResNet-12. <italic>f<sub>c</sub></italic> can map a two-dimensional image to a latent space and use a one-dimensional vector to represent the extracted image features. In the training phase, both supervised and self-supervised learning are used to create a classification task and a generation task, respectively. Then, the classifier <italic>f<sub>&#x3b8;</sub></italic> and generator <italic>f<sub>g</sub></italic> are used to compute a loss function, whose minimization yields the optimal estimates of parameters in <italic>f<sub>c</sub></italic>, <italic>f<sub>&#x3b8;</sub></italic>, and <italic>f<sub>g</sub></italic>. In the testing phase, all parameters in <italic>f<sub>c</sub></italic> are no longer changed, and <italic>f<sub>c</sub></italic> converts samples from <italic>D<sub>support</sub></italic> and <italic>D<sub>query</sub></italic> into feature vectors. Then a logistic regressor is trained on <italic>D<sub>support</sub></italic>. Its classification accuracy on <italic>D<sub>query</sub></italic> is tested.</p>
<fig id="f4" position="float">
<label>Figure&#xa0;4</label>
<caption>
<p>The overall design of FSTL.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1729254-g004.tif">
<alt-text content-type="machine-generated">Diagram illustrating a machine learning process divided into training and testing phases. The training phase involves a training set processed through a function to create a latent space, followed by calculations for prediction probabilities and feature representations, resulting in loss calculated by a formula. The testing phase uses support and query sets with shared weights, fine-tuning with logistic regression, leading to predictions. Arrows indicate the flow of data between these elements.</alt-text>
</graphic></fig>
<sec id="s3_2_1">
<label>3.2.1</label>
<title>Classification task</title>
<p>For an image sample <italic>x</italic> in <italic>D<sub>train</sub></italic>, data augmentation transformations and rotation transformations are applied to generate three derived images from <italic>x</italic>. The original image <italic>x</italic> and its three derived images (rotated by 90&#xb0;, 180&#xb0;, and 270&#xb0;) are concatenated into a tensor x. Similarly, a class label vector y is obtained, whose four components share the same value. This process results in a new dataset <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msubsup><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>x</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>y</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>.</p>
<p>We use a single-layer fully connected network to implement the classifier <italic>f<sub>&#x3b8;</sub></italic>. The feature extractor <italic>f<sub>c</sub></italic> transforms a single image into a feature vector v first, then <italic>f<sub>&#x3b8;</sub></italic> outputs a class prediction vector <inline-formula>
<mml:math display="inline" id="im8"><mml:mtext>p</mml:mtext></mml:math></inline-formula>. The length of <inline-formula>
<mml:math display="inline" id="im9"><mml:mtext>p</mml:mtext></mml:math></inline-formula> is equal to the number of categories in the total dataset and the sum of all its elements is equal to 1. By maximizing the overall performance of <italic>f<sub>&#x3b8;</sub></italic>, we force <italic>f<sub>c</sub></italic> to focus on between-class differences, thus improving the quality of the extracted features.</p>
</sec>
<sec id="s3_2_2">
<label>3.2.2</label>
<title>Generation task</title>
<p>The generator <italic>f<sub>g</sub></italic> is composed of a lightweight CNN. Each sample <italic>x</italic> in <italic>D<sub>train</sub></italic> undergoes a data augmentation transformation <italic>T</italic>, which produces an augmented version <italic>x<sub>t</sub></italic>. The augmented image is then fed into <italic>f<sub>c</sub></italic> to form a feature vector <inline-formula>
<mml:math display="inline" id="im10"><mml:mrow><mml:mtext>v</mml:mtext><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. Using vector v and <italic>f<sub>g</sub></italic>, a generated image <inline-formula>
<mml:math display="inline" id="im11"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is produced. The goal of the generation task is to find the optimal <italic>f<sub>c</sub></italic> and <italic>f<sub>g</sub></italic> such that the difference between <italic>x<sub>t</sub></italic> and <inline-formula>
<mml:math display="inline" id="im12"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is minimized. The generation task forces <italic>f<sub>c</sub></italic> to pay more attention to pixel-level differences within the images, further improving the ability of fine-grained classification. <xref ref-type="fig" rid="f4"><bold>Figure&#xa0;4</bold></xref> shows the overall design of FSTL.</p>
</sec>
<sec id="s3_2_3">
<label>3.2.3</label>
<title>Loss function</title>
<p>During the training phase, the tensor sample <inline-formula>
<mml:math display="inline" id="im13"><mml:mrow><mml:msub><mml:mtext>x</mml:mtext><mml:mtext>i</mml:mtext></mml:msub></mml:mrow></mml:math></inline-formula> from <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is fed into <inline-formula>
<mml:math display="inline" id="im15"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> to obtain the corresponding feature tensor <inline-formula>
<mml:math display="inline" id="im16"><mml:mrow><mml:msub><mml:mtext>v</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mtext>v</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>v</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>v</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>v</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mi>T</mml:mi></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mi>c</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>, where each element <inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:msub><mml:mtext>v</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:math></inline-formula> represents the corresponding <inline-formula>
<mml:math display="inline" id="im18"><mml:mtext>v</mml:mtext></mml:math></inline-formula> from four rotations (<inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msup><mml:mn>0</mml:mn><mml:mo>&#x2218;</mml:mo></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mn>90</mml:mn></mml:mrow><mml:mo>&#x2218;</mml:mo></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mn>180</mml:mn></mml:mrow><mml:mo>&#x2218;</mml:mo></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mn>270</mml:mn></mml:mrow><mml:mo>&#x2218;</mml:mo></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>). Next, the feature tensor <inline-formula>
<mml:math display="inline" id="im20"><mml:mrow><mml:msub><mml:mtext>v</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> is input into the classifier <inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> to obtain the class label prediction tensor <inline-formula>
<mml:math display="inline" id="im22"><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mi>T</mml:mi></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mi>&#x3b8;</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>v</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>. Simultaneously, <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:msub><mml:mtext>v</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is also input into the generator <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mi>g</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula> to obtain <inline-formula>
<mml:math display="inline" id="im25"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>.</p>
<p>A loss function consisting of three parts (named CCM loss) is proposed. By minimizing this loss function, the optimal parameters of <italic>f<sub>c</sub></italic>, <italic>f<sub>&#x3b8;</sub></italic>, and <italic>f<sub>g</sub></italic> can be obtained.</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>y</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x3b3;</mml:mi><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x3b2;</mml:mi><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>L<sub>bc</sub></italic> in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref> is the cross-entropy loss function with respect to the class label y<italic><sub>i</sub></italic> and class label prediction vector <inline-formula>
<mml:math display="inline" id="im26"><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, which enhances the feature extractor <italic>f<sub>c</sub></italic>&#x2019;s ability to discriminate between classes. It is defined in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>,</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>y</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>&#x2212;</mml:mo><mml:mstyle displaystyle="true"><mml:msub><mml:mo>&#x2211;</mml:mo><mml:mi>j</mml:mi></mml:msub><mml:mrow><mml:mtext>log</mml:mtext></mml:mrow></mml:mstyle><mml:mtext>&#xa0;</mml:mtext><mml:mfrac><mml:mrow><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:msub><mml:mi>y</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:mrow><mml:mrow><mml:msub><mml:mo>&#x2211;</mml:mo><mml:mi>k</mml:mi></mml:msub><mml:msup><mml:mi>e</mml:mi><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:mfrac><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mn>2</mml:mn><mml:mo>,</mml:mo><mml:mn>3</mml:mn><mml:mo>,</mml:mo><mml:mn>4</mml:mn><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im27"><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi><mml:mo>,</mml:mo><mml:mi>k</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> represents the <inline-formula>
<mml:math display="inline" id="im28"><mml:mi>k</mml:mi></mml:math></inline-formula>th element in vector <inline-formula>
<mml:math display="inline" id="im29"><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mi>j</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>. <inline-formula>
<mml:math display="inline" id="im30"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is a cosine similarity loss function for <inline-formula>
<mml:math display="inline" id="im31"><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>, see <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>,</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mtable columnalign="left"><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mn>2</mml:mn></mml:mfrac><mml:mo stretchy="false">(</mml:mo><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mtext>cos&#xa0;</mml:mtext><mml:mo>&lt;</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub><mml:mo>&gt;</mml:mo><mml:mo>+</mml:mo><mml:mtext>&#xa0;cos&#xa0;</mml:mtext><mml:mo>&lt;</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub><mml:mo>&gt;</mml:mo><mml:mo>+</mml:mo><mml:mtext>&#xa0;cos&#xa0;</mml:mtext></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign="left"><mml:mtd columnalign="left"><mml:mrow><mml:mo>&lt;</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msub><mml:mo>&gt;</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">/</mml:mo><mml:mn>3</mml:mn><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math>
</disp-formula>
<p>where cos<italic>&lt;</italic> &#xb7;,&#xb7; <italic>&gt;</italic> denotes the cosine similarity function. <italic>L<sub>cd</sub></italic> aims to bring the feature vectors of different samples with the same label as close as possible, thereby enabling the feature extractor <italic>f<sub>c</sub></italic> to learn invariant features within a category. <italic>L<sub>ge</sub></italic> represents the mean squared error (MSE) between the augmented image <italic>x<sub>t</sub></italic> and the generated image <inline-formula>
<mml:math display="inline" id="im32"><mml:mrow><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow></mml:math></inline-formula>:</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo></mml:mrow></mml:math>
</disp-formula>
<p>where in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>, <italic>MSE</italic> is a binary function that takes tensors as input and computes the mean of the squared differences between corresponding elements of the input tensors. <italic>L<sub>ge</sub></italic> allows the loss function <italic>L</italic> in <xref ref-type="disp-formula" rid="eq1">Equation 1</xref> to focus more on pixel-level feature differences, thereby enhancing <italic>f<sub>c</sub></italic>&#x2019;s ability to recognize subtle feature variations.</p>
<p>In the CCM loss function (<xref ref-type="disp-formula" rid="eq1">Equation 1</xref>), <italic>&#x3b3;</italic> and <italic>&#x3b2;</italic> are two hyperparameters used to control the model&#x2019;s tendencies during the learning process. The model <italic>M</italic><sub>1</sub> is trained according to <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>.</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:munder><mml:mrow><mml:mtext>arg&#xa0;min</mml:mtext></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x3b8;</mml:mi><mml:mo>,</mml:mo><mml:mi>g</mml:mi></mml:mrow></mml:munder><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:msub><mml:mtext>x</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x223c;</mml:mo><mml:msub><mml:mi>D</mml:mi><mml:mrow><mml:mi>a</mml:mi><mml:mi>u</mml:mi><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>y</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x3b3;</mml:mi><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x3b2;</mml:mi><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>g</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>t</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mover accent="true"><mml:mi>x</mml:mi><mml:mo>^</mml:mo></mml:mover><mml:mi>t</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <italic>c</italic>, <italic>&#x3b8;</italic>, and <italic>g</italic> denote the trainable parameters in <italic>f<sub>c</sub></italic>, <italic>f<sub>&#x3b8;</sub></italic>, and <italic>f<sub>g</sub></italic>, respectively.</p>
</sec>
<sec id="s3_2_4">
<label>3.2.4</label>
<title>Difference between FSTL and SCL</title>
<p>The structure of SCL is similar to that of FSTL, with the main difference being that SCL does not have a generation task. There are two classification tasks in the training phase of SCL: one for category labels and the other for rotation labels; in FSTL, we do not use the rotation label.</p>
<p>The loss function of SCL also consists of three terms in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>.</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:mi>L</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mtext>y</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x3bc;</mml:mi><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>&#x3bb;</mml:mi><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>q</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>r</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where the second term is defined in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>:</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mo>&#x2225;</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x2225;</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mo>&#x2225;</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x2225;</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:msub><mml:mo>+</mml:mo><mml:msub><mml:mrow><mml:mrow><mml:mo>&#x2225;</mml:mo><mml:mrow><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mtext>p</mml:mtext><mml:mrow><mml:mi>i</mml:mi><mml:mo>,</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo>&#x2225;</mml:mo></mml:mrow></mml:mrow><mml:mn>2</mml:mn></mml:msub></mml:mrow><mml:mn>3</mml:mn></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>In the loss function of SCL (<xref ref-type="disp-formula" rid="eq6">Equation 6</xref>), the first term is the same as that of FSTL, and the last two terms are determined by the results of the two classification tasks. However, the third term in the loss of SCL is totally different from that of FSTL, where <italic>r<sub>i</sub></italic> is the rotation parameter (<inline-formula>
<mml:math display="inline" id="im33"><mml:mrow><mml:msub><mml:mi>r</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msup><mml:mn>0</mml:mn><mml:mo>&#x2218;</mml:mo></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mn>90</mml:mn></mml:mrow><mml:mo>&#x2218;</mml:mo></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mn>180</mml:mn></mml:mrow><mml:mo>&#x2218;</mml:mo></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mn>270</mml:mn></mml:mrow><mml:mo>&#x2218;</mml:mo></mml:msup></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>). Additionally, FSTL replaces the 2-norm with cosine similarity in the second term of the loss function.</p>
<p>In summary, the effectiveness of our framework stems from the following aspects. First, by incorporating the self-supervised contrastive learning, we empower the model with a more powerful feature representation capability, which is crucial for fine-grained classification. Second, we enhance this foundation by designing a novel loss function with two dedicated terms: <italic>L<sub>bc</sub></italic> sharpens the model&#x2019;s focus on between-class differences, while <italic>L<sub>ge</sub></italic> directs attention to pixel-level, within-class variations. These components work synergistically within a transfer learning paradigm, enabling knowledge acquired from the head class to be effectively adapted for analyzing the tail class, thereby achieving robust few-shot classification of plankton images.</p>
</sec>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Experiment</title>
<sec id="s4_1">
<label>4.1</label>
<title>Datasets</title>
<p>We use two datasets to conduct experiments. The DYB-plankton dataset, collected from Daya Bay in Shenzhen (<xref ref-type="bibr" rid="B18">Li et&#xa0;al., 2021</xref>), contains 93 classes, including various plankton species and non-biological particles such as bubbles and debris. After classes with fewer than 20 samples are removed, 74 classes remain. The WHOI-plankton dataset, provided by the Woods Hole Oceanographic Institution (<ext-link ext-link-type="uri" xlink:href="https://hdl.handle.net/10.1575/1912/7341">https://hdl.handle.net/10.1575/1912/7341</ext-link>), contains 53 classes after the exclusion of classes with fewer than 20 samples. Images in both datasets are normalized to a size of 84&#xd7;84 pixels.</p>
<p>To achieve the partition of datasets, a residual neural network is initially used to classify all categories in the datasets. The 10 classes with relatively lower accuracy are selected as <italic>D<sub>test</sub></italic> (these 10 classes have small sample sizes), and the remaining classes constitute <italic>D<sub>train</sub></italic>. Samples for the support set <italic>D<sub>support</sub></italic> and query set <italic>D<sub>query</sub></italic> are selected from <italic>D<sub>test</sub></italic> based on the <italic>C</italic>-way <italic>S</italic>-shot experimental setting introduced in Section 3.1. To prevent overfitting during the training phase, a validation set <italic>D<sub>val</sub></italic> is separated from <italic>D<sub>train</sub></italic> in a 1:3 ratio, as shown in <xref ref-type="fig" rid="f5"><bold>Figure&#xa0;5</bold></xref>.</p>
<fig id="f5" position="float">
<label>Figure&#xa0;5</label>
<caption>
<p>Partition of datasets.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmars-12-1729254-g005.tif">
<alt-text content-type="machine-generated">A schematic illustrating data partitioning for model training and evaluation. The left section, labeled \(D_{\text{train}}\), consists of teal blocks for training. The adjacent yellow section, labeled \(D_{\text{val}}\), represents validation data. The right section, labeled \(D_{\text{test}}\), contains red blocks divided into \(D_{\text{support}}\) and \(D_{\text{query}}\).</alt-text>
</graphic></fig>
<p>For the DYB dataset, classes containing over 500 samples are subsampled to 500 images each via random selection. All available images from smaller classes are included. Subsequently, the data are partitioned into training and testing subsets in a 4:1 ratio, and then a ResNet model is used for classification.</p>
<p>Although the overall test set accuracy reached 85.96%, 20 classes exhibited test accuracy below 50%. We randomly selected 10 of these classes as <italic>D<sub>test</sub></italic> for the subsequent few-shot experiments. Their sample sizes and classification accuracies are listed in <xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref>. We applied a similar approach to the WHOI dataset. The results are listed in <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Accuracy of each category in the test set of the DYB dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Index of category</th>
<th valign="middle" align="center">Sample size</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Index of category</th>
<th valign="middle" align="center">Sample size</th>
<th valign="middle" align="center">Accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">1</td>
<td valign="middle" align="left">80</td>
<td valign="middle" align="left">28.57%</td>
<td valign="middle" align="left">6</td>
<td valign="middle" align="left">45</td>
<td valign="middle" align="left">0%</td>
</tr>
<tr>
<td valign="middle" align="left">2</td>
<td valign="middle" align="left">76</td>
<td valign="middle" align="left">18.18%</td>
<td valign="middle" align="left">7</td>
<td valign="middle" align="left">44</td>
<td valign="middle" align="left">25%</td>
</tr>
<tr>
<td valign="middle" align="left">3</td>
<td valign="middle" align="left">72</td>
<td valign="middle" align="left">28.57%</td>
<td valign="middle" align="left">8</td>
<td valign="middle" align="left">37</td>
<td valign="middle" align="left">39.13%</td>
</tr>
<tr>
<td valign="middle" align="left">4</td>
<td valign="middle" align="left">70</td>
<td valign="middle" align="left">35.71%</td>
<td valign="middle" align="left">9</td>
<td valign="middle" align="left">28</td>
<td valign="middle" align="left">25%</td>
</tr>
<tr>
<td valign="middle" align="left">5</td>
<td valign="middle" align="left">50</td>
<td valign="middle" align="left">40%</td>
<td valign="middle" align="left">10</td>
<td valign="middle" align="left">25</td>
<td valign="middle" align="left">50%</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Accuracy of each category in the test set of the WHOI dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Index of category</th>
<th valign="middle" align="center">Sample size</th>
<th valign="middle" align="center">Accuracy</th>
<th valign="middle" align="center">Index of category</th>
<th valign="middle" align="center">Sample size</th>
<th valign="middle" align="center">Accuracy</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">1</td>
<td valign="middle" align="left">107</td>
<td valign="middle" align="left">20.00%</td>
<td valign="middle" align="left">6</td>
<td valign="middle" align="left">44</td>
<td valign="middle" align="left">0%</td>
</tr>
<tr>
<td valign="middle" align="left">2</td>
<td valign="middle" align="left">66</td>
<td valign="middle" align="left">23.53%</td>
<td valign="middle" align="left">7</td>
<td valign="middle" align="left">35</td>
<td valign="middle" align="left">25.00%</td>
</tr>
<tr>
<td valign="middle" align="left">3</td>
<td valign="middle" align="left">61</td>
<td valign="middle" align="left">0.00%</td>
<td valign="middle" align="left">8</td>
<td valign="middle" align="left">24</td>
<td valign="middle" align="left">18.18%</td>
</tr>
<tr>
<td valign="middle" align="left">4</td>
<td valign="middle" align="left">58</td>
<td valign="middle" align="left">10.53%</td>
<td valign="middle" align="left">9</td>
<td valign="middle" align="left">22</td>
<td valign="middle" align="left">0.00%</td>
</tr>
<tr>
<td valign="middle" align="left">5</td>
<td valign="middle" align="left">55</td>
<td valign="middle" align="left">0.00%</td>
<td valign="middle" align="left">10</td>
<td valign="middle" align="left">22</td>
<td valign="middle" align="left">9.09%</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Experimental settings</title>
<sec id="s4_2_1">
<label>4.2.1</label>
<title>Data augmentation</title>
<p>The following data augmentation (<italic>T</italic>) methods are used: 1) random cropping of the image with a scale ratio between 0.5 and 1.0, and then resizing it back to the original size using bilinear interpolation; 2) application of random color jitter with an 80% probability, involving random adjustments to brightness, contrast, saturation, and hue; and 3) conversion of the image to grayscale with a 20% probability.</p>
</sec>
<sec id="s4_2_2">
<label>4.2.2</label>
<title>Feature extractor</title>
<p>The feature extractor <italic>f<sub>c</sub></italic> is built using ResNet-12. This network consists of four residual blocks, each containing three convolutional layers. Each convolutional layer has 64 convolutional filters with a (3, 3) kernel. A max-pooling layer with a size of (2, 2) is added after the last convolutional layer in the first three residual blocks. Following the fourth residual block, a global average pooling layer is applied. To prevent overfitting, the DropBlock (<xref ref-type="bibr" rid="B10">Ghiasi et&#xa0;al., 2018</xref>) technique is used, with block sizes of (64, 160, 320, 640).</p>
</sec>
<sec id="s4_2_3">
<label>4.2.3</label>
<title>Classifier</title>
<p>In the training phase, <italic>f<sub>&#x3b8;</sub></italic> is a simple classifier that consists of a fully connected layer. The input consists of the feature vectors, and the output size varies depending on the number of classes in the few-shot classification task.</p>
</sec>
<sec id="s4_2_4">
<label>4.2.4</label>
<title>Generator</title>
<p>In the training phase, the generator <italic>f<sub>g</sub></italic> is a lightweight convolutional network. It includes a fully connected layer to upsample the feature vectors, followed by four convolutional layers that transform the upsampled feature vectors back to the original image size. The convolutional layers have the following sizes in sequence: (64, 32, 3, 3), (32, 16, 3, 3), (16, 3, 3, 3), and (3, 3, 5, 5).</p>
</sec>
<sec id="s4_2_5">
<label>4.2.5</label>
<title>Logistic regressor</title>
<p>In the testing phase, the logistic classifier is a single-layer neural network. The feature vectors are fed into the network and a softmax function is applied to obtain a probability score for each class. The class with the highest score is selected as the predicted label.</p>
</sec>
<sec id="s4_2_6">
<label>4.2.6</label>
<title>Optimization strategy</title>
<p>During the training phase, the feature extractor <italic>f<sub>c</sub></italic>, classifier <italic>f<sub>&#x3b8;</sub></italic>, and generator <italic>f<sub>g</sub></italic> can be considered as three sub-blocks of one neural network, which allows them to share the same optimizer. The network is optimized using the SGD optimizer. The initial learning rate is set to 0.05, with an <italic>L</italic><sub>2</sub> penalty coefficient of 5<italic>e</italic><sup>&#x2212;4</sup>, and the batch size is set to 32. The maximum number of iterations is set to 100, and the learning rate is multiplied by 0.1 at the 60th and 80th iterations.</p>
<p>During the testing phase, all parameters of <italic>f<sub>c</sub></italic> remain fixed. Initially, <italic>f<sub>c</sub></italic> converts the images in <italic>D<sub>support</sub></italic> into their corresponding feature vectors. A logistic regression classifier is then trained on these extracted features to perform multi-class classification.</p>
<p>The classifier is optimized using the L-BFGS algorithm, using an <italic>L</italic><sub>2</sub> penalty term with a maximum iteration number limit of 1000. Finally, the entire assembled model is evaluated on <italic>D<sub>query</sub></italic> to assess its generalization performance.</p>
<p>All experiments were conducted on a high-performance workstation equipped with six NVIDIA GeForce RTX 3090 GPUs (each with 24GB memory), totaling 144GB GPU memory. The training process utilized multiple GPUs in parallel. The average GPU utilization was above 60%, and the power consumption was approximately 300W per card under full load. All experiments were conducted under Python 3.8, PyTorch 2.1.2 and CUDA 12.2.</p>
</sec>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Metrics</title>
<p>In classification tasks, accuracy and precision are two commonly used metrics. They are generally applied to binary classification tasks, where samples are typically labeled as either positive or negative. Their calculation is shown in <xref ref-type="disp-formula" rid="eq8">Equations 8</xref>, <xref ref-type="disp-formula" rid="eq9">9</xref>, where TP, FP, TN, and FN denote the numbers of true positives, false positives, true negatives, and false negatives, respectively.</p>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>TN</mml:mtext></mml:mrow><mml:mrow><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>TN</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FN</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq9"><label>(9)</label>
<mml:math display="block" id="M9"><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mtext>TP</mml:mtext></mml:mrow><mml:mrow><mml:mtext>TP</mml:mtext><mml:mo>+</mml:mo><mml:mtext>FP</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:math>
</disp-formula>
<p>When the number of categories exceeds three, it is necessary to use macro-accuracy and macro-precision. For a specific category <italic>k</italic>, we label all samples that belong to that category as positive and label all samples that do not belong to it as negative. This approach enables us to calculate accuracy and precision for each category individually, which allows us to obtain the overall macro-accuracy and macro-precision. <italic>S</italic> denotes the number of categories in <italic>D<sub>query</sub></italic>, and the calculation is shown in <xref ref-type="disp-formula" rid="eq10">Equations 10</xref>, <xref ref-type="disp-formula" rid="eq11">11</xref>:</p>
<disp-formula id="eq10"><label>(10)</label>
<mml:math display="block" id="M10"><mml:mrow><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>S</mml:mi></mml:munderover><mml:mrow><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:msub><mml:mi>y</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:mstyle></mml:mrow><mml:mi>S</mml:mi></mml:mfrac><mml:mo>&#xd7;</mml:mo><mml:mn>100</mml:mn></mml:mrow></mml:math>
</disp-formula>
<disp-formula id="eq11"><label>(11)</label>
<mml:math display="block" id="M11"><mml:mrow><mml:mi>M</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>r</mml:mi><mml:mi>o</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mstyle displaystyle="true"><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>k</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>S</mml:mi></mml:munderover><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:msub><mml:mi>n</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:mstyle></mml:mrow><mml:mi>S</mml:mi></mml:mfrac><mml:mo>&#xd7;</mml:mo><mml:mn>100</mml:mn></mml:mrow></mml:math>
</disp-formula>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Experimental results</title>
<p>To thoroughly evaluate the performance of FSTL, we designed three experiments. In Experiment 1, we tested four existing few-shot classification methods on plankton images. The results demonstrated that SCL (<xref ref-type="bibr" rid="B20">Lim et&#xa0;al., 2023</xref>) achieved the best performance among them. Experiment 2 involved comparing the performance of FSTL and SCL for various hyperparameter settings, which demonstrated the effectiveness of FSTL in the context of plankton few-shot classification. Finally, Experiment 3 was a preliminary investigation of the relationship between the performance of FSTL and the quality of the images used.</p>
<sec id="s4_4_1">
<label>4.4.1</label>
<title>Experiment 1</title>
<p>We used four state-of-the-art few-shot classification models as baseline methods: SCL (<xref ref-type="bibr" rid="B20">Lim et&#xa0;al., 2023</xref>), MAML (<xref ref-type="bibr" rid="B8">Finn et&#xa0;al., 2017</xref>), prototypical network (<xref ref-type="bibr" rid="B30">Snell et&#xa0;al., 2017</xref>), and relation network (<xref ref-type="bibr" rid="B34">Sung et&#xa0;al., 2018</xref>).</p>
<p>We evaluated the classification performance of these four baselines on the DYB and WHOI datasets. The results for the DYB dataset are presented in <xref ref-type="table" rid="T3"><bold>Tables&#xa0;3</bold></xref>, <xref ref-type="table" rid="T4"><bold>4</bold></xref>. The experimental results indicate that SCL achieved the highest performance among the baselines. Therefore, we primarily compare the performance of SCL with that of FSTL.</p>
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Macro-accuracy of different methods on DYB query set.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Configuration</th>
<th valign="middle" align="center">SCL</th>
<th valign="middle" align="center">MAML</th>
<th valign="middle" align="center">Prototypical network</th>
<th valign="middle" align="center">Relation network</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">5-way 1-shot</td>
<td valign="middle" align="left">72.69</td>
<td valign="middle" align="left">52.70</td>
<td valign="middle" align="left">42.61</td>
<td valign="middle" align="left">51.80</td>
</tr>
<tr>
<td valign="middle" align="left">5-way 3-shot</td>
<td valign="middle" align="left">84.66</td>
<td valign="middle" align="left">63.43</td>
<td valign="middle" align="left">54.71</td>
<td valign="middle" align="left">63.79</td>
</tr>
<tr>
<td valign="middle" align="left">5-way 5-shot</td>
<td valign="middle" align="left">87.81</td>
<td valign="middle" align="left">67.70</td>
<td valign="middle" align="left">60.08</td>
<td valign="middle" align="left">66.58</td>
</tr>
<tr>
<td valign="middle" align="left">5-way 10-shot</td>
<td valign="middle" align="left">91.17</td>
<td valign="middle" align="left">73.83</td>
<td valign="middle" align="left">63.98</td>
<td valign="middle" align="left">69.06</td>
</tr>
<tr>
<td valign="middle" align="left">10-way 1-shot</td>
<td valign="middle" align="left">59.48</td>
<td valign="middle" align="left">35.38</td>
<td valign="middle" align="left">27.82</td>
<td valign="middle" align="left">35.66</td>
</tr>
<tr>
<td valign="middle" align="left">10-way 3-shot</td>
<td valign="middle" align="left">73.99</td>
<td valign="middle" align="left">42.85</td>
<td valign="middle" align="left">35.65</td>
<td valign="middle" align="left">46.70</td>
</tr>
<tr>
<td valign="middle" align="left">10-way 5-shot</td>
<td valign="middle" align="left">78.86</td>
<td valign="middle" align="left">48.80</td>
<td valign="middle" align="left">39.59</td>
<td valign="middle" align="left">50.60</td>
</tr>
<tr>
<td valign="middle" align="left">10-way 10-shot</td>
<td valign="middle" align="left">84.01</td>
<td valign="middle" align="left">57.60</td>
<td valign="middle" align="left">41.44</td>
<td valign="middle" align="left">53.48</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Macro-precision of different methods on DYB query set.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Configuration</th>
<th valign="middle" align="center">SCL</th>
<th valign="middle" align="center">MAML</th>
<th valign="middle" align="center">Prototypical network</th>
<th valign="middle" align="center">Relation network</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">5-way 1-shot</td>
<td valign="middle" align="left">74.95</td>
<td valign="middle" align="left">53.47</td>
<td valign="middle" align="left">45.85</td>
<td valign="middle" align="left">53.11</td>
</tr>
<tr>
<td valign="middle" align="left">5-way 3-shot</td>
<td valign="middle" align="left">86.04</td>
<td valign="middle" align="left">64.80</td>
<td valign="middle" align="left">57.38</td>
<td valign="middle" align="left">65.32</td>
</tr>
<tr>
<td valign="middle" align="left">5-way 5-shot</td>
<td valign="middle" align="left">88.86</td>
<td valign="middle" align="left">68.85</td>
<td valign="middle" align="left">61.98</td>
<td valign="middle" align="left">68.28</td>
</tr>
<tr>
<td valign="middle" align="left">5-way 10-shot</td>
<td valign="middle" align="left">91.99</td>
<td valign="middle" align="left">75.15</td>
<td valign="middle" align="left">65.03</td>
<td valign="middle" align="left">75.01</td>
</tr>
<tr>
<td valign="middle" align="left">10-way 1-shot</td>
<td valign="middle" align="left">61.97</td>
<td valign="middle" align="left">35.47</td>
<td valign="middle" align="left">30.79</td>
<td valign="middle" align="left">37.15</td>
</tr>
<tr>
<td valign="middle" align="left">10-way 3-shot</td>
<td valign="middle" align="left">76.49</td>
<td valign="middle" align="left">44.70</td>
<td valign="middle" align="left">38.41</td>
<td valign="middle" align="left">48.91</td>
</tr>
<tr>
<td valign="middle" align="left">10-way 5-shot</td>
<td valign="middle" align="left">80.51</td>
<td valign="middle" align="left">50.20</td>
<td valign="middle" align="left">41.48</td>
<td valign="middle" align="left">53.31</td>
</tr>
<tr>
<td valign="middle" align="left">10-way 10-shot</td>
<td valign="middle" align="left">85.35</td>
<td valign="middle" align="left">59.23</td>
<td valign="middle" align="left">43.01</td>
<td valign="middle" align="left">58.10</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4_4_2">
<label>4.4.2</label>
<title>Experiment 2</title>
<p>We compared the performance of FSTL and SCL for various hyperparameter settings. For each <italic>C</italic>-way <italic>S</italic>-shot experiment, we randomly sampled <italic>D<sub>support</sub></italic> from <italic>D<sub>test</sub></italic>. We repeated this process 2,000 times to calculate the average macro-accuracy and average macro-precision. The results are presented in <xref ref-type="table" rid="T5"><bold>Tables&#xa0;5</bold></xref>&#x2013;<xref ref-type="table" rid="T8"><bold>8</bold></xref>. The values in parentheses represent the half-length of the 95% confidence interval for each metric.</p>
<table-wrap id="T5" position="float">
<label>Table&#xa0;5</label>
<caption>
<p>Query set macro-accuracy on DYB dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model configuration</th>
<th valign="middle" align="center">5-way 1-shot</th>
<th valign="middle" align="center">5-way 3-shot</th>
<th valign="middle" align="center">5-way 5-shot</th>
<th valign="middle" align="center">5-way 10-shot</th>
<th valign="middle" align="center">10-way 1-shot</th>
<th valign="middle" align="center">10-way 3-shot</th>
<th valign="middle" align="center">10-way 5-shot</th>
<th valign="middle" align="center">10-way 10-shot</th>
<th valign="middle" align="center">Average</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="middle" colspan="10" align="left">FSTL</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">72.69 (0.43)</td>
<td valign="middle" align="left">84.66 (0.31)</td>
<td valign="middle" align="left">87.81 (0.26)</td>
<td valign="middle" align="left">91.17 (0.22)</td>
<td valign="middle" align="left">59.48 (0.25)</td>
<td valign="middle" align="left">73.99 (0.17)</td>
<td valign="middle" align="left">78.86 (0.15)</td>
<td valign="middle" align="left">84.01 (0.16)</td>
<td valign="middle" align="left">79.08</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic></td>
<td valign="middle" align="left">71.83 (0.43)</td>
<td valign="middle" align="left">84.68 (0.30)</td>
<td valign="middle" align="left">87.86 (0.26)</td>
<td valign="middle" align="left">90.07 (0.24)</td>
<td valign="middle" align="left">59.73 (0.24)</td>
<td valign="middle" align="left">73.93 (0.18)</td>
<td valign="middle" align="left">78.73 (0.15)</td>
<td valign="middle" align="left">84.19 (0.16)</td>
<td valign="middle" align="left">78.88</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">72.69 (0.43)</td>
<td valign="middle" align="left">84.76 (0.30)</td>
<td valign="middle" align="left">87.64 (0.26)</td>
<td valign="middle" align="left">91.34 (0.22)</td>
<td valign="middle" align="left">58.92 (0.26)</td>
<td valign="middle" align="left">73.75 (0.17)</td>
<td valign="middle" align="left">78.84 (0.15)</td>
<td valign="middle" align="left">84.17 (0.15)</td>
<td valign="middle" align="left">79.01</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + 1.5<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">71.98 (0.42)</td>
<td valign="middle" align="left">84.65 (0.30)</td>
<td valign="middle" align="left">88.06 (0.25)</td>
<td valign="middle" align="left">91.39 (0.22)</td>
<td valign="middle" align="left">59.52 (0.25)</td>
<td valign="middle" align="left">73.16 (0.18)</td>
<td valign="middle" align="left">78.61 (0.15)</td>
<td valign="middle" align="left">82.62 (0.16)</td>
<td valign="middle" align="left">78.75</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">72.73 (0.42)</td>
<td valign="middle" align="left">84.51 (0.31)</td>
<td valign="middle" align="left">87.10 (0.26)</td>
<td valign="middle" align="left">91.38 (0.22)</td>
<td valign="middle" align="left">59.12 (0.25)</td>
<td valign="middle" align="left">73.56 (0.17)</td>
<td valign="middle" align="left">78.55 (0.15)</td>
<td valign="middle" align="left">84.44 (0.15)</td>
<td valign="middle" align="left">78.92</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + 1.5<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">72.18 (0.43)</td>
<td valign="middle" align="left">83.87 (0.33)</td>
<td valign="middle" align="left">87.48 (0.25)</td>
<td valign="middle" align="left">90.81 (0.22)</td>
<td valign="middle" align="left">60.41 (0.24)</td>
<td valign="middle" align="left">73.55 (0.17)</td>
<td valign="middle" align="left">79.23 (0.15)</td>
<td valign="middle" align="left">84.20 (0.15)</td>
<td valign="middle" align="left">78.97</td>
</tr>
<tr>
<th valign="middle" colspan="10" align="left">SCL</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">72.69 (0.43)</td>
<td valign="middle" align="left">84.66 (0.31)</td>
<td valign="middle" align="left">87.81 (0.26)</td>
<td valign="middle" align="left">91.17 (0.22)</td>
<td valign="middle" align="left">59.48 (0.25)</td>
<td valign="middle" align="left">73.99 (0.17)</td>
<td valign="middle" align="left">78.86 (0.15)</td>
<td valign="middle" align="left">84.01 (0.16)</td>
<td valign="middle" align="left">79.08</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>br</sub></italic></td>
<td valign="middle" align="left">72.39 (0.43)</td>
<td valign="middle" align="left">84.20 (0.31)</td>
<td valign="middle" align="left">87.65 (0.27)</td>
<td valign="middle" align="left">90.00 (0.24)</td>
<td valign="middle" align="left">57.98 (0.24)</td>
<td valign="middle" align="left">72.57 (0.18)</td>
<td valign="middle" align="left">77.36 (0.15)</td>
<td valign="middle" align="left">83.88 (0.15)</td>
<td valign="middle" align="left">78.25</td>
</tr>
<tr>
<td valign="middle" align="left"><inline-formula>
<mml:math display="inline" id="im34"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>0.1</mml:mn><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="left">57.79 (0.53)</td>
<td valign="middle" align="left">68.27 (0.42)</td>
<td valign="middle" align="left">72.01 (0.37)</td>
<td valign="middle" align="left">75.56 (0.34)</td>
<td valign="middle" align="left">44.41 (0.28)</td>
<td valign="middle" align="left">53.82 (0.19)</td>
<td valign="middle" align="left">57.61 (0.17)</td>
<td valign="middle" align="left">64.78 (0.19)</td>
<td valign="middle" align="left">61.78</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T6" position="float">
<label>Table&#xa0;6</label>
<caption>
<p>Query set macro-precision on DYB dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model configuration</th>
<th valign="middle" align="center">5-way 1-shot</th>
<th valign="middle" align="center">5-way 3-shot</th>
<th valign="middle" align="center">5-way 5-shot</th>
<th valign="middle" align="center">5-way 10-shot</th>
<th valign="middle" align="center">10-way 1-shot</th>
<th valign="middle" align="center">10-way 3-shot</th>
<th valign="middle" align="center">10-way 5-shot</th>
<th valign="middle" align="center">10-way 10-shot</th>
<th valign="middle" align="center">Average</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="middle" colspan="10" align="left">FSTL</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">74.95 (0.45)</td>
<td valign="middle" align="left">86.04 (0.29)</td>
<td valign="middle" align="left">88.86 (0.24)</td>
<td valign="middle" align="left">91.99 (0.21)</td>
<td valign="middle" align="left">61.97 (0.30)</td>
<td valign="middle" align="left">76.49 (0.18)</td>
<td valign="middle" align="left">80.51 (0.15)</td>
<td valign="middle" align="left">85.35 (0.15)</td>
<td valign="middle" align="left">80.77</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic></td>
<td valign="middle" align="left">74.08 (0.44)</td>
<td valign="middle" align="left">86.16 (0.28)</td>
<td valign="middle" align="left">88.93 (0.24)</td>
<td valign="middle" align="left">90.96 (0.23)</td>
<td valign="middle" align="left">62.23 (0.29)</td>
<td valign="middle" align="left">75.91 (0.19)</td>
<td valign="middle" align="left">79.71 (0.15)</td>
<td valign="middle" align="left">85.51 (0.16)</td>
<td valign="middle" align="left">80.44</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">75.00 (0.44)</td>
<td valign="middle" align="left">85.84 (0.30)</td>
<td valign="middle" align="left">88.64 (0.24)</td>
<td valign="middle" align="left">92.10 (0.21)</td>
<td valign="middle" align="left">61.70 (0.29)</td>
<td valign="middle" align="left">75.92 (0.18)</td>
<td valign="middle" align="left">80.76 (0.15)</td>
<td valign="middle" align="left">85.60 (0.14)</td>
<td valign="middle" align="left">80.70</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + 1.5<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">74.35 (0.46)</td>
<td valign="middle" align="left">85.90 (0.29)</td>
<td valign="middle" align="left">89.00 (0.23)</td>
<td valign="middle" align="left">92.15 (0.20)</td>
<td valign="middle" align="left">62.27 (0.29)</td>
<td valign="middle" align="left">75.12 (0.19)</td>
<td valign="middle" align="left">80.40 (0.15)</td>
<td valign="middle" align="left">84.02 (0.16)</td>
<td valign="middle" align="left">80.40</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">74.93 (0.45)</td>
<td valign="middle" align="left">85.91 (0.29)</td>
<td valign="middle" align="left">88.19 (0.24)</td>
<td valign="middle" align="left">92.17 (0.20)</td>
<td valign="middle" align="left">61.39 (0.30)</td>
<td valign="middle" align="left">75.97 (0.18)</td>
<td valign="middle" align="left">80.29 (0.15)</td>
<td valign="middle" align="left">85.71 (0.14)</td>
<td valign="middle" align="left">80.57</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + 1.5<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">74.67 (0.45)</td>
<td valign="middle" align="left">85.31 (0.30)</td>
<td valign="middle" align="left">88.54 (0.23)</td>
<td valign="middle" align="left">91.68 (0.20)</td>
<td valign="middle" align="left">63.00 (0.29)</td>
<td valign="middle" align="left">75.77 (0.18)</td>
<td valign="middle" align="left">80.76 (0.15)</td>
<td valign="middle" align="left">85.63 (0.15)</td>
<td valign="middle" align="left">80.67</td>
</tr>
<tr>
<th valign="middle" colspan="10" align="left">SCL</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">74.95 (0.45)</td>
<td valign="middle" align="left">86.04 (0.29)</td>
<td valign="middle" align="left">88.86 (0.24)</td>
<td valign="middle" align="left">91.99 (0.21)</td>
<td valign="middle" align="left">61.97 (0.30)</td>
<td valign="middle" align="left">76.49 (0.18)</td>
<td valign="middle" align="left">80.51 (0.15)</td>
<td valign="middle" align="left">85.35 (0.15)</td>
<td valign="middle" align="left">80.77</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>br</sub></italic></td>
<td valign="middle" align="left">74.88 (0.44)</td>
<td valign="middle" align="left">85.67 (0.29)</td>
<td valign="middle" align="left">88.75 (0.25)</td>
<td valign="middle" align="left">90.92 (0.22)</td>
<td valign="middle" align="left">60.70 (0.29)</td>
<td valign="middle" align="left">74.76 (0.19)</td>
<td valign="middle" align="left">78.40 (0.15)</td>
<td valign="middle" align="left">85.15 (0.15)</td>
<td valign="middle" align="left">79.90</td>
</tr>
<tr>
<td valign="middle" align="left"><inline-formula>
<mml:math display="inline" id="im35"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>0.1</mml:mn><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="left">59.07 (0.57)</td>
<td valign="middle" align="left">69.82 (0.44)</td>
<td valign="middle" align="left">73.59 (0.38)</td>
<td valign="middle" align="left">77.19 (0.35)</td>
<td valign="middle" align="left">44.81 (0.32)</td>
<td valign="middle" align="left">54.37 (0.24)</td>
<td valign="middle" align="left">58.67 (0.20)</td>
<td valign="middle" align="left">66.79 (0.22)</td>
<td valign="middle" align="left">63.04</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T7" position="float">
<label>Table&#xa0;7</label>
<caption>
<p>Query set macro-accuracy on WHOI dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model configuration</th>
<th valign="middle" align="center">5-way 1-shot</th>
<th valign="middle" align="center">5-way 3-shot</th>
<th valign="middle" align="center">5-way 5-shot</th>
<th valign="middle" align="center">5-way 10-shot</th>
<th valign="middle" align="center">10-way 1-shot</th>
<th valign="middle" align="center">10-way 3-shot</th>
<th valign="middle" align="center">10-way 5-shot</th>
<th valign="middle" align="center">10-way 10-shot</th>
<th valign="middle" align="center">Average</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="middle" colspan="10" align="left">FSTL</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">60.89 (0.43)</td>
<td valign="middle" align="left">75.37 (0.31)</td>
<td valign="middle" align="left">79.45 (0.27)</td>
<td valign="middle" align="left">83.89 (0.25)</td>
<td valign="middle" align="left">48.30 (0.28)</td>
<td valign="middle" align="left">62.08 (0.21)</td>
<td valign="middle" align="left">68.75 (0.17)</td>
<td valign="middle" align="left">74.48 (0.18)</td>
<td valign="middle" align="left">69.15</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic></td>
<td valign="middle" align="left"><bold>61.37 (0.43)</bold></td>
<td valign="middle" align="left">75.35 (0.31)</td>
<td valign="middle" align="left">79.42 (0.28)</td>
<td valign="middle" align="left"><bold>84.15 (0.25)</bold></td>
<td valign="middle" align="left"><bold>49.01 (0.28)</bold></td>
<td valign="middle" align="left"><bold>63.63 (0.21)</bold></td>
<td valign="middle" align="left"><bold>69.92 (0.17)</bold></td>
<td valign="middle" align="left"><bold>74.52 (0.17)</bold></td>
<td valign="middle" align="left"><bold>69.67</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">60.08 (0.43)</td>
<td valign="middle" align="left">74.40 (0.33)</td>
<td valign="middle" align="left">79.22 (0.28)</td>
<td valign="middle" align="left">83.74 (0.25)</td>
<td valign="middle" align="left">47.15 (0.28)</td>
<td valign="middle" align="left">62.56 (0.21)</td>
<td valign="middle" align="left">68.61 (0.17)</td>
<td valign="middle" align="left">74.39 (0.18)</td>
<td valign="middle" align="left">68.77</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + 1.5<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">60.51 (0.43)</td>
<td valign="middle" align="left">74.92 (0.32)</td>
<td valign="middle" align="left"><bold>79.50 (0.26)</bold></td>
<td valign="middle" align="left">83.58 (0.26)</td>
<td valign="middle" align="left">48.05 (0.27)</td>
<td valign="middle" align="left"><bold>63.79 (0.21)</bold></td>
<td valign="middle" align="left">68.69 (0.17)</td>
<td valign="middle" align="left"><bold>75.05 (0.18)</bold></td>
<td valign="middle" align="left"><bold>69.26</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + 1.5<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">60.81 (0.42)</td>
<td valign="middle" align="left">74.60 (0.32)</td>
<td valign="middle" align="left"><bold>79.99 (0.27)</bold></td>
<td valign="middle" align="left">83.69 (0.25)</td>
<td valign="middle" align="left"><bold>48.81 (0.28)</bold></td>
<td valign="middle" align="left"><bold>63.78 (0.20)</bold></td>
<td valign="middle" align="left">68.66 (0.17)</td>
<td valign="middle" align="left"><bold>74.91 (0.17)</bold></td>
<td valign="middle" align="left"><bold>69.41</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + 2<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left"><bold>61.14</bold> (<bold>0.43</bold>)</td>
<td valign="middle" align="left">75.36 (0.31)</td>
<td valign="middle" align="left">79.41 (0.27)</td>
<td valign="middle" align="left">83.79 (0.26)</td>
<td valign="middle" align="left"><bold>48.66 (0.28)</bold></td>
<td valign="middle" align="left"><bold>63.65 (0.20)</bold></td>
<td valign="middle" align="left">68.70 (0.17)</td>
<td valign="middle" align="left">74.10 (0.17)</td>
<td valign="middle" align="left"><bold>69.35</bold></td>
</tr>
<tr>
<th valign="middle" colspan="10" align="left">SCL</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">60.89 (0.43)</td>
<td valign="middle" align="left">75.37 (0.31)</td>
<td valign="middle" align="left">79.45 (0.26)</td>
<td valign="middle" align="left">83.89 (0.22)</td>
<td valign="middle" align="left">48.30 (0.25)</td>
<td valign="middle" align="left">62.08 (0.17)</td>
<td valign="middle" align="left">68.75 (0.15)</td>
<td valign="middle" align="left">74.48 (0.16)</td>
<td valign="middle" align="left">69.15</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>br</sub></italic></td>
<td valign="middle" align="left">60.14 (0.43)</td>
<td valign="middle" align="left">74.20 (0.31)</td>
<td valign="middle" align="left">78.61 (0.27)</td>
<td valign="middle" align="left">83.47 (0.24)</td>
<td valign="middle" align="left">47.28 (0.24)</td>
<td valign="middle" align="left">61.61 (0.18)</td>
<td valign="middle" align="left">67.92 (0.15)</td>
<td valign="middle" align="left">73.58 (0.15)</td>
<td valign="middle" align="left">68.35</td>
</tr>
<tr>
<td valign="middle" align="left"><inline-formula>
<mml:math display="inline" id="im36"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>0.1</mml:mn><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="left">57.41 (0.53)</td>
<td valign="middle" align="left">72.04 (0.42)</td>
<td valign="middle" align="left">77.30 (0.37)</td>
<td valign="middle" align="left">82.56 (0.34)</td>
<td valign="middle" align="left">43.32 (0.28)</td>
<td valign="middle" align="left">57.25 (0.19)</td>
<td valign="middle" align="left">63.02 (0.17)</td>
<td valign="middle" align="left">70.20 (0.19)</td>
<td valign="middle" align="left">65.39</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values indicate that these experimental results are superior to the results obtained by using only the first term of the loss function.</p></fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T8" position="float">
<label>Table&#xa0;8</label>
<caption>
<p>Query set macro-precision on WHOI dataset.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center">Model configuration</th>
<th valign="middle" align="center">5-way 1-shot</th>
<th valign="middle" align="center">5-way 3-shot</th>
<th valign="middle" align="center">5-way 5-shot</th>
<th valign="middle" align="center">5-way 10-shot</th>
<th valign="middle" align="center">10-way 1-shot</th>
<th valign="middle" align="center">10-way 3-shot</th>
<th valign="middle" align="center">10-way 5-shot</th>
<th valign="middle" align="center">10-way 10-shot</th>
<th valign="middle" align="center">Average</th>
</tr>
</thead>
<tbody>
<tr>
<th valign="middle" colspan="10" align="left">FSTL</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">62.97 (0.46)</td>
<td valign="middle" align="left">77.54 (0.30)</td>
<td valign="middle" align="left">81.34 (0.27)</td>
<td valign="middle" align="left">85.57 (0.24)</td>
<td valign="middle" align="left">50.64 (0.34)</td>
<td valign="middle" align="left">64.94 (0.24)</td>
<td valign="middle" align="left">71.19 (0.18)</td>
<td valign="middle" align="left">77.05 (0.19)</td>
<td valign="middle" align="left">71.41</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic></td>
<td valign="middle" align="left"><bold>63.83 (0.49)</bold></td>
<td valign="middle" align="left">77.47 (0.31)</td>
<td valign="middle" align="left">81.21 (0.27)</td>
<td valign="middle" align="left"><bold>85.84 (0.23)</bold></td>
<td valign="middle" align="left"><bold>51.49 (0.34)</bold></td>
<td valign="middle" align="left"><bold>66.19 (0.23)</bold></td>
<td valign="middle" align="left"><bold>72.65 (0.18)</bold></td>
<td valign="middle" align="left"><bold>77.11 (0.17)</bold></td>
<td valign="middle" align="left"><bold>71.97</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">62.83 (0.48)</td>
<td valign="middle" align="left">76.43 (0.32)</td>
<td valign="middle" align="left">81.08 (0.27)</td>
<td valign="middle" align="left">85.35 (0.24)</td>
<td valign="middle" align="left">49.43 (0.34)</td>
<td valign="middle" align="left"><bold>65.63 (0.23)</bold></td>
<td valign="middle" align="left"><bold>71.42 (0.18)</bold></td>
<td valign="middle" align="left">76.81 (0.18)</td>
<td valign="middle" align="left">71.12</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + 1.5<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left"><bold>63.18 (0.48)</bold></td>
<td valign="middle" align="left">77.13 (0.32)</td>
<td valign="middle" align="left"><bold>81.38 (0.25)</bold></td>
<td valign="middle" align="left">85.29 (0.24)</td>
<td valign="middle" align="left"><bold>50.65 (0.33)</bold></td>
<td valign="middle" align="left"><bold>66.34 (0.23)</bold></td>
<td valign="middle" align="left"><bold>71.45 (0.18)</bold></td>
<td valign="middle" align="left"><bold>77.61 (0.17)</bold></td>
<td valign="middle" align="left"><bold>71.63</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + 1.5<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">62.97 (0.48)</td>
<td valign="middle" align="left">76.71 (0.32)</td>
<td valign="middle" align="left"><bold>81.77 (0.26)</bold></td>
<td valign="middle" align="left">85.45 (0.23)</td>
<td valign="middle" align="left"><bold>51.29 (0.33)</bold></td>
<td valign="middle" align="left"><bold>66.60 (0.23)</bold></td>
<td valign="middle" align="left">71.08 (0.18)</td>
<td valign="middle" align="left"><bold>77.49 (0.17)</bold></td>
<td valign="middle" align="left"><bold>71.67</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + 2<italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left"><bold>63.68 (0.49)</bold></td>
<td valign="middle" align="left">77.38 (0.32)</td>
<td valign="middle" align="left">81.11 (0.26)</td>
<td valign="middle" align="left">85.45 (0.25)</td>
<td valign="middle" align="left"><bold>50.94 (0.33)</bold></td>
<td valign="middle" align="left"><bold>66.33 (0.23)</bold></td>
<td valign="middle" align="left"><bold>71.28 (0.19)</bold></td>
<td valign="middle" align="left">76.64 (0.18)</td>
<td valign="middle" align="left"><bold>71.60</bold></td>
</tr>
<tr>
<th valign="middle" colspan="10" align="left">SCL</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">62.97 (0.46)</td>
<td valign="middle" align="left">77.54 (0.30)</td>
<td valign="middle" align="left">81.34 (0.27)</td>
<td valign="middle" align="left">85.57 (0.24)</td>
<td valign="middle" align="left">50.64 (0.34)</td>
<td valign="middle" align="left">64.94 (0.24)</td>
<td valign="middle" align="left">71.19 (0.18)</td>
<td valign="middle" align="left">77.05 (0.19)</td>
<td valign="middle" align="left">71.41</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>br</sub></italic></td>
<td valign="middle" align="left">62.66 (0.47)</td>
<td valign="middle" align="left">76.42 (0.31)</td>
<td valign="middle" align="left">80.43 (0.27)</td>
<td valign="middle" align="left">85.43 (0.22)</td>
<td valign="middle" align="left">49.50 (0.33)</td>
<td valign="middle" align="left">64.37 (0.23)</td>
<td valign="middle" align="left">70.53 (0.18)</td>
<td valign="middle" align="left">75.91 (0.18)</td>
<td valign="middle" align="left">70.66</td>
</tr>
<tr>
<td valign="middle" align="left"><inline-formula>
<mml:math display="inline" id="im37"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>0.1</mml:mn><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="left">60.08 (0.49)</td>
<td valign="middle" align="left">74.18 (0.33)</td>
<td valign="middle" align="left">78.98 (0.29)</td>
<td valign="middle" align="left">84.14 (0.25)</td>
<td valign="middle" align="left">45.52 (0.33)</td>
<td valign="middle" align="left">59.69 (0.25)</td>
<td valign="middle" align="left">65.65 (0.20)</td>
<td valign="middle" align="left">72.77 (0.20)</td>
<td valign="middle" align="left">67.63</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values indicate that these experimental results are superior to the results obtained by using only the first term of the loss function.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>As indicated in <xref ref-type="table" rid="T5"><bold>Table&#xa0;5</bold></xref>, the classification performance of SCL on the DYB dataset reached its optimal level when only the first term in the loss function was used. However, when the last two terms were added, SCL&#x2019;s performance declined significantly. This decline is likely to be caused by the high similarity among different plankton categories, which made the relatively coarse loss function ineffective at capturing fine-grained features. By contrast, after we added the last two terms to the loss function of FSTL, its performance remained stable.</p>
<p>In <xref ref-type="table" rid="T7"><bold>Table&#xa0;7</bold></xref>, FSTL demonstrates significant advantages over SCL. On the WHOI dataset, three settings of the loss function outperformed the use of <italic>L<sub>bc</sub></italic> alone; these results are highlighted in bold. Notably, the combination of <italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> achieved the best overall performance. The second-best combination was <italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + 1.5 &#x2217; <italic>L<sub>ge</sub></italic>, which significantly improved the average macro-accuracy in several few-shot experiments, including 5-way 5-shot, 10-way 1-shot, 10-way 3-shot, and 10-way 10-shot. Therefore, it is advisable to use either <italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> or <italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic> + 1.5 &#x2217; <italic>L<sub>ge</sub></italic> in practical applications. The results measured by macro-precision are entirely consistent with this conclusion. To assess the overall statistical significance, we performed the Friedman test based on the macro-accuracy of the WHOI dataset across different ways and shots, comparing FSTL and SCL. The resulting p-value was 5.519<italic>e</italic><sup>&#x2212;11</sup>. Similarly, the p-value for the macro-precision was 5.91<italic>e</italic><sup>&#x2212;11</sup>. Subsequently, pairwise Nemenyi-Wilcoxon-Wilcox tests were performed, and all corresponding p-values are reported in <xref ref-type="table" rid="T9"><bold>Tables&#xa0;9</bold></xref>, <xref ref-type="table" rid="T10"><bold>10</bold></xref>.</p>
<table-wrap id="T9" position="float">
<label>Table&#xa0;9</label>
<caption>
<p>Pairwise Nemenyi-Wilcoxon-Wilcox test on macro-accuracy on WHOI dataset (p-value).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center"/>
<th valign="middle" align="center">FSTL(<italic>L<sub>bc</sub></italic>+ <italic>L<sub>cd</sub></italic>+ 1.5<italic>L<sub>ge</sub></italic>)</th>
<th valign="middle" align="center">FSTL(<italic>L<sub>bc</sub></italic>+ <italic>L<sub>cd</sub></italic>+ 2<italic>L<sub>ge</sub></italic>)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>br</sub></italic></td>
<td valign="middle" align="left">0.0837</td>
<td valign="middle" align="left">0.1344</td>
</tr>
<tr>
<td valign="middle" align="left"><inline-formula>
<mml:math display="inline" id="im38"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>0.1</mml:mn><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="left">0.0040</td>
<td valign="middle" align="left">0.0080</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T10" position="float">
<label>Table&#xa0;10</label>
<caption>
<p>Pairwise Nemenyi-Wilcoxon-Wilcox test on macro-precision on WHOI dataset (p-value).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left"/>
<th valign="middle" align="left">FSTL(<italic>L<sub>bc</sub></italic>+ <italic>L<sub>cd</sub></italic>+ 1.5<italic>L<sub>ge</sub></italic>)</th>
<th valign="middle" align="left">FSTL(<italic>L<sub>bc</sub></italic>+ <italic>L<sub>cd</sub></italic>+ 2<italic>L<sub>ge</sub></italic>)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>br</sub></italic></td>
<td valign="middle" align="left">0.0738</td>
<td valign="middle" align="left">0.2267</td>
</tr>
<tr>
<td valign="middle" align="left"><inline-formula>
<mml:math display="inline" id="im39"><mml:mrow><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>b</mml:mi><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mn>0.1</mml:mn><mml:msubsup><mml:mi>L</mml:mi><mml:mrow><mml:mi>c</mml:mi><mml:mi>d</mml:mi></mml:mrow><mml:mo>*</mml:mo></mml:msubsup></mml:mrow></mml:math></inline-formula></td>
<td valign="middle" align="left">0.0023</td>
<td valign="middle" align="left">0.0131</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To summarize, the performance of FSTL and SCL remained similar when we considered only the initial terms of their loss functions. However, the last two terms of SCL&#x2019;s loss function did not contribute to plankton image classification and may have even worsened the results. By contrast, the last two terms of FSTL&#x2019;s loss function enhanced the performance of the transfer learning framework in few-shot plankton image classification. This improvement was particularly noticeable on the WHOI dataset.</p>
</sec>
<sec id="s4_4_3">
<label>4.4.3</label>
<title>Experiment 3</title>
<p>Improvements observed on the WHOI dataset, but not on the DYB dataset, may be attributed to differences in image quality. For high-quality images, even a basic loss function can effectively extract optimal features, whereas adding more components may dilute the model&#x2019;s focus. By contrast, for lower-quality images, a more complex loss function might help the model to concentrate on various levels of features. We calculated the average image entropy of the training sets for both datasets and found that the DYB dataset had an entropy of 2.49, whereas the WHOI dataset had an entropy of 5.01. In the context of marine plankton, lower information entropy suggests clearer images with less noise.</p>
<p>We conducted a toy experiment to further validate this hypothesis. By randomly introducing Gaussian noise, the average image entropies of DYB and WHOI increased to 5.14 and 6.64, respectively. Then, we conducted few-shot classification on these two datasets based on different loss function settings. The results are presented in <xref ref-type="table" rid="T11"><bold>Tables&#xa0;11</bold></xref>, <xref ref-type="table" rid="T12"><bold>12</bold></xref>. We observed that, on the datasets with added noise, both <italic>L<sub>cd</sub></italic> and <italic>L<sub>ge</sub></italic> improved the few-shot classification performance of FSTL; the improvement was particularly significant on the DYB dataset.</p>
<table-wrap id="T11" position="float">
<label>Table&#xa0;11</label>
<caption>
<p>Query set macro-accuracy of FSTL in the toy experiment.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" colspan="2" align="center">DYB-plankton</th>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
</tr>
<tr>
<th valign="middle" align="center">Model configuration</th>
<th valign="middle" align="center">5-way 1-shot</th>
<th valign="middle" align="center">5-way 3-shot</th>
<th valign="middle" align="center">5-way 5-shot</th>
<th valign="middle" align="center">5-way 10-shot</th>
<th valign="middle" align="center">10-way 1-shot</th>
<th valign="middle" align="center">10-way 3-shot</th>
<th valign="middle" align="center">10-way 5-shot</th>
<th valign="middle" align="center">10-way 10-shot</th>
<th valign="middle" align="center">Average</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">69.47 (0.43)</td>
<td valign="middle" align="left">80.93 (0.34)</td>
<td valign="middle" align="left">85.01 (0.30)</td>
<td valign="middle" align="left">87.95 (0.28)</td>
<td valign="middle" align="left">55.67 (0.25)</td>
<td valign="middle" align="left">67.98 (0.18)</td>
<td valign="middle" align="left">72.68 (0.15)</td>
<td valign="middle" align="left">77.35 (0.17)</td>
<td valign="middle" align="left">74.63</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic></td>
<td valign="middle" align="left">69.42 (0.44)</td>
<td valign="middle" align="left">80.80 (0.35)</td>
<td valign="middle" align="left">84.22 (0.30)</td>
<td valign="middle" align="left"><bold>88.34 (0.28)</bold></td>
<td valign="middle" align="left">55.25 (0.25)</td>
<td valign="middle" align="left">67.90 (0.17)</td>
<td valign="middle" align="left"><bold>72.96 (0.16)</bold></td>
<td valign="middle" align="left"><bold>78.34 (0.17)</bold></td>
<td valign="middle" align="left"><bold>74.65</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left"><bold>70.18 (0.44)</bold></td>
<td valign="middle" align="left">80.74 (0.35)</td>
<td valign="middle" align="left">84.31 (0.31)</td>
<td valign="middle" align="left">87.56 (0.29)</td>
<td valign="middle" align="left">54.89 (0.25)</td>
<td valign="middle" align="left">67.71 (0.18)</td>
<td valign="middle" align="left"><bold>72.81 (0.16)</bold></td>
<td valign="middle" align="left"><bold>78.68 (0.17)</bold></td>
<td valign="middle" align="left">74.61</td>
</tr>
<tr>
<th valign="middle" colspan="10" align="center">WHOI</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">57.60 (0.43)</td>
<td valign="middle" align="left">71.12 (0.36)</td>
<td valign="middle" align="left">76.63 (0.30)</td>
<td valign="middle" align="left">80.84 (0.29)</td>
<td valign="middle" align="left">44.74 (0.27)</td>
<td valign="middle" align="left">57.78 (0.20)</td>
<td valign="middle" align="left">63.59 (0.18)</td>
<td valign="middle" align="left">69.89 (0.18)</td>
<td valign="middle" align="left">65.27</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic></td>
<td valign="middle" align="left"><bold>57.97 (0.44)</bold></td>
<td valign="middle" align="left"><bold>72.26 (0.35)</bold></td>
<td valign="middle" align="left">76.12 (0.31)</td>
<td valign="middle" align="left"><bold>81.77 (0.28)</bold></td>
<td valign="middle" align="left"><bold>45.26 (0.27)</bold></td>
<td valign="middle" align="left"><bold>59.68 (0.20)</bold></td>
<td valign="middle" align="left"><bold>64.10 (0.17)</bold></td>
<td valign="middle" align="left"><bold>71.42 (0.17)</bold></td>
<td valign="middle" align="left"><bold>66.07</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">57.44 (0.42)</td>
<td valign="middle" align="left">70.91 (0.35)</td>
<td valign="middle" align="left">75.88 (0.29)</td>
<td valign="middle" align="left"><bold>81.05 (0.29)</bold></td>
<td valign="middle" align="left">44.27 (0.27)</td>
<td valign="middle" align="left"><bold>58.72 (0.20)</bold></td>
<td valign="middle" align="left">63.33 (0.17)</td>
<td valign="middle" align="left">69.09 (0.18)</td>
<td valign="middle" align="left">65.09</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values indicate that these experimental results are superior to the results obtained by using only the first term of the loss function.</p></fn>
</table-wrap-foot>
</table-wrap>
<table-wrap id="T12" position="float">
<label>Table&#xa0;12</label>
<caption>
<p>Query set macro-precision of FSTL in the toy experiment.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" colspan="2" align="center">DYB-plankton</th>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
<th valign="middle" align="center"/>
</tr>
<tr>
<th valign="middle" align="center">Model configuration</th>
<th valign="middle" align="center">5-way 1-shot</th>
<th valign="middle" align="center">5-way 3-shot</th>
<th valign="middle" align="center">5-way 5-shot</th>
<th valign="middle" align="center">5-way 10-shot</th>
<th valign="middle" align="center">10-way 1-shot</th>
<th valign="middle" align="center">10-way 3-shot</th>
<th valign="middle" align="center">10-way 5-shot</th>
<th valign="middle" align="center">10-way 10-shot</th>
<th valign="middle" align="center">Average</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">71.88 (0.47)</td>
<td valign="middle" align="left">82.53 (0.33)</td>
<td valign="middle" align="left">86.13 (0.29)</td>
<td valign="middle" align="left">88.92 (0.27)</td>
<td valign="middle" align="left">57.73 (0.29)</td>
<td valign="middle" align="left">69.92 (0.19)</td>
<td valign="middle" align="left">74.20 (0.18)</td>
<td valign="middle" align="left">78.69 (0.18)</td>
<td valign="middle" align="left">76.25</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic></td>
<td valign="middle" align="left">71.70 (0.47)</td>
<td valign="middle" align="left">82.43 (0.35)</td>
<td valign="middle" align="left">85.45 (0.29)</td>
<td valign="middle" align="left"><bold>89.34 (0.27)</bold></td>
<td valign="middle" align="left">57.31 (0.30)</td>
<td valign="middle" align="left"><bold>69.96 (0.20)</bold></td>
<td valign="middle" align="left"><bold>74.73 (0.18)</bold></td>
<td valign="middle" align="left"><bold>79.73 (0.19)</bold></td>
<td valign="middle" align="left"><bold>76.33</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left"><bold>72.50 (0.46)</bold></td>
<td valign="middle" align="left">82.18 (0.35)</td>
<td valign="middle" align="left">85.48 (0.31)</td>
<td valign="middle" align="left">88.56 (0.28)</td>
<td valign="middle" align="left">57.02 (0.29)</td>
<td valign="middle" align="left"><bold>70.05 (0.21)</bold></td>
<td valign="middle" align="left"><bold>74.55 (0.18)</bold></td>
<td valign="middle" align="left"><bold>80.17 (0.17)</bold></td>
<td valign="middle" align="left"><bold>76.31</bold></td>
</tr>
<tr>
<th valign="middle" colspan="10" align="center">WHOI</th>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic></td>
<td valign="middle" align="left">60.13 (0.47)</td>
<td valign="middle" align="left">73.16 (0.37)</td>
<td valign="middle" align="left">78.36 (0.31)</td>
<td valign="middle" align="left">82.74 (0.28)</td>
<td valign="middle" align="left">47.17 (0.31)</td>
<td valign="middle" align="left">59.98 (0.24)</td>
<td valign="middle" align="left">66.14 (0.20)</td>
<td valign="middle" align="left">72.48 (0.19)</td>
<td valign="middle" align="left">67.52</td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>cd</sub></italic></td>
<td valign="middle" align="left"><bold>60.65 (0.47)</bold></td>
<td valign="middle" align="left"><bold>74.44 (0.36)</bold></td>
<td valign="middle" align="left">77.82 (0.32)</td>
<td valign="middle" align="left"><bold>83.55 (0.27)</bold></td>
<td valign="middle" align="left"><bold>47.48 (0.32)</bold></td>
<td valign="middle" align="left"><bold>62.06 (0.24)</bold></td>
<td valign="middle" align="left"><bold>66.66 (0.20)</bold></td>
<td valign="middle" align="left"><bold>74.08 (0.18)</bold></td>
<td valign="middle" align="left"><bold>68.34</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><italic>L<sub>bc</sub></italic> + <italic>L<sub>ge</sub></italic></td>
<td valign="middle" align="left">59.76 (0.49)</td>
<td valign="middle" align="left">73.00 (0.35)</td>
<td valign="middle" align="left">77.70 (0.30)</td>
<td valign="middle" align="left"><bold>82.80 (0.28)</bold></td>
<td valign="middle" align="left">46.43 (0.32)</td>
<td valign="middle" align="left"><bold>61.13 (0.24)</bold></td>
<td valign="middle" align="left">66.04 (0.20)</td>
<td valign="middle" align="left">71.77 (0.20)</td>
<td valign="middle" align="left">67.33</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>The bold values indicate that these experimental results are superior to the results obtained by using only the first term of the loss function.</p></fn>
</table-wrap-foot>
</table-wrap>
<p>Therefore, we infer the following conclusion: For high-quality plankton images, a simple loss function can achieve satisfactory classification performance, whereas the inclusion of additional components in the loss function may detract from the model&#x2019;s learning focus, leading to overfitting. Conversely, for lower-quality images, more complex loss functions enable the model to concentrate on different layers of features, thereby enhancing classification performance.</p>
</sec>
</sec>
</sec>
<sec id="s5" sec-type="discussion">
<label>5</label>
<title>Discussion</title>
<p>To address the classification of tail-class samples in marine plankton images, we proposed a novel few-shot classification method, FSTL, based on self-supervised transfer learning. Compared with previous methods, we added a self-supervised generation task in FSTL and modified its loss function to better capture between-class and within-class differences, while also achieving a better fine-grained feature extraction capability. Compared with other few-shot classification methods, FSTL demonstrated clear advantages in classifying the tail data of plankton images. FSTL also differs from long-tailed research, such as the logit-adjusted focal loss, which is designed for a long-tailed learning scenario where all classes (head and tail) are present in the training set. Our FSTL framework, in contrast, operates under a few-shot transfer paradigm: the training set is composed solely of head classes, while the tail classes are exclusively reserved for the testing phase.</p>
<p>Additionally, we analyzed the impact of image quality on classification performance. For plankton images with good quality, a simple loss function was sufficient to effectively extract image features. For images with lower quality, we recommend a more complex loss function. The last two components of FSTL&#x2019;s CCM loss function had a minimal effect on the DYB dataset, whereas we observed improvements on the WHOI dataset. In practice, it is difficult to know the image quality in advance or to ensure optimal conditions. Our proposed method, which incorporates a three-term loss, offers broad practical applicability, as it is capable of adapting to image data of varying quality, achieving a robust balance between generalization and stability.</p>
</sec>
</body>
<back>
<sec id="s6" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s7" sec-type="author-contributions">
<title>Author contributions</title>
<p>XZ: Methodology, Software, Writing &#x2013; original draft, Writing &#x2013; review &amp; editing. YL: Data curation, Formal Analysis, Investigation, Methodology, Validation, Writing &#x2013; review &amp; editing. ZF: Conceptualization, Data curation, Funding acquisition, Investigation, Methodology, Project administration, Resources, Validation, Writing &#x2013; review &amp; editing. FL: Software, Methodology, Writing &#x2013; original draft.</p></sec>
<ack>
<title>Acknowledgments</title>
<p>We acknowledge the support from our laboratory members for their insightful discussions.</p>
</ack>
<sec id="s9" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s10" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript to improve the language, grammar, and clarity of certain sentences. The AI tool (DeepSeek) was used solely for this purpose, and the authors have thoroughly reviewed and edited all AI-generated content. The scientific content, ideas, and conclusions of the work remain the sole responsibility of the authors.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s11" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Aditya</surname> <given-names>M.</given-names></name>
<name><surname>Krishna Sadeep</surname> <given-names>J.</given-names></name>
<name><surname>Ankit</surname> <given-names>R.</given-names></name>
<name><surname>Singh Himanshu</surname> <given-names>J.</given-names></name>
<name><surname>Andreas</surname> <given-names>V.</given-names></name>
<name><surname>Sanjiv</surname> <given-names>K.</given-names></name>
</person-group> (<year>2021</year>). &#x201c;
<article-title>Long-tail learning via logit adjustment</article-title>,&#x201d; in <conf-name>Ninth international conference on learning representations (ICLR)</conf-name>, (<conf-loc>Virtual Event, Austria</conf-loc>: 
<publisher-name>OpenReview.net</publisher-name>).
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Albelwi</surname> <given-names>S.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Survey on self-supervised learning: auxiliary pretext tasks and contrastive learning methods in imaging</article-title>. <source>Entropy</source> <volume>24</volume>, <fpage>551</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/e24040551</pub-id>, PMID: <pub-id pub-id-type="pmid">35455214</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Atanov</surname> <given-names>A.</given-names></name>
<name><surname>Xu</surname> <given-names>S.</given-names></name>
<name><surname>Beker</surname> <given-names>O.</given-names></name>
<name><surname>Filatov</surname> <given-names>A.</given-names></name>
<name><surname>Zamir</surname> <given-names>A.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Simple control baselines for evaluating transfer learning</article-title>. <source>arXiv preprint arXiv:2202.03365</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2202.03365</pub-id>
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Benfield</surname> <given-names>M. C.</given-names></name>
<name><surname>Grosjean</surname> <given-names>P.</given-names></name>
<name><surname>Culverhouse</surname> <given-names>P. F.</given-names></name>
<name><surname>Irigoien</surname> <given-names>X.</given-names></name>
<name><surname>Sieracki</surname> <given-names>M. E.</given-names></name>
<name><surname>Lopez-Urrutia</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2007</year>). 
<article-title>Rapid: research on automated plankton identification</article-title>. <source>Oceanography</source> <volume>20</volume>, <fpage>172</fpage>&#x2013;<lpage>187</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.5670/oceanog.2007.63</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Bochinski</surname> <given-names>E.</given-names></name>
<name><surname>Bacha</surname> <given-names>G.</given-names></name>
<name><surname>Eiselein</surname> <given-names>V.</given-names></name>
<name><surname>Walles</surname> <given-names>T. J.</given-names></name>
<name><surname>Nejstgaard</surname> <given-names>J. C.</given-names></name>
<name><surname>Sikora</surname> <given-names>T.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>Deep active learning for <italic>in situ</italic> plankton classification</article-title>,&#x201d; in <source>Pattern recognition and information forensics: ICPR 2018 international workshops, CVAUI, IWCF, and MIPPSNA, Beijing, China, August 20-24, 2018, <italic>revised selected papers</italic> 24</source> (<publisher-loc>Cham, Switzerland</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <fpage>5</fpage>&#x2013;<lpage>15</lpage>.
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Chen</surname> <given-names>T.</given-names></name>
<name><surname>Kornblith</surname> <given-names>S.</given-names></name>
<name><surname>Norouzi</surname> <given-names>M.</given-names></name>
<name><surname>Hinton</surname> <given-names>G.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>A simple framework for contrastive learning of visual representations</article-title>,&#x201d; in <source>International conference on machine learning</source> (
<publisher-name>PMLR</publisher-name>), <fpage>1597</fpage>&#x2013;<lpage>1607</lpage>.
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Cui</surname> <given-names>Y.</given-names></name>
<name><surname>Jia</surname> <given-names>M.</given-names></name>
<name><surname>Lin</surname> <given-names>T.-Y.</given-names></name>
<name><surname>Song</surname> <given-names>Y.</given-names></name>
<name><surname>Belongie</surname> <given-names>S.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Class-balanced loss based on effective number of samples</article-title>. <source>CVPR</source><fpage>9268</fpage>&#x2013;<lpage>9277</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/CVPR.2019.00949</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Finn</surname> <given-names>C.</given-names></name>
<name><surname>Abbeel</surname> <given-names>P.</given-names></name>
<name><surname>Levine</surname> <given-names>S.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Model-agnostic meta-learning for fast adaptation of deep networks</article-title>,&#x201d; in <conf-name>34th International Conference on Machine Learning (ICML 2017)</conf-name>, (<conf-loc>Sydney, NSW, Australia</conf-loc>:  
<publisher-name>International conference on machine learning (PMLR)</publisher-name>), <fpage>1126</fpage>&#x2013;<lpage>1135</lpage>.
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gao</surname> <given-names>H.</given-names></name>
<name><surname>Shou</surname> <given-names>Z.</given-names></name>
<name><surname>Zareian</surname> <given-names>A.</given-names></name>
<name><surname>Zhang</surname> <given-names>H.</given-names></name>
<name><surname>Chang</surname> <given-names>S.-F.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Low-shot learning via covariance-preserving adversarial augmentation networks</article-title>,&#x201d; in <source>Proceedings of the 32nd International Conference on Neural Information Processing Systems,</source><fpage>983</fpage>&#x2013;<lpage>993</lpage>.
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ghiasi</surname> <given-names>G.</given-names></name>
<name><surname>Lin</surname> <given-names>T.-Y.</given-names></name>
<name><surname>Le</surname> <given-names>Q. V.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>DropBlock: a regularization method for convolutional networks</article-title>,&#x201d; in <source>Proceedings of the 32nd International Conference on Neural Information Processing Systems</source>, <fpage>10750</fpage>&#x2013;<lpage>10760</lpage>.
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Guo</surname> <given-names>J.</given-names></name>
<name><surname>Li</surname> <given-names>W.</given-names></name>
<name><surname>Guan</surname> <given-names>J.</given-names></name>
<name><surname>Gao</surname> <given-names>H.</given-names></name>
<name><surname>Liu</surname> <given-names>B.</given-names></name>
<name><surname>Gong</surname> <given-names>L.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Cdfm: A cross-domain few-shot model for marine plankton classification</article-title>. <source>IET Comput. Vision</source> <volume>17</volume>, <fpage>111</fpage>&#x2013;<lpage>121</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1049/cvi2.12137</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>He</surname> <given-names>K.</given-names></name>
<name><surname>Fan</surname> <given-names>H.</given-names></name>
<name><surname>Wu</surname> <given-names>Y.</given-names></name>
<name><surname>Xie</surname> <given-names>S.</given-names></name>
<name><surname>Girshick</surname> <given-names>R.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>Momentum contrast for unsupervised visual representation learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, (<publisher-loc>Seattle, WA, USA</publisher-loc>: 
<publisher-name>Institute of Electrical and Electronics Engineers (IEEE)</publisher-name>), <fpage>9729</fpage>&#x2013;<lpage>9738</lpage>.
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Henrichs</surname> <given-names>D. W.</given-names></name>
<name><surname>Angl&#xe8;s</surname> <given-names>S.</given-names></name>
<name><surname>Gaonkar</surname> <given-names>C. C.</given-names></name>
<name><surname>Campbell</surname> <given-names>L.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Application of a convolutional neural network to improve automated early warning of harmful algal blooms</article-title>. <source>Environ. Sci. pollut. Res.</source> <volume>28</volume>, <fpage>28544</fpage>&#x2013;<lpage>28555</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s11356-021-12471-2</pub-id>, PMID: <pub-id pub-id-type="pmid">33547607</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Irisson</surname> <given-names>J.-O.</given-names></name>
<name><surname>Ayata</surname> <given-names>S.-D.</given-names></name>
<name><surname>Lindsay</surname> <given-names>D. J.</given-names></name>
<name><surname>Karp-Boss</surname> <given-names>L.</given-names></name>
<name><surname>Stemmann</surname> <given-names>L.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Machine learning for the study of plankton and marine snow from images</article-title>. <source>Annu. Rev. Mar. Sci.</source> <volume>14</volume>, <fpage>277</fpage>&#x2013;<lpage>301</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1146/annurev-marine-041921-013023</pub-id>, PMID: <pub-id pub-id-type="pmid">34460314</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Kang</surname> <given-names>B.</given-names></name>
<name><surname>Xie</surname> <given-names>S.</given-names></name>
<name><surname>Rohrbach</surname> <given-names>M.</given-names></name>
<name><surname>Yan</surname> <given-names>Z.</given-names></name>
<name><surname>Gordo</surname> <given-names>A.</given-names></name>
<name><surname>Feng</surname> <given-names>J.</given-names></name>
<etal/>
</person-group>. (<year>2020</year>). &#x201c;
<article-title>Decoupling representation and classifier for long-tailed recognition</article-title>,&#x201d; in <conf-name>Eighth international conference on learning representations (ICLR)</conf-name>, (<conf-loc>Addis Ababa, Ethiopia</conf-loc>: 
<publisher-name>OpenReview.net</publisher-name>).
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Kwitt</surname> <given-names>R.</given-names></name>
<name><surname>Hegenbart</surname> <given-names>S.</given-names></name>
<name><surname>Niethammer</surname> <given-names>M.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>One-shot learning of scene locations via feature trajectory transfer</article-title>,&#x201d; in <source>Proceedings of The IEEE conference on computer vision and pattern recognition</source>. (<publisher-loc>Las Vegas, NV, USA</publisher-loc>: 
<publisher-name>IEEE Computer Society</publisher-name>), <fpage>78</fpage>&#x2013;<lpage>86</lpage>.
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Lee</surname> <given-names>H.</given-names></name>
<name><surname>Park</surname> <given-names>M.</given-names></name>
<name><surname>Kim</surname> <given-names>J.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>Plankton classification on imbalanced large scale database via convolutional neural networks with transfer learning</article-title>,&#x201d; in <conf-name>2016 IEEE international conference on image processing (ICIP)</conf-name>, (<conf-loc>Phoenix, AZ, USA</conf-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>3713</fpage>&#x2013;<lpage>3717</lpage>.
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>J.</given-names></name>
<name><surname>Chen</surname> <given-names>T.</given-names></name>
<name><surname>Yang</surname> <given-names>Z.</given-names></name>
<name><surname>Chen</surname> <given-names>L.</given-names></name>
<name><surname>Liu</surname> <given-names>P.</given-names></name>
<name><surname>Zhang</surname> <given-names>Y.</given-names></name>
<etal/>
</person-group>. (<year>2021</year>). 
<article-title>Development of a buoy-borne underwater imaging system for <italic>in situ</italic> mesoplankton monitoring of coastal waters</article-title>. <source>IEEE J. Oceanic Eng.</source> <volume>47</volume>, <fpage>88</fpage>&#x2013;<lpage>110</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/JOE.2021.3106122</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>X.</given-names></name>
<name><surname>Cui</surname> <given-names>Z.</given-names></name>
</person-group> (<year>2016</year>). &#x201c;
<article-title>Deep residual networks for plankton classification</article-title>,&#x201d; in <source>OCEANS 2016 MTS/IEEE monterey</source>. (<publisher-loc>Monterey, California, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>4</lpage>.
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lim</surname> <given-names>J. Y.</given-names></name>
<name><surname>Lim</surname> <given-names>K. M.</given-names></name>
<name><surname>Lee</surname> <given-names>C. P.</given-names></name>
<name><surname>Tan</surname> <given-names>Y. X.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>Scl: Self-supervised contrastive learning for few-shot image classification</article-title>. <source>Neural Networks</source> <volume>165</volume>, <fpage>19</fpage>&#x2013;<lpage>30</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.neunet.2023.05.037</pub-id>, PMID: <pub-id pub-id-type="pmid">37263089</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Lumini</surname> <given-names>A.</given-names></name>
<name><surname>Nanni</surname> <given-names>L.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>Deep learning and transfer learning features for plankton classification</article-title>. <source>Ecol. Inf.</source> <volume>51</volume>, <fpage>33</fpage>&#x2013;<lpage>43</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2019.02.007</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Luo</surname> <given-names>J. Y.</given-names></name>
<name><surname>Irisson</surname> <given-names>J.-O.</given-names></name>
<name><surname>Graham</surname> <given-names>B.</given-names></name>
<name><surname>Guigand</surname> <given-names>C.</given-names></name>
<name><surname>Sarafraz</surname> <given-names>A.</given-names></name>
<name><surname>Mader</surname> <given-names>C.</given-names></name>
<etal/>
</person-group>. (<year>2018</year>). 
<article-title>Automated plankton image analysis using convolutional neural networks</article-title>. <source>Limnology Oceanography: Methods</source> <volume>16</volume>, <fpage>814</fpage>&#x2013;<lpage>827</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1002/lom3.10285</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>MacLeod</surname> <given-names>N.</given-names></name>
<name><surname>Benfield</surname> <given-names>M.</given-names></name>
<name><surname>Culverhouse</surname> <given-names>P.</given-names></name>
</person-group> (<year>2010</year>). 
<article-title>Time to automate identification</article-title>. <source>Nature</source> <volume>467</volume>, <fpage>154</fpage>&#x2013;<lpage>155</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1038/467154a</pub-id>, PMID: <pub-id pub-id-type="pmid">20829777</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Miller</surname> <given-names>E. G.</given-names></name>
<name><surname>Matsakis</surname> <given-names>N. E.</given-names></name>
<name><surname>Viola</surname> <given-names>P. A.</given-names></name>
</person-group> (<year>2000</year>). &#x201c;
<article-title>Learning from one example through shared densities on transforms</article-title>,&#x201d; in <source>Proceedings IEEE conference on computer vision and pattern recognition. CVPR 2000 (Cat. No. PR00662)</source>, vol. <volume>1</volume>. (<publisher-loc>Hilton Head, SC, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>464</fpage>&#x2013;<lpage>471</lpage>.
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Nichol</surname> <given-names>A.</given-names></name>
<name><surname>Schulman</surname> <given-names>J.</given-names></name>
</person-group> (<year>2018</year>). 
<article-title>Reptile: a scalable metalearning algorithm</article-title>. <source>arXiv preprint arXiv:1803.02999</source> <volume>2</volume>, <fpage>4</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1803.02999</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Pedraza</surname> <given-names>A.</given-names></name>
<name><surname>Bueno</surname> <given-names>G.</given-names></name>
<name><surname>Deniz</surname> <given-names>O.</given-names></name>
<name><surname>Crist&#xf3;bal</surname> <given-names>G.</given-names></name>
<name><surname>Blanco</surname> <given-names>S.</given-names></name>
<name><surname>Borrego-Ramos</surname> <given-names>M.</given-names></name>
</person-group> (<year>2017</year>). 
<article-title>Automated diatom classification (part b): a deep learning approach</article-title>. <source>Appl. Sci.</source> <volume>7</volume>, <fpage>460</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/app7050460</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Schr&#xf6;der</surname> <given-names>S.-M.</given-names></name>
<name><surname>Kiko</surname> <given-names>R.</given-names></name>
<name><surname>Irisson</surname> <given-names>J.-O.</given-names></name>
<name><surname>Koch</surname> <given-names>R.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Low-shot learning of plankton categories</article-title>,&#x201d; in <source>German conference on pattern recognition</source> (<publisher-loc>Cham, Switzerland</publisher-loc>: 
<publisher-name>Springer</publisher-name>), <fpage>391</fpage>&#x2013;<lpage>404</lpage>.
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Schwartz</surname> <given-names>E.</given-names></name>
<name><surname>Karlinsky</surname> <given-names>L.</given-names></name>
<name><surname>Shtok</surname> <given-names>J.</given-names></name>
<name><surname>Harary</surname> <given-names>S.</given-names></name>
<name><surname>Marder</surname> <given-names>M.</given-names></name>
<name><surname>Kumar</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2018</year>). 
<article-title>Delta-encoder: an effective sample synthesis method for few-shot object recognition</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>31</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.1806.04734</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Shi</surname> <given-names>J.-X.</given-names></name>
<name><surname>Wei</surname> <given-names>T.</given-names></name>
<name><surname>Xiang</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>Y.-F.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>How re-sampling helps for long-tail learning?</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>36</volume>, <fpage>75669</fpage>&#x2013;<lpage>75687</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.48550/arXiv.2310.18236</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Snell</surname> <given-names>J.</given-names></name>
<name><surname>Swersky</surname> <given-names>K.</given-names></name>
<name><surname>Zemel</surname> <given-names>R.</given-names></name>
</person-group> (<year>2017</year>). 
<article-title>Prototypical networks for few-shot learning</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>30</volume>.
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Su</surname> <given-names>J.-C.</given-names></name>
<name><surname>Maji</surname> <given-names>S.</given-names></name>
<name><surname>Hariharan</surname> <given-names>B.</given-names></name>
</person-group> (<year>2020</year>). &#x201c;
<article-title>When does self-supervision improve few-shot learning</article-title>?,&#x201d; in <source><italic>European conference on computer vision</italic></source> (
<publisher-name>Springer</publisher-name>), <fpage>645</fpage>&#x2013;<lpage>666</lpage>.
</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Sun</surname> <given-names>Q.</given-names></name>
<name><surname>Liu</surname> <given-names>Y.</given-names></name>
<name><surname>Chua</surname> <given-names>T.-S.</given-names></name>
<name><surname>Schiele</surname> <given-names>B.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>Meta-transfer learning for few-shot learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</conf-name>, (<conf-loc>Long Beach, California, USA</conf-loc>: 
<publisher-name>IEEE Computer Society</publisher-name>), <fpage>403</fpage>&#x2013;<lpage>412</lpage>.
</mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Sun</surname> <given-names>X.</given-names></name>
<name><surname>Xv</surname> <given-names>H.</given-names></name>
<name><surname>Dong</surname> <given-names>J.</given-names></name>
<name><surname>Zhou</surname> <given-names>H.</given-names></name>
<name><surname>Chen</surname> <given-names>C.</given-names></name>
<name><surname>Li</surname> <given-names>Q.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Few-shot learning for domain-specific fine-grained image classification</article-title>. <source>IEEE Trans. Ind. Electron.</source> <volume>68</volume>, <fpage>3588</fpage>&#x2013;<lpage>3598</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TIE.2020.2977553</pub-id>
</mixed-citation>
</ref>
<ref id="B34">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Sung</surname> <given-names>F.</given-names></name>
<name><surname>Yang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhang</surname> <given-names>L.</given-names></name>
<name><surname>Xiang</surname> <given-names>T.</given-names></name>
<name><surname>Torr</surname> <given-names>P. H.</given-names></name>
<name><surname>Hospedales</surname> <given-names>T. M.</given-names></name>
</person-group> (<year>2018</year>). &#x201c;
<article-title>Learning to compare: Relation network for few-shot learning</article-title>,&#x201d; in <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>, (<conf-loc>Salt Lake City, UT, USA</conf-loc>: 
<publisher-name>Computer Vision Foundation / IEEE Computer Society</publisher-name>), <fpage>1199</fpage>&#x2013;<lpage>1208</lpage>.
</mixed-citation>
</ref>
<ref id="B35">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Yao</surname> <given-names>Q.</given-names></name>
<name><surname>Kwok</surname> <given-names>J. T.</given-names></name>
<name><surname>Ni</surname> <given-names>L. M.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Generalizing from a few examples: A survey on few-shot learning</article-title>. <source>ACM computing surveys (csur)</source> <volume>53</volume>, <fpage>1</fpage>&#x2013;<lpage>34</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3386252</pub-id>
</mixed-citation>
</ref>
<ref id="B36">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Wang</surname> <given-names>C.</given-names></name>
<name><surname>Yu</surname> <given-names>Z.</given-names></name>
<name><surname>Zheng</surname> <given-names>H.</given-names></name>
<name><surname>Wang</surname> <given-names>N.</given-names></name>
<name><surname>Zheng</surname> <given-names>B.</given-names></name>
</person-group> (<year>2017</year>). &#x201c;
<article-title>Cgan-plankton: Towards large-scale imbalanced class generation and fine-grained classification</article-title>,&#x201d; in <conf-name>2017 IEEE International Conference on Image Processing (ICIP)</conf-name>, Beijing China. (<publisher-loc>Piscataway, NJ, USA</publisher-loc>: 
<publisher-name>IEEE</publisher-name>), <fpage>855</fpage>&#x2013;<lpage>859</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1145/3386252</pub-id>
</mixed-citation>
</ref>
<ref id="B37">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>You</surname> <given-names>K.</given-names></name>
<name><surname>Kou</surname> <given-names>Z.</given-names></name>
<name><surname>Long</surname> <given-names>M.</given-names></name>
<name><surname>Wang</surname> <given-names>J.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Co-tuning for transfer learning</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>33</volume>, <fpage>17236</fpage>&#x2013;<lpage>17246</lpage>.
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/644449">Huiyu Zhou</ext-link>, University of Leicester, United Kingdom</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2022852">Jiande Sun</ext-link>, Shandong Normal University, China</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3272263">Zoey (Zhiyao) Shu</ext-link>, George Mason University, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3289731">Yang Nan</ext-link>, Imperial College London, United Kingdom</p></fn>
</fn-group>
</back>
</article>