<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Med.</journal-id>
<journal-title>Frontiers in Medicine</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Med.</abbrev-journal-title>
<issn pub-type="epub">2296-858X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fmed.2025.1519280</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Medicine</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Medical short text classification via Soft Prompt-tuning</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Xiao</surname> <given-names>Xiao</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2721461/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Han</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2524226/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Jiang</surname> <given-names>Feng</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Qi</surname> <given-names>Tingyue</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname> <given-names>Wei</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/987745/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Ultrasound, The Affiliated Hospital of Yangzhou University, Yangzhou University</institution>, <addr-line>Yangzhou</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Information Engineering, Yangzhou University</institution>, <addr-line>Yangzhou</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>Department of Ultrasound, The First Affiliated Hospital of Wannan Medical College</institution>, <addr-line>Wuhu</addr-line>, <country>China</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Radiology, The Affiliated Hospital of Yangzhou University, Yangzhou University</institution>, <addr-line>Yangzhou</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Hui Zheng, National Institute on Drug Abuse (NIH), United States</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Guosheng Han, Xiangtan University, China</p>
<p>Stefano Silvestri, National Research Council (CNR), Italy</p>
<p>Prasan Yapa, Kyoto University of Advanced Science (KUAS), Japan</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Wei Wang <email>waywang&#x00040;126.com</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>14</day>
<month>04</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>12</volume>
<elocation-id>1519280</elocation-id>
<history>
<date date-type="received">
<day>20</day>
<month>12</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>28</day>
<month>03</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2025 Xiao, Wang, Jiang, Qi and Wang.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Xiao, Wang, Jiang, Qi and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>In recent decades, medical short texts, such as medical conversations and online medical inquiries, have garnered significant attention and research. The advances in the medical short text have profound implications in practical applications, particularly for classifying in-patient discharge summaries and medical text reports, leading to improved understandability for medical professionals. However, the challenges posed by the short length, professional medical vocabulary, complex medical measures, and feature sparsity are further magnified in medical short text classification compared to general domains. This paper introduces a novel soft prompt-tuning method designed specifically for medical short text classification. Inspired by the recent success of prompt-tuning, which has been extensively explored to enhance semantic modeling in various natural language processing tasks with the appearance of GPT-3, our method incorporates an automatic template generation method to address the issues related to short length and feature sparsity. Additionally, we propose two different strategies to expand the label word space, effectively handling the challenges associated with specialized medical vocabulary and complex medical measures in medical short texts. The experimental results demonstrate the effectiveness of our method and its potential as a significant advancement in medical short text classification. By addressing issues related to short text length, feature sparsity, and specialized medical terminology, it offers a promising advancement toward more accurate and interpretable medical text classification.</p></abstract>
<kwd-group>
<kwd>medical short text</kwd>
<kwd>short text classification</kwd>
<kwd>prompt-tuning</kwd>
<kwd>soft prompt</kwd>
<kwd>NLP</kwd>
</kwd-group>
<counts>
<fig-count count="6"/>
<table-count count="2"/>
<equation-count count="6"/>
<ref-count count="48"/>
<page-count count="0"/>
<word-count count="7554"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Precision Medicine</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>In the past few years, short texts have been posted at unprecedented rates, which emphasizes the significance of learning tasks and also highlights the challenges stemming from the vast feature space (<xref ref-type="bibr" rid="B1">1</xref>). Different from traditional documents, short texts present considerable obstacles to the effectiveness of mainstream text classification solutions due to their short length, feature sparsity, and high ambiguity. The advances in short text classification have significant implications in practical applications including medical-aided diagnosis (<xref ref-type="bibr" rid="B2">2</xref>), thereby necessitating an urgent need to comprehend and address the characteristics of short texts.</p>
<p>With the rapid development of the Internet, online medical inquiries have garnered significant attention in the real world. These inquiries are presented in narrative formats that retain the characteristics of ambiguity and informality (<xref ref-type="bibr" rid="B3">3</xref>). Within these medical short texts, the presence of professional medical vocabulary and complex medical measures varies depending on the responses provided to different users. Moreover, the use of abbreviations and diverse forms of expression complicates the discovery of underlying patterns using conventional short text methods (<xref ref-type="bibr" rid="B4">4</xref>). Additionally, because labeling requires professional expert knowledge, collecting enough labeled data for medical short texts is time-consuming and labor-intensive, which makes training existing deep learning methods prohibitively expensive and impractical (<xref ref-type="bibr" rid="B5">5</xref>).</p>
<p>In recent decades, the research paradigm of medical short text classification has transformed, shifting from early feature engineering-based methods to the adoption of neural networks, which have been extensively applied and demonstrated superiority in this task. Prominent deep neural networks, such as Convolutional Neural Network (CNN) (<xref ref-type="bibr" rid="B6">6</xref>), Recurrent Neural Network (RNN) (<xref ref-type="bibr" rid="B7">7</xref>), Long Short-Term Memory (LSTM) (<xref ref-type="bibr" rid="B8">8</xref>), and autoencoder (<xref ref-type="bibr" rid="B9">9</xref>), have exhibited impressive performance in learning more abstract and higher-level representations for medical short texts. For instance, (<xref ref-type="bibr" rid="B10">10</xref>) proposed the optimized TextCNN model for Chinese medicine text classification, which yielded highly promising results. Recently, Pre-trained Language Models (PLMs) such as BERT (<xref ref-type="bibr" rid="B11">11</xref>), RoBERTa (<xref ref-type="bibr" rid="B12">12</xref>), T5 (<xref ref-type="bibr" rid="B13">13</xref>), and GPT (<xref ref-type="bibr" rid="B14">14</xref>) have emerged as powerful tools for language understanding and generation. To leverage the rich knowledge embedded in PLMs for various natural language processing (NLP) tasks, the fine-tuning method, coupled with an additional classifier, has been widely employed and achieved remarkable performance across various downstream tasks, including medical short text classification (<xref ref-type="bibr" rid="B15">15</xref>).</p>
<p>More recently, inspired by the success of GPT-3, prompt-tuning has gained extensive attention to enhance semantic modeling in various Natural Language Processing (NLP) tasks (<xref ref-type="bibr" rid="B16">16</xref>, <xref ref-type="bibr" rid="B17">17</xref>). In prompt-tuning, hand-crafted or auto-generated templates are employed to formalize downstream NLP tasks into cloze-style filling tasks. For instance, in the context of medical short text classification, given the input sentence <italic>x</italic> as &#x0201C;Which department of the hospital should be registered for epilepsy?,&#x0201D; prompt-tuning with a hand-crafted template wraps it into &#x0201C;A problem for [MASK]: <italic>x</italic>.&#x0201D; The probability of different topic words, such as &#x0201C;Neurology&#x0201D; or &#x0201C;Endocrinology,&#x0201D; is then calculated to fill the &#x0201C;[MASK]&#x0201D; token. Remarkably, without the need for fine-tuning on large volumes of labeled data for the downstream task, prompt-tuning has exhibited promising performance, even in few-shot or zero-shot learning scenarios. Despite the effectiveness of prompt-tuning methods with hand-crafted templates in various NLP downstream tasks, their construction remains time-consuming and labor-intensive, requiring substantial human effort. Moreover, poorly designed templates can degrade model performance. More recently, soft prompt-tuning methods have been explored (<xref ref-type="bibr" rid="B18">18</xref>). Unlike hand-crafted templates, soft prompts are continuous representations, typically encoded as vectors, that can be optimized during training to achieve better performance.</p>
<p>In this paper, we present a novel method for Medical short text classification via Soft Prompt-tuning (short for MSP). Our method aims to address the challenges posed by professional medical vocabulary and complex medical measures in medical short texts. Specifically, we construct the mapping from the expanded label words (e.g., breast, sterility, obstetrics, cervical diseases, gynecologist, etc.) to their corresponding categories (e.g., gynecology and obstetrics) in prompt-tuning. This mapping, referred to as &#x0201C;verbalizer,&#x0201D; has been proven effective in reducing the discrepancy between the text and label spaces (<xref ref-type="bibr" rid="B19">19</xref>). In our method, we propose two strategies, i.e., Concepts Retrieval and Context Information, to construct the verbalizer, each capturing different characteristics of the expanded words. The integration of these two strategies yields the final verbalizer, which significantly enhances the accuracy of classification. Moreover, to reduce the reliance on large-scale labeled training datasets, our MSP method is grounded in soft prompt-tuning. Soft prompt-tuning incorporates the vector of the input sentence, the mask, and the soft tokens, enabling the model to achieve robust performance even in few-shot scenarios. The experimental results on online medical inquiries demonstrated the effectiveness of our MSP method compared to other state-of-the-art methods for medical short text classification. The contributions of our method can be summarized as follows:</p>
<list list-type="bullet">
<list-item><p>Our MSP is a novel medical short text classification method based on prompt-tuning. Compared with existing methods, MSP can achieve better performance even in few-shot scenarios.</p></list-item>
<list-item><p>The verbalizer is constructed for the professional medical field, and two strategies are employed to capture different characteristics of the expanded words. The integration of these strategies is utilized as the final verbalizer.</p></list-item>
<list-item><p>The experimental results on real-world online medical inquiries demonstrate that our MSP obtains new state-of-the-art results compared to other deep neural networks and fine-tuned PLM methods.</p></list-item>
</list>
</sec>
<sec id="s2">
<title>2 Related work</title>
<p>In this section, we review the related work on medical short text classification and on prompt-tuning in detail.</p>
<sec>
<title>2.1 Medical short text classification</title>
<p>The medical short text has garnered significant attention and research in recent decades, and the advancements in this area have had far-reaching implications in various practical applications, such as medical-aided diagnosis (<xref ref-type="bibr" rid="B20">20</xref>) and online medical inquiries (<xref ref-type="bibr" rid="B21">21</xref>). Specifically, medical short text classification focuses on predicting accurate labels for texts with limited length. For instance, in the context of online medical inquiries, both the problem and its corresponding answer usually consist of fewer than 20 words (<xref ref-type="bibr" rid="B22">22</xref>).</p>
<p>In recent years, deep neural networks have demonstrated remarkable performance in medical short text classification tasks, attributed to their ability to learn abstract and higher-level feature representations. For instance, Kim proposed the TextCNN model, which achieved substantial performance in sentence-level classification tasks by training a CNN with a single layer on top of pre-trained word vectors (<xref ref-type="bibr" rid="B23">23</xref>). The model kept the word vectors static while tuning and learning the parameters of only one CNN layer. In the domain of medical short text classification, Li et al. introduced the convolutional layer to extract features from sentences and utilized bidirectional gated recurrent unit (BIGRU) to learn both preceding and succeeding sentence features. Additionally, an attention mechanism was employed to obtain sentence representations with important word weights (<xref ref-type="bibr" rid="B5">5</xref>). BIGRU and the attention mechanism were also leveraged for document representation learning, serving as both encoding and decoding layers. (<xref ref-type="bibr" rid="B24">24</xref>) proposed the incorporation of word-cluster embedding into deep neural networks to address the problem of semantic feature scarcity and ambiguity. The method involved hierarchical agglomerative clustering to cluster word embeddings in the semantic space. The resulting cluster center vectors served as potential theme information, and CNN and LSTM were employed for classification using the cluster-based features. In the context of online medical inquiries, (<xref ref-type="bibr" rid="B3">3</xref>) proposed a three-stage hybrid system for classification. The system combined a regular expression-based classifier with attentive bidirectional Long Short-Term Memory (ABLSTM) to achieve high classification results. 
ABLSTM was introduced to extract feature words of high quality, and the word weights were then utilized in constructing regular expression-based text classifiers. Furthermore, (<xref ref-type="bibr" rid="B2">2</xref>) proposed a method for Chinese electronic medical record classification based on an improved capsule network. In this approach, Chinese medical short texts were initially processed using an LSTM network, followed by the utilization of the Capsule network to achieve improved performance.</p>
<p>Recently, fine-tuned Pre-trained Language Models (PLMs), including BERT (<xref ref-type="bibr" rid="B11">11</xref>), ALBERT (<xref ref-type="bibr" rid="B25">25</xref>), RoBERTa (<xref ref-type="bibr" rid="B12">12</xref>), T5 (<xref ref-type="bibr" rid="B13">13</xref>), and GPT (<xref ref-type="bibr" rid="B14">14</xref>), have emerged as powerful tools for leveraging rich knowledge in NLP downstream tasks. Through fine-tuning PLMs with specific downstream tasks, latent information can be learned, leading to tremendous success in various NLP tasks, including medical short text classification. Given the exceptional performance of fine-tuned PLM methods, it is widely acknowledged that training new models from scratch can be avoided. For instance, (<xref ref-type="bibr" rid="B26">26</xref>) proposed a Knowledge Graph enhanced multiType text BERT method for medical text classification. This approach integrates the medical knowledge graph to extract standard entity names from medical text. Initially, the same BERT-Encoder is employed to process multi-type text, and then multiple encodings are concatenated to form the representation matrix. Different types of pooling layers are explored for information summation. (<xref ref-type="bibr" rid="B27">27</xref>) proposed an optimal deep learning model based on BERT and hyperparameter selection for medical text classification. The BERT model is used to learn the feature representations of medical texts, followed by the utilization of the Particle Swarm Optimization (PSO) algorithm for selecting hyperparameters for the deep learning classifier. (<xref ref-type="bibr" rid="B4">4</xref>) proposed an ALBERT-based fusion Kalman-filter model to address word-level and sentence-level noises for medical short texts. They first employ a sliding window scheme to handle the coupling relationships of large sequences and then use the fusion block to integrate features of multiple segment sequences. 
The ALBERT architecture with four iterative encoder layers is leveraged as PLMs for word embedding learning. (<xref ref-type="bibr" rid="B28">28</xref>) proposed a Traditional Chinese Medicine (TCM) text classification method based on RoBERTa. They fine-tuned the RoBERTa model with TCM medical records data and then tokenized the classified sample data using the Tokenizer based on the pre-trained RoBERTa model. Wang et al. propose the use of a discriminative pre-training language model called ERNIE-Health for classifying medical texts. Specifically, the authors attempt prompt tuning based on a multi-token selection task, wrapping the original text in a template into a new sequence where category labels are replaced with [UNK] tokens. The model is then trained to compute the probability distribution of candidate categories. Adapting prompt-tuning methods designed primarily for English to Chinese text classification tasks presents challenges (<xref ref-type="bibr" rid="B29">29</xref>). Li et al. introduce Knowledge Enhanced Multi-Token Prompt Tuning (KMPT). The implementation involves initially using multiple tokens as label words with complete Chinese semantics. Subsequently, external knowledge is utilized to expand the set of label words, improving coverage and reducing bias (<xref ref-type="bibr" rid="B30">30</xref>).</p>
</sec>
<sec>
<title>2.2 Prompt-tuning</title>
<p>Despite the success of fine-tuning PLMs, recent studies have identified a critical challenge: the significant gap in objective forms between pre-training and fine-tuning, which limits the exploitation of knowledge in PLMs. To tackle this issue, prompt-tuning has emerged with inspiration from GPT-3 for improving semantic modeling across a wide range of NLP tasks (<xref ref-type="bibr" rid="B31">31</xref>). Prompt-tuning involves inserting input statements into natural language templates and adjusting the mask model to transform tasks into cloze-style filling tasks (<xref ref-type="bibr" rid="B17">17</xref>). The prompt-tuning has been widely explored and applied with tremendous success in various downstream NLP tasks, including information extraction (<xref ref-type="bibr" rid="B32">32</xref>), question answering (<xref ref-type="bibr" rid="B33">33</xref>), text generation (<xref ref-type="bibr" rid="B34">34</xref>), and text classification (<xref ref-type="bibr" rid="B35">35</xref>).</p>
<p>The primary components of prompt-tuning include a template and a set of label words. The template serves as a background description of the current task, while the label words consist of the high-probability vocabulary predicted by PLMs in the given context. Initially, hand-crafted templates, which involve discrete prompts manually specified and kept unchanged during training, were proposed and applied. For instance, (<xref ref-type="bibr" rid="B36">36</xref>) encoded prior knowledge of a classification task into rules and decomposed it into sub-tasks, combining human-picked sub-prompts for the final classification tasks. (<xref ref-type="bibr" rid="B37">37</xref>) introduced learning discrete prompts through continuous optimization, which achieved notable performance in both image generation and language classification tasks. In the context of relation extraction, (<xref ref-type="bibr" rid="B32">32</xref>) incorporated knowledge among relation labels into prompt-tuning. The method involved injecting learnable virtual answer words into semantic knowledge to represent relation labels and then optimizing the representations with structured constraints rather than relying on entity-type annotations.</p>
<p>Although hand-crafted templates have shown substantial success in various NLP tasks, their construction is often time-consuming and labor-intensive, and inappropriate templates may lead to substandard model performance. To address this, more recently, automatically generated template methods, i.e., soft templates, have been explored (<xref ref-type="bibr" rid="B18">18</xref>, <xref ref-type="bibr" rid="B38">38</xref>). In contrast to hard templates, soft templates are continuous prompts, usually presented as vectors, that can be continually optimized during training to yield optimal results. For example, (<xref ref-type="bibr" rid="B39">39</xref>) proposed to learn a mixture of soft prompts to extract relational knowledge from language models, which has shown that continuous vectors can achieve impressive performance compared to &#x0201C;hard prompts.&#x0201D; (<xref ref-type="bibr" rid="B40">40</xref>) introduced an automatic prompt generation method that achieved promising results in Natural Language Understanding tasks by identifying the most suitable template for downstream tasks and incorporating learnable vectors into the template while continually optimizing it during training. Additionally, (<xref ref-type="bibr" rid="B41">41</xref>) proposed a transfer learning method based on soft prompts, where a prompt is first trained on one or more source tasks, and then the resulting prompt is utilized to initialize the prompt for the downstream tasks.</p>
<p>In addition to the template used in prompt-tuning, the mapping from label words to categories, known as the verbalizer, has been demonstrated to effectively address the discrepancy between text and label space (<xref ref-type="bibr" rid="B19">19</xref>). There have been efforts to create hand-collected verbalizers for various NLP downstream tasks. For instance, (<xref ref-type="bibr" rid="B42">42</xref>) proposed to use pairs of cloze question patterns and manually designed verbalizers to leverage the knowledge contained within PLMs for downstream tasks. However, manually designed verbalizers are heavily influenced by prior knowledge, leading to potential omissions and biases in knowledge expansion. Automatic verbalizer construction methods have been developed to mitigate these issues (<xref ref-type="bibr" rid="B43">43</xref>&#x02013;<xref ref-type="bibr" rid="B45">45</xref>). For example, (<xref ref-type="bibr" rid="B35">35</xref>) introduced a knowledgeable expansion prompt-learning method for short text classification. This approach incorporates both the short text itself and external knowledge from open Knowledge Bases, such as Probase, to extend the label words space. Several different strategies are employed for automatic verbalizer construction. (<xref ref-type="bibr" rid="B46">46</xref>) proposed a method to elicit knowledge from PLMs for constructing verbalizers. In this method, the label information is encoded as prototypical embeddings in the latent feature space, and representations of masked words and prototypical embeddings are calculated for the classification tasks. Furthermore, (<xref ref-type="bibr" rid="B47">47</xref>) devised a method to incorporate imprecise knowledge from large unlabelled corpora into verbalizer construction for biomedical text relation extraction. 
In this method, word and relation word embeddings are learned to infuse entity and relation information, and biomedical domain knowledge constraints are introduced to enhance representations.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<title>3 Methodology</title>
<p>The whole framework of our MSP is illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>. In this section, the automatic template generation, verbalizer construction, and medical short text classification are described successively in detail.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>The illustration of our MSP. There are three main components in our method: soft template generation, automatic verbalizer construction, and medical short text classification. Firstly, the mask and the input <italic>x</italic> are mapped into the embedding by PLMs such as BERT in the experiments. The neural network, i.e., BiLSTM in our experiments, is then employed to train soft tokens in the soft template. Secondly, the top <italic>N</italic><sub><italic>a</italic></sub> concepts concerning the entities in the medical short text itself are retrieved by two strategies for automatic verbalizer construction. Finally, the constructed soft prompt and verbalizer are used to predict the probability of each label word belonging to a specific class for medical short text classification.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1519280-g0001.tif"/>
</fig>
<sec>
<title>3.1 Soft template generation</title>
<p>As depicted in <xref ref-type="fig" rid="F1">Figure 1</xref>, the soft template <italic>T</italic> comprises two main components: the soft prompt tokens and the embeddings of the input sentence <italic>x</italic> and MASK. For instance, in the earlier example, where the task is to predict the topic of the sentence <italic>x</italic>: &#x0201C;Which department of the hospital should be registered for epilepsy?,&#x0201D; the prediction is based on the probability that the topic word, such as &#x0201C;Neurology&#x0201D; or &#x0201C;Endocrinology,&#x0201D; fills the [<italic>MASK</italic>] token.</p>
<p>In the experiments, we utilized BERT as PLMs to learn the embeddings of the input sentence <italic>x</italic> and MASK. The input sentence <italic>x</italic> is represented as <italic>x</italic> &#x0003D; {<italic>x</italic><sub>0</sub>, &#x02026;, <italic>x</italic><sub><italic>i</italic></sub>, &#x02026;, <italic>x</italic><sub><italic>h</italic></sub>}, which is mapped into the embeddings as <italic>e</italic>(<italic>x</italic>) &#x0003D; {<italic>e</italic>(<italic>x</italic><sub>0</sub>), &#x02026;, <italic>e</italic>(<italic>x</italic><sub><italic>i</italic></sub>), &#x02026;, <italic>e</italic>(<italic>x</italic><sub><italic>h</italic></sub>)} by PLMs <italic>e</italic>. Similarly, the mask is mapped into the embeddings as <italic>e</italic>(<italic>mask</italic>). Then the soft tokens in the template are denoted as [token<sub>1</sub>]&#x02026;[token<sub><italic>i</italic></sub>][token<sub><italic>i</italic>&#x0002B;1</sub>]&#x02026;[token<sub><italic>n</italic></sub>]. In contrast to manually designed templates, the soft template generation involves the use of a BiLSTM as the neural network to train the initial soft tokens, resulting in <italic>h</italic><sub>1</sub>, &#x02026;, <italic>h</italic><sub><italic>i</italic></sub>, <italic>h</italic><sub><italic>i</italic>&#x0002B;1</sub>, &#x02026;, <italic>h</italic><sub><italic>n</italic></sub>. The soft prompt <italic>T</italic> can then be represented as <xref ref-type="disp-formula" rid="E1">Equation 1</xref>:</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>T</mml:mi><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mn>0</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>n</mml:mi></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mi>e</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>h</italic><sub><italic>i</italic></sub> can be formulated as <xref ref-type="disp-formula" rid="E2">Equation 2</xref>:</p>
<disp-formula id="E2"><label>(2)</label><mml:math id="M2"><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>h</mml:mi><mml:mo>&#x2192;</mml:mo></mml:mover><mml:mi>i</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mover accent='true'><mml:mi>h</mml:mi><mml:mo>&#x2190;</mml:mo></mml:mover><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mover accent='true'><mml:mrow><mml:mi>L</mml:mi><mml:mi>S</mml:mi><mml:mi>T</mml:mi><mml:mi>M</mml:mi></mml:mrow><mml:mo stretchy='true'>&#x2192;</mml:mo></mml:mover><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mover accent='true'><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy='true'>&#x2192;</mml:mo></mml:mover></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mover accent='true'><mml:mrow><mml:mi>L</mml:mi><mml:mi>S</mml:mi><mml:mi>T</mml:mi><mml:mi>M</mml:mi></mml:mrow><mml:mo stretchy='true'>&#x2190;</mml:mo></mml:mover><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:mover accent='true'><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mi>n</mml:mi></mml:msub></mml:mrow><mml:mo stretchy='true'>&#x2190;</mml:mo></mml:mover></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></disp-formula>
<p>The soft prompt <italic>T</italic> enables us to discover better continuous prompts for enhancing the performance of downstream tasks. Finally, the soft tokens are optimized by minimizing the loss function <italic>L</italic> of the short text classification task with respect to <italic>h</italic><sub><italic>i</italic></sub>, which can be formulated as <xref ref-type="disp-formula" rid="E3">Equation 3</xref>:</p>
<disp-formula id="E3"><label>(3)</label><mml:math id="M3"><mml:mrow><mml:mover accent='true'><mml:mi>h</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>arg&#x000A0;min&#x000A0;</mml:mtext></mml:mrow><mml:mrow><mml:msub><mml:mi>h</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow></mml:munder><mml:mi>L</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mi>x</mml:mi><mml:mo>,</mml:mo><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>s</mml:mi><mml:mi>k</mml:mi></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:math></disp-formula>
</sec>
<sec>
<title>3.2 Automatic verbalizer construction</title>
<p>In prompt-tuning, verbalizer refers to the mapping from the expanded label words (e.g., breast, sterility, obstetrics, Cervical diseases, gynecologist, etc.) to their corresponding categories (e.g., gynecology and obstetrics), which has been empirically proven to effectively reduce the discrepancy between text and label spaces, thereby enhancing the performance of downstream tasks (<xref ref-type="bibr" rid="B19">19</xref>). In contrast to previous methods that directly search concepts in large knowledge bases using category names (<xref ref-type="bibr" rid="B44">44</xref>, <xref ref-type="bibr" rid="B45">45</xref>), we propose two different strategies, namely Concepts Retrieval and Context Information, to expand the label words from the short text itself. Each strategy captures different aspects of the characteristics of the expanded words, and these words are subsequently integrated into the final verbalizer. Below are the details of the two strategies:</p>
<sec>
<title>3.2.1 Concepts retrieval</title>
<p>In previous verbalizer construction methods, such as those mentioned in (<xref ref-type="bibr" rid="B44">44</xref>) and (<xref ref-type="bibr" rid="B45">45</xref>), the expanded label words were identified in large knowledge bases through semantic similarity calculation. However, these methods not only yielded unsatisfactory performance in medical short text classification but also proved to be time-consuming and labor-intensive. To tackle this issue, in this paper, we first retrieved concepts related to entities mentioned in the medical short text from an open knowledge base, such as Probase,<xref ref-type="fn" rid="fn0001"><sup>1</sup></xref> which provides the probability of each entity belonging to a particular concept. This novel approach, termed Concepts Retrieval, allows us to address the challenges posed by professional medical vocabulary and complex medical measures in medical short text classification. Moreover, it enables us to avoid searching the entire knowledge base and focus solely on retrieving professional medical concepts by leveraging Probase for probability ranking. To be more specific, we retrieved <italic>N</italic>(<italic>v</italic>) concepts from Probase ranked by their probabilities. Subsequently, we introduced category names <italic>y</italic> (such as &#x0201C;Neurology&#x0201D; and &#x0201C;Endocrinology&#x0201D; in medical text datasets) as anchor words. The distance <italic>dist</italic>(<italic>V</italic><sub><italic>y</italic></sub>, <italic>y</italic>) between each expanded label word and the category name <italic>y</italic> was calculated in the embedded space. In our experiments, we selected the top <italic>N</italic><sub><italic>a</italic></sub> words, excluding morphological derivations of <italic>y</italic>; <italic>N</italic><sub><italic>a</italic></sub> &#x0003D; 15 is used for the Concepts Retrieval strategy.</p>
</sec>
<sec>
<title>3.2.2 Context information</title>
<p>To expand words while incorporating context information from the words preceding and following the masked word, we employed PLMs such as BERT in our experiments, as opposed to traditional N-gram language modeling. However, because BERT is a non-autoregressive language model, it cannot directly compute the likelihood of a sentence. To address this, we introduced a symmetric window of size <italic>c</italic> around the &#x0201C;[MASK]&#x0201D; word as the context. Let <italic>W</italic> &#x0003D;&#x02026;<italic>w</italic><sub>&#x02212;<italic>c</italic></sub>, &#x02026;<italic>w</italic><sub>&#x02212;1</sub>, <italic>w, w</italic><sub>1</sub>, &#x02026;<italic>w</italic><sub><italic>c</italic></sub>, &#x02026; represent the context of the masked word <italic>w</italic>. We then masked each <italic>w</italic><sub><italic>i</italic></sub> in <italic>W</italic> from front to back and fed it into the BERT model to compute the loss of <italic>w</italic>, as expressed in <xref ref-type="disp-formula" rid="E4">Equation 4</xref>:</p>
<disp-formula id="E4"><label>(4)</label><mml:math id="M4"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>&#x02208;</mml:mo><mml:mi>V</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mn>1</mml:mn><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>}</mml:mo></mml:mrow><mml:mo>&#x000D7;</mml:mo><mml:mo class="qopname">log</mml:mo><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>v</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mo>\</mml:mo><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>V</italic> represents the set of words in the vocabulary, 1{&#x000B7;} is the indicator function, and <italic>p</italic>(<italic>v</italic><sub><italic>i</italic></sub> &#x0003D; <italic>w</italic><sub><italic>i</italic></sub>|<italic>W</italic><sub>\<italic>w</italic><sub><italic>i</italic></sub></sub>) is the BERT prediction distribution that is conditioned on <italic>W</italic> excluding <italic>w</italic><sub><italic>i</italic></sub>. The total loss of <italic>W</italic> is then computed as the average of the losses of all words <italic>w</italic><sub><italic>i</italic></sub>, which can be represented as <xref ref-type="disp-formula" rid="E5">Equation 5</xref>:</p>
<disp-formula id="E5"><label>(5)</label><mml:math id="M5"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mn>2</mml:mn><mml:mi>c</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mo>-</mml:mo><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mi>c</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mi>L</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>w</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Finally, all the expanded words are sorted based on their corresponding sequence loss <italic>L</italic>(<italic>W</italic>). In our experiments, we set <italic>c</italic> to 5 and discarded words with higher loss. Similar to the &#x0201C;Concepts Retrieval&#x0201D; strategy, we finally selected <italic>N</italic><sub><italic>a</italic></sub> words among all the predicted words to construct the verbalizer, and <italic>N</italic><sub><italic>a</italic></sub> &#x0003D; 15 is also used for context information strategy in the experiments.</p>
</sec>
</sec>
<sec>
<title>3.3 Medical short text classification</title>
<p>Once we have constructed the final verbalizer for medical short text, the predicted probability for each label word needs to be mapped to a specific category. This mapping process can be represented by an objective function denoted as <italic>g</italic>. Since we assume that each word in the final verbalizer contributes equally to the prediction, we use the average of the predicted scores for text classification. Specifically, <italic>g</italic> can be calculated as <xref ref-type="disp-formula" rid="E6">Equation 6</xref>:</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mo class="qopname">arg</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo class="qopname">max</mml:mo></mml:mrow><mml:mrow><mml:mi>y</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mi>Y</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>|</mml:mo></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>v</mml:mi><mml:mo>&#x02208;</mml:mo><mml:msub><mml:mrow><mml:mi>V</mml:mi></mml:mrow><mml:mrow><mml:mi>y</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munder></mml:mstyle><mml:mi>p</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:mtext class="textrm" mathvariant="normal">MASK</mml:mtext></mml:mrow><mml:mo>]</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mi>v</mml:mi><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>where <italic>V</italic><sub><italic>y</italic></sub> represents the set of label words corresponding to the label <italic>y</italic>, and |<italic>V</italic><sub><italic>y</italic></sub>| denotes the cardinality of <italic>V</italic><sub><italic>y</italic></sub>. The function <italic>p</italic>([MASK] &#x0003D; <italic>v</italic>|<italic>x</italic><sub><italic>p</italic></sub>) computes the probability of the label word <italic>v</italic> given the input text <italic>x</italic><sub><italic>p</italic></sub>.</p>
</sec>
</sec>
<sec id="s4">
<title>4 Experiments</title>
<sec>
<title>4.1 Datasets</title>
<p>To validate the effectiveness of our proposed MSP, we utilized web crawling techniques to acquire a certain amount of data from the Chinese medical conversation dataset. This enabled us to derive the symptom diagnosis classification dataset and the gynecological multi-classification dataset, which were then subjected to experimental analysis. Below is a comprehensive depiction of the two datasets:</p>
<list list-type="bullet">
<list-item><p>Symptom dataset. This dataset covers eight categories, namely, infectious diseases, proctology, orthopedics, respiratory, andrology, burn, cardiovascular, and plastic surgery, crawled from the Internet, with a total of 80,000 training samples and 4,000 test samples of original data.</p></list-item>
<list-item><p>Gynecology dataset. This dataset covers the infertility, obstetrics, and gynecology departments under the gynecology category, crawled from the Internet, with a total of 45,000 training samples and 2,100 test samples of original data.</p></list-item>
</list>
</sec>
<sec>
<title>4.2 Compared methods</title>
<p>The following methods including deep neural networks and fine-tuned PLMs models are utilized as compared methods.</p>
<list list-type="bullet">
<list-item><p>Regular Prompt-tuning (PT) (<xref ref-type="bibr" rid="B16">16</xref>): It employs hand-crafted templates and label words to form the prompt, along with an ensemble model to annotate an unlabeled dataset. To ensure fairness, we use the same sample template for the same dataset.</p></list-item>
<list-item><p>TextCNN (<xref ref-type="bibr" rid="B23">23</xref>): The CNN architecture is utilized for the task of text classification. In particular, the text undergoes preliminary word segmentation, followed by passing through a convolutional and pooling layer in succession, and the outcome is then passed through an external softmax classifier to classify the text.</p></list-item>
<list-item><p>ERNIE (<xref ref-type="bibr" rid="B48">48</xref>): By improving the classical PLMs like BERT, the knowledge and linguistic semantic information are integrated to enhance the representation of text semantics, which is more suitable for Chinese natural language processing tasks.</p></list-item>
<list-item><p>P-tuning (<xref ref-type="bibr" rid="B40">40</xref>): It proposes to learn continuous prompts instead of hand-crafted prompt by inserting trainable variables into the embedded input.</p></list-item>
</list>
</sec>
<sec>
<title>4.3 Experiment setting</title>
<sec>
<title>4.3.1 Training data</title>
<p>To simulate the situation of data scarcity, we conducted experiments using 10-shot, 15-shot, and 20-shot sampling methods to evaluate the effectiveness of our proposed MSP. Here, we provide a detailed account of the training samples used in the experiments. For each K-shot methodology discussed in this paper, along with PT and P-tuning, K sample data from each class were extracted from the original training set to form small-sized training sets. Additionally, another K sample data from each class were extracted to create the corresponding test sets. As the selection of training and verification sets with small samples introduced variability, we executed three random sampling experiments, and the final experimental results were averaged over these instances of random sampling.</p>
<p>Regarding the ERNIE and TextCNN models&#x00027; performance, we conducted manual sampling by handpicking different numbers of training samples. This allowed us to compare the quantities of 10-shot, 15-shot, and 20-shot samples utilized in the proposed method. Specific sample figures are presented in <xref ref-type="table" rid="T1">Table 1</xref>.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Details of the training data used by the different models.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>ERNIE</bold></th>
<th valign="top" align="center"><bold>TextCNN</bold></th>
<th valign="top" align="center"><bold>PT/P-tuning/Ours</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Symptom</td>
<td valign="top" align="center">2,240/2,400/2,800</td>
<td valign="top" align="center">4,000/4,400/4,800</td>
<td valign="top" align="center">80/120/160</td>
</tr>
<tr>
<td valign="top" align="left">Gynecology</td>
<td valign="top" align="center">1,350/1,380/1,440</td>
<td valign="top" align="center">720/750/840</td>
<td valign="top" align="center">30/45/60</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<title>4.3.2 Parameter settings</title>
<p>The detailed experimental configurations are as follows. All experiments were run in a Python 3.6 environment on Linux. For both datasets, the batch size was set to 64. Considering the dataset distribution, the Symptom diagnosis dataset was trained for a total of 10 epochs, while the Gynecological multi-classification dataset was trained for 20 epochs. The model&#x00027;s learning rate was set to 0.0003 with an AdamW optimizer. The hidden size was set to 200, the dropout rate to 0.5, and the weight decay to 0.01, while the parameters of the pre-trained language model were frozen during the training process. To ensure fairness, we used the same experimental settings for all compared methods based on prompt-tuning, such as Regular Prompt-tuning (PT) and P-tuning. BERT was used as the backbone PLM, and its implementation was based on the Hugging Face Transformers library for the main methods.</p>
</sec>
</sec>
<sec>
<title>4.4 Experimental results</title>
<p><xref ref-type="table" rid="T2">Table 2</xref>, accompanied by <xref ref-type="fig" rid="F2">Figures 2</xref>, <xref ref-type="fig" rid="F3">3</xref>, provides a comprehensive analysis of the experimental findings. Based on the experimental results, the following conclusions can be drawn. Firstly, the accuracy of all methods has improved as the number of training samples increases. This indicates that increasing the number of samples in few-shot learning is beneficial for enhancing model performance. Notably, the proposed method exhibits better experimental results compared to the other four comparison methods on all cases in terms of overall classification efficacy.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Classification accuracy (%) on the two datasets.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Dataset</bold></th>
<th valign="top" align="center"><bold>Method</bold></th>
<th valign="top" align="center" colspan="3"><bold>Training data and the accuracy (%) results</bold></th>
</tr>
</thead>
<tbody>
<tr style="background-color:#919498;color:#ffffff">
<td/>
<td/>
<td valign="top" align="center"><bold>10/500/280/10/10</bold></td>
<td valign="top" align="center"><bold>15/550/300/15/15</bold></td>
<td valign="top" align="center"><bold>20/600/400/20/20</bold></td>
</tr>
 <tr>
<td valign="top" align="left">Symptom</td>
<td valign="top" align="center">PT</td>
<td valign="top" align="center">83.35&#x000B1;1.57</td>
<td valign="top" align="center">85.17&#x000B1;1.51</td>
<td valign="top" align="center">86.87&#x000B1;1.62</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">TextCNN</td>
<td valign="top" align="center">82.28&#x000B1;1.82</td>
<td valign="top" align="center">82.82&#x000B1;1.65</td>
<td valign="top" align="center">83.38&#x000B1;2.94</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">ERNIE</td>
<td valign="top" align="center">83.38&#x000B1;5.10</td>
<td valign="top" align="center">84.54&#x000B1;3.82</td>
<td valign="top" align="center">85.18&#x000B1;5.30</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">p-tuning</td>
<td valign="top" align="center">80.39&#x000B1;0.98</td>
<td valign="top" align="center">84.94&#x000B1;0.96</td>
<td valign="top" align="center">86.25&#x000B1;0.46</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">Ours</td>
<td valign="top" align="center"><bold>83.78</bold>&#x000B1;0.61</td>
<td valign="top" align="center"><bold>85.67</bold>&#x000B1;0.82</td>
<td valign="top" align="center"><bold>86.93</bold>&#x000B1;0.63</td>
</tr>
<tr style="background-color:#919498;color:#ffffff">
<td/>
<td/>
<td valign="top" align="center"><bold>10/240/450/10/10</bold></td>
<td valign="top" align="center"><bold>15/250/460/15/15</bold></td>
<td valign="top" align="center"><bold>20/280/480/20/20</bold></td>
</tr>
 <tr>
<td valign="top" align="left">Gynecology</td>
<td valign="top" align="center">PT</td>
<td valign="top" align="center">72.38&#x000B1;1.82</td>
<td valign="top" align="center">75.66&#x000B1;2.53</td>
<td valign="top" align="center">79.65&#x000B1;1.70</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">TextCNN</td>
<td valign="top" align="center">72.30&#x000B1;2.19</td>
<td valign="top" align="center">74.52&#x000B1;3.81</td>
<td valign="top" align="center">78.15&#x000B1;1.92</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">ERNIE</td>
<td valign="top" align="center">70.20&#x000B1;3.55</td>
<td valign="top" align="center">74.51&#x000B1;5.48</td>
<td valign="top" align="center">80.73&#x000B1;2.90</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">p-tuning</td>
<td valign="top" align="center">65.79&#x000B1;1.48</td>
<td valign="top" align="center">71.71&#x000B1;1.06</td>
<td valign="top" align="center">74.43&#x000B1;0.76</td>
</tr>
 <tr>
<td/>
<td valign="top" align="center">Ours</td>
<td valign="top" align="center"><bold>72.60</bold>&#x000B1;0.74</td>
<td valign="top" align="center"><bold>76.30</bold>&#x000B1;1.05</td>
<td valign="top" align="center"><bold>81.18</bold>&#x000B1;0.97</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Bold values indicate the best result in each setting.</p>
</table-wrap-foot>
</table-wrap>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Results of each method on the Symptom dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1519280-g0002.tif"/>
</fig>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p>Results of each method on the Gynecology dataset.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1519280-g0003.tif"/>
</fig>
<p>Secondly, in the Symptom dataset, when employing the PT and P-tuning methods alongside manual templates, the experimental results are significantly better than those of the ERNIE and TextCNN models. However, none of these methods match the performance of our MSP method presented in this paper, validating the effectiveness of the proposed model. On the other hand, the overall effect on Gynecological datasets is not as satisfactory. This can be attributed to the relatively limited content differentiation within the Gynecological classification datasets. Nonetheless, the proposed MSP method shows improvement in classification accuracy by incorporating external knowledge.</p>
<p>Lastly, in the context of Chinese medical dialogue datasets, it is noteworthy that despite the ERNIE model being a pre-trained language model and TextCNN being considered highly adept at Chinese natural language processing tasks, the sample size of the proposed method in this paper is merely 1/20 of those used in the two methods. Despite this, the classification accuracy of the proposed method still surpasses that of these two widely used text classification approaches, highlighting the efficacy of the proposed method, especially in limited data learning scenarios.</p>
</sec>
<sec>
<title>4.5 Parameter sensitivity</title>
<p>Some important parameters in the experiments often affect the performance, such as learning rate and batch size. In this section, we performed sensitivity experiments on the Symptom dataset under the condition of 20-shot.</p>
<sec>
<title>4.5.1 Batch size</title>
<p>The batch size parameter plays a crucial role in determining the performance of the model in the realm of text classification. In our experiment, we conducted a comparison of the model&#x00027;s performance under five different batch sizes: 8, 16, 32, 64, and 128, and analyzed the results. Our empirical findings, as depicted in <xref ref-type="fig" rid="F4">Figure 4</xref>, reveal that the highest accuracy score of 86.91% was attained when the batch size was set to 64, while a suboptimal accuracy score of 83.52% was recorded at a batch size of 8. When the batch size was increased to 128, the accuracy dropped to 85.72%. We attribute this experimental result to the distinct data distribution patterns learned by the model under different batch sizes. Smaller batch sizes are more susceptible to overfitting due to the relatively limited amount of data processed at each interval. Conversely, excessively large batch sizes tend to complicate the model training process, leading to inefficient learning. Based on our analysis of the Symptom dataset, we conclude that a batch size of 64 represents the most favorable parameter for the model, striking a balance between overfitting and training efficiency.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p>The effect of batch size on experimental results.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1519280-g0004.tif"/>
</fig>
</sec>
<sec>
<title>4.5.2 Learning rate</title>
<p>When undertaking the task of medical short text classification, the learning rate parameter plays a crucial role in the optimization process, as it governs the magnitude of weight adjustments during training. The selection of an appropriate learning rate is of utmost importance, given its profound influence on the model&#x00027;s performance. Our analysis of the experimental results, as shown in <xref ref-type="fig" rid="F5">Figure 5</xref>, confirms that the model&#x00027;s performance varies under the influence of the learning rate, with optimal results achieved at a learning rate of 0.00003, reaching an accuracy of 86.91%. Conversely, when the learning rate is set to 0.00004, the model&#x00027;s accuracy is the lowest, measuring at 85.19%.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>The effect of learning rate on experimental results.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1519280-g0005.tif"/>
</fig>
<p>Achieving optimal performance necessitates the careful selection of an appropriate learning rate. However, it is essential to acknowledge that disparate tasks and datasets may require different learning rates. Hence, conducting experimental testing is imperative to determine the ideal parameter values that yield the best results for a specific task and dataset.</p>
</sec>
</sec>
<sec>
<title>4.6 Ablation experiment</title>
<p>To better exemplify the efficacy of our MSP, we conducted ablation experiments to obtain verbalizers that were introduced at varying degrees. Specifically, instead of engaging the two strategies, Concepts Retrieval (CR) and Context Information (CI), concurrently to construct the verbalizer, we incrementally incorporated the strategies and evaluated the experimental performance under 20-shot conditions on the two datasets. As <xref ref-type="fig" rid="F6">Figure 6</xref> evinces, the accuracy of the experiments improved to a certain extent with the integration of diverse strategies. For instance, in the Symptom dataset, the experimental results surged from 84.24% to 86.91%, representing an increase of 2.67 percentage points, and there was discernible progress made in the Gynecology dataset as well. The amalgamation of various strategies detailed in this paper yields remarkable outcomes.</p>
<fig id="F6" position="float">
<label>Figure 6</label>
<caption><p>The results of the ablation experiment.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fmed-12-1519280-g0006.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="conclusions" id="s5">
<title>5 Conclusions</title>
<p>This paper introduces a novel method for medical short text classification with soft prompt-tuning (short for MSP). MSP is proposed to address the problems of professional medical vocabulary and complex medical measures, and it achieved excellent performance even in few-shot scenarios. The soft prompt in MSP comprises the vectors of the MASK and the input sentence learned by BERT, and the soft tokens learned by BiLSTM. Furthermore, two strategies including concept retrieval and context information are adopted for verbalizer construction. Extensive experiments validated the effectiveness of our MSP compared to other neural networks, fine-tuned PLMs, and prompt-tuning methods.</p>
<p>In the future, we will extend the research work of medical short text classification in the following two directions. Firstly, we aim to explore adaptive prompt-tuning techniques that automatically adjust based on data distribution, user intent, or medical domain variations. Secondly, we plan to incorporate more auxiliary information including medical ontologies, clinical guidelines, or electronic health records (EHRs) for improving context understanding and classification accuracy.</p>
</sec>
<sec id="s6">
<title>6 Limitations</title>
<p>In this paper, we have validated the effectiveness of our method on medical short text classification. However, two limitations remain. Firstly, the constructed verbalizer has not been validated by domain experts or automated knowledge bases; future work should investigate strategies to address this problem. Secondly, the optimization of soft prompts requires additional computational resources compared to traditional hand-crafted prompt-tuning methods. While our method reduces manual effort, further optimizations could be explored to improve computational efficiency.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>XX: Conceptualization, Methodology, Project administration, Software, Writing &#x02013; original draft. HW: Formal analysis, Methodology, Software, Visualization, Writing &#x02013; review &#x00026; editing. FJ: Data curation, Formal analysis, Supervision, Writing &#x02013; review &#x00026; editing. TQ: Software, Validation, Writing &#x02013; review &#x00026; editing. WW: Conceptualization, Investigation, Project administration, Resources, Supervision, Validation, Writing &#x02013; original draft.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This research was partially supported by the Open Project Program of Key Laboratory of Big Data Analysis and Knowledge Services, Yangzhou City, Yangzhou University (YBK202205) and the Open Project Program of Key Laboratory of Knowledge Engineering with Big Data (the Ministry of Education of China).</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declare that no Gen AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<fn-group>
<fn id="fn0001"><p><sup>1</sup><ext-link ext-link-type="uri" xlink:href="https://concept.research.microsoft.com/">https://concept.research.microsoft.com/</ext-link></p></fn>
</fn-group>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Song</surname> <given-names>G</given-names></name> <name><surname>Ye</surname> <given-names>Y</given-names></name> <name><surname>Du</surname> <given-names>X</given-names></name> <name><surname>Huang</surname> <given-names>X</given-names></name> <name><surname>Bie</surname> <given-names>S</given-names></name></person-group>. <article-title>Short text classification: a survey</article-title>. <source>J Multimed</source>. (<year>2014</year>) <volume>9</volume>:<fpage>635</fpage>&#x02013;<lpage>43</lpage>. <pub-id pub-id-type="doi">10.4304/jmm.9.5.635-643</pub-id></citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Q</given-names></name> <name><surname>Yuan</surname> <given-names>Q</given-names></name> <name><surname>Lv</surname> <given-names>P</given-names></name> <name><surname>Zhang</surname> <given-names>M</given-names></name> <name><surname>Lv</surname> <given-names>L</given-names></name></person-group>. <article-title>Research on medical text classification based on improved capsule network</article-title>. <source>Electronics</source>. (<year>2022</year>) <volume>11</volume>:<fpage>2229</fpage>. <pub-id pub-id-type="doi">10.3390/electronics11142229</pub-id><pub-id pub-id-type="pmid">34330264</pub-id></citation></ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>X</given-names></name> <name><surname>Cui</surname> <given-names>M</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Bai</surname> <given-names>R</given-names></name> <name><surname>Lu</surname> <given-names>Z</given-names></name> <name><surname>Aickelin</surname> <given-names>U</given-names></name> <etal/></person-group>. <article-title>A hybrid medical text classification framework: integrating attentive rule construction and neural network</article-title>. <source>Neurocomputing</source>. (<year>2021</year>) <volume>443</volume>:<fpage>345</fpage>&#x02013;<lpage>55</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2021.02.069</pub-id></citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Huang</surname> <given-names>Q</given-names></name> <name><surname>Ren</surname> <given-names>S</given-names></name> <name><surname>Jiang</surname> <given-names>L</given-names></name> <name><surname>Deng</surname> <given-names>B</given-names></name> <name><surname>Qin</surname> <given-names>Y</given-names></name> <etal/></person-group>. <article-title>A novel medical text classification model with Kalman filter for clinical decision making</article-title>. <source>Biomed Signal Process Control</source>. (<year>2023</year>) <volume>82</volume>:<fpage>104503</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2022.104503</pub-id></citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qing</surname> <given-names>L</given-names></name> <name><surname>Linhong</surname> <given-names>W</given-names></name> <name><surname>Xuehai</surname> <given-names>D</given-names></name></person-group>. <article-title>A novel neural network-based method for medical text classification</article-title>. <source>Future Internet</source>. (<year>2019</year>) <volume>11</volume>:<fpage>255</fpage>. <pub-id pub-id-type="doi">10.3390/fi11120255</pub-id></citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mao</surname> <given-names>S</given-names></name> <name><surname>Zhang</surname> <given-names>LL</given-names></name> <name><surname>Guan</surname> <given-names>ZG</given-names></name></person-group>. <article-title>An LSTM&#x00026;Topic-CNN model for classification of online Chinese medical questions</article-title>. <source>IEEE Access</source>. (<year>2021</year>) <volume>9</volume>:<fpage>52580</fpage>&#x02013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2021.3070375</pub-id></citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname> <given-names>X</given-names></name> <name><surname>Li</surname> <given-names>Y</given-names></name> <name><surname>Liang</surname> <given-names>W</given-names></name></person-group>. <article-title>CNN-RNN based intelligent recommendation for online medical pre-diagnosis support</article-title>. <source>IEEE/ACM Trans Comput Biol Bioinform</source>. (<year>2020</year>) <volume>18</volume>:<fpage>912</fpage>&#x02013;<lpage>21</lpage>. <pub-id pub-id-type="doi">10.1109/TCBB.2020.2994780</pub-id><pub-id pub-id-type="pmid">32750846</pub-id></citation></ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Beeksma</surname> <given-names>M</given-names></name> <name><surname>Verberne</surname> <given-names>S</given-names></name> <name><surname>van den Bosch</surname> <given-names>A</given-names></name> <name><surname>Das</surname> <given-names>E</given-names></name> <name><surname>Hendrickx</surname> <given-names>I</given-names></name> <name><surname>Groenewoud</surname> <given-names>S</given-names></name></person-group>. <article-title>Predicting life expectancy with a long short-term memory recurrent neural network using electronic medical records</article-title>. <source>BMC Med Inform Decis Mak</source>. (<year>2019</year>) <volume>19</volume>:<fpage>1</fpage>&#x02013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1186/s12911-019-0775-2</pub-id><pub-id pub-id-type="pmid">30819172</pub-id></citation></ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>D</given-names></name> <name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Li</surname> <given-names>P</given-names></name></person-group>. <article-title>Representation learning for question classification via topic sparse autoencoder and entity embedding</article-title>. In: <source>2018 IEEE International Conference on Big Data (Big Data)</source>. (<year>2018</year>). p. <fpage>126</fpage>&#x02013;<lpage>133</lpage>. <pub-id pub-id-type="doi">10.1109/BigData.2018.8622331</pub-id></citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Song</surname> <given-names>Z</given-names></name> <name><surname>Xie</surname> <given-names>Y</given-names></name> <name><surname>Huang</surname> <given-names>W</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name></person-group>. <article-title>Classification of traditional Chinese medicine cases based on character-level bert and deep learning</article-title>. In: <source>IEEE 8th Joint International Information Technology and Artificial Intelligence Conference (ITAIC)</source>. (<year>2019</year>). p. <fpage>1383</fpage>&#x02013;<lpage>1387</lpage>. <pub-id pub-id-type="doi">10.1109/ITAIC.2019.8785612</pub-id></citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Devlin</surname> <given-names>J</given-names></name> <name><surname>Chang</surname> <given-names>MW</given-names></name> <name><surname>Lee</surname> <given-names>K</given-names></name> <name><surname>Toutanova</surname> <given-names>K</given-names></name></person-group>. <article-title>Bert: pre-training of deep bidirectional transformers for language understanding</article-title>. In: <source>Proceedings of the conference of the North American chapter of the association for computational linguistics: human language technologies, volume 1 (long and short papers).</source> (<year>2019</year>). p. <fpage>4171</fpage>&#x02013;<lpage>4186</lpage>.</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Y</given-names></name> <name><surname>Ott</surname> <given-names>M</given-names></name> <name><surname>Goyal</surname> <given-names>N</given-names></name> <name><surname>Du</surname> <given-names>J</given-names></name> <name><surname>Joshi</surname> <given-names>M</given-names></name> <name><surname>Chen</surname> <given-names>D</given-names></name> <etal/></person-group>. <article-title>Roberta: a robustly optimized BERT pretraining approach</article-title>. <source>arXiv preprint arXiv:1907.11692</source>. (<year>2019</year>).</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Raffel</surname> <given-names>C</given-names></name> <name><surname>Shazeer</surname> <given-names>N</given-names></name> <name><surname>Roberts</surname> <given-names>A</given-names></name> <name><surname>Lee</surname> <given-names>K</given-names></name> <name><surname>Narang</surname> <given-names>S</given-names></name> <name><surname>Matena</surname> <given-names>M</given-names></name> <etal/></person-group>. <article-title>Exploring the limits of transfer learning with a unified text-to-text transformer</article-title>. <source>J Mach Learn Res</source>. (<year>2020</year>) <volume>21</volume>:<fpage>5485</fpage>&#x02013;<lpage>551</lpage>. <pub-id pub-id-type="doi">10.5555/3455716.3455856</pub-id></citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Floridi</surname> <given-names>L</given-names></name> <name><surname>Chiriatti</surname> <given-names>M</given-names></name></person-group>. <article-title>GPT-3: Its nature, scope, limits, and consequences</article-title>. <source>Minds Mach</source>. (<year>2020</year>) <volume>30</volume>:<fpage>681</fpage>&#x02013;<lpage>94</lpage>. <pub-id pub-id-type="doi">10.1007/s11023-020-09548-1</pub-id></citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>H</given-names></name> <name><surname>Zhengyan</surname> <given-names>Z</given-names></name> <name><surname>Ning</surname> <given-names>D</given-names></name> <name><surname>Yuxian</surname> <given-names>G</given-names></name> <name><surname>Xiao</surname> <given-names>L</given-names></name> <name><surname>Yuqi</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>Pre-trained models: past, present and future</article-title>. <source>AI Open</source>. (<year>2021</year>) <volume>2</volume>:<fpage>225</fpage>&#x02013;<lpage>50</lpage>. <pub-id pub-id-type="doi">10.1016/j.aiopen.2021.08.002</pub-id></citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname> <given-names>N</given-names></name> <name><surname>Hu</surname> <given-names>S</given-names></name> <name><surname>Zhao</surname> <given-names>W</given-names></name> <name><surname>Chen</surname> <given-names>Y</given-names></name> <name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Zheng</surname> <given-names>H</given-names></name> <etal/></person-group>. <article-title>OpenPrompt: an open-source framework for prompt-learning</article-title>. In: <source>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics: System Demonstrations</source>. (<year>2022</year>). p. <fpage>105</fpage>&#x02013;<lpage>113</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2022.acl-demo.10</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brown</surname> <given-names>TB</given-names></name> <name><surname>Mann</surname> <given-names>B</given-names></name> <name><surname>Ryder</surname> <given-names>N</given-names></name> <name><surname>Subbiah</surname> <given-names>M</given-names></name> <name><surname>Kaplan</surname> <given-names>J</given-names></name> <name><surname>Dhariwal</surname> <given-names>P</given-names></name> <etal/></person-group>. <article-title>Language models are few-shot learners</article-title>. In: <source>Neural Information Processing Systems</source>. (<year>2020</year>). p. <fpage>1877</fpage>&#x02013;<lpage>1901</lpage>.</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>XL</given-names></name> <name><surname>Liang</surname> <given-names>P</given-names></name></person-group>. <article-title>Prefix-tuning: optimizing continuous prompts for generation</article-title>. In: <source>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</source>. (<year>2021</year>). p. <fpage>4582</fpage>&#x02013;<lpage>4597</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.353</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schick</surname> <given-names>T</given-names></name> <name><surname>Schmid</surname> <given-names>H</given-names></name> <name><surname>Sch&#x000FC;tze</surname> <given-names>H</given-names></name></person-group>. <article-title>Automatically identifying words that can serve as labels for few-shot text classification</article-title>. In: <source>Proceedings of the 28th International Conference on Computational Linguistics</source>. (<year>2020</year>). p. <fpage>5569</fpage>&#x02013;<lpage>5578</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2020.coling-main.488</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>C</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Weng</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>B</given-names></name> <name><surname>Li</surname> <given-names>Z</given-names></name></person-group>. <article-title>Natural language processing applications for computer-aided diagnosis in oncology</article-title>. <source>Diagnostics</source>. (<year>2023</year>) <volume>13</volume>:<fpage>286</fpage>. <pub-id pub-id-type="doi">10.3390/diagnostics13020286</pub-id><pub-id pub-id-type="pmid">36673096</pub-id></citation></ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Park</surname> <given-names>S</given-names></name> <name><surname>Kim-Knauss</surname> <given-names>Y</given-names></name> <name><surname>Sim</surname> <given-names>J</given-names></name></person-group>. <article-title>Leveraging text mining approach to identify what people want to know about mental disorders from online inquiry platforms</article-title>. <source>Front Public Health</source>. (<year>2021</year>) <volume>9</volume>:<fpage>759802</fpage>. <pub-id pub-id-type="doi">10.3389/fpubh.2021.759802</pub-id><pub-id pub-id-type="pmid">34712643</pub-id></citation></ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>C</given-names></name> <name><surname>Qu</surname> <given-names>Y</given-names></name> <name><surname>Jin</surname> <given-names>B</given-names></name> <name><surname>Guo</surname> <given-names>L</given-names></name> <name><surname>Li</surname> <given-names>C</given-names></name> <name><surname>Cui</surname> <given-names>W</given-names></name> <etal/></person-group>. <article-title>A convolutional neural network model for online medical guidance</article-title>. <source>IEEE Access</source>. (<year>2016</year>) <volume>4</volume>:<fpage>4094</fpage>&#x02013;<lpage>103</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2016.2594839</pub-id></citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>Y</given-names></name></person-group>. <article-title>Convolutional neural networks for sentence classification</article-title>. In: <source>Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)</source>. (<year>2014</year>). p. <fpage>1746</fpage>&#x02013;<lpage>1751</lpage>. <pub-id pub-id-type="doi">10.3115/v1/D14-1181</pub-id></citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>Q</given-names></name> <name><surname>Zhang</surname> <given-names>J</given-names></name> <name><surname>Huang</surname> <given-names>J</given-names></name> <name><surname>Lu</surname> <given-names>Y</given-names></name> <name><surname>Lei</surname> <given-names>K</given-names></name></person-group>. <article-title>Improving medical short text classification with semantic expansion using word-cluster embedding</article-title>. In: <source>Information Science and Applications (ICISA)</source>. (<year>2019</year>). p. <fpage>401</fpage>&#x02013;<lpage>411</lpage>. <pub-id pub-id-type="doi">10.1007/978-981-13-1056-0_41</pub-id></citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lan</surname> <given-names>Z</given-names></name> <name><surname>Chen</surname> <given-names>M</given-names></name> <name><surname>Goodman</surname> <given-names>S</given-names></name> <name><surname>Gimpel</surname> <given-names>K</given-names></name> <name><surname>Sharma</surname> <given-names>P</given-names></name> <name><surname>Soricut</surname> <given-names>R</given-names></name></person-group>. <article-title>ALBERT: a lite BERT for self-supervised learning of language representations</article-title>. In: <source>International Conference on Learning Representations</source>. (<year>2020</year>).</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>He</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>C</given-names></name> <name><surname>Zhang</surname> <given-names>S</given-names></name> <name><surname>Li</surname> <given-names>N</given-names></name> <name><surname>Li</surname> <given-names>Z</given-names></name> <name><surname>Zeng</surname> <given-names>Z</given-names></name></person-group>. <article-title>KG-MTT-BERT: knowledge graph enhanced BERT for multi-type medical text classification</article-title>. <source>arXiv preprint arXiv:2210.03970</source>. (<year>2022</year>).</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gasmi</surname> <given-names>K</given-names></name></person-group>. <article-title>Improving BERT-based model for medical text classification with an optimization algorithm</article-title>. In: <source>International Conference on Computational Collective Intelligence</source>. (<year>2022</year>). p. <fpage>101</fpage>&#x02013;<lpage>111</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-16210-7_8</pub-id></citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cai</surname> <given-names>F</given-names></name> <name><surname>Ye</surname> <given-names>H</given-names></name></person-group>. <article-title>Chinese medical text classification with RoBERTa</article-title>. In: <source>International Symposium on Biomedical and Computational Biology</source>. (<year>2022</year>). p. <fpage>223</fpage>&#x02013;<lpage>236</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-25191-7_17</pub-id><pub-id pub-id-type="pmid">35451969</pub-id></citation></ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Peng</surname> <given-names>Z</given-names></name> <name><surname>Zhang</surname> <given-names>F</given-names></name> <name><surname>Zhou</surname> <given-names>L</given-names></name> <name><surname>Yang</surname> <given-names>F</given-names></name></person-group>. <article-title>Medical text classification based on the discriminative pre-training model and prompt-tuning</article-title>. <source>Digital Health</source>. (<year>2023</year>) <volume>9</volume>:<fpage>20552076231193213</fpage>. <pub-id pub-id-type="doi">10.1177/20552076231193213</pub-id><pub-id pub-id-type="pmid">37559830</pub-id></citation></ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Zhao</surname> <given-names>J</given-names></name> <name><surname>Gao</surname> <given-names>H</given-names></name></person-group>. <article-title>A prompt tuning method for Chinese medical text classification</article-title>. In: <source>International Conference on Advanced Data Mining and Applications</source>. (<year>2023</year>). p. <fpage>151</fpage>&#x02013;<lpage>166</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-46671-7_11</pub-id></citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>P</given-names></name> <name><surname>Yuan</surname> <given-names>W</given-names></name> <name><surname>Fu</surname> <given-names>J</given-names></name> <name><surname>Jiang</surname> <given-names>Z</given-names></name> <name><surname>Hayashi</surname> <given-names>H</given-names></name> <name><surname>Neubig</surname> <given-names>G</given-names></name></person-group>. <article-title>Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing</article-title>. <source>ACM Comput Surv</source>. (<year>2023</year>) <volume>55</volume>:<fpage>1</fpage>&#x02013;<lpage>35</lpage>. <pub-id pub-id-type="doi">10.1145/3560815</pub-id></citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>X</given-names></name> <name><surname>Zhang</surname> <given-names>N</given-names></name> <name><surname>Xie</surname> <given-names>X</given-names></name> <name><surname>Deng</surname> <given-names>S</given-names></name> <name><surname>Yao</surname> <given-names>Y</given-names></name> <name><surname>Tan</surname> <given-names>C</given-names></name> <etal/></person-group>. <article-title>Knowprompt: knowledge-aware prompt-tuning with synergistic optimization for relation extraction</article-title>. In: <source>Proceedings of the ACM Web Conference</source>. (<year>2022</year>). p. <fpage>2778</fpage>&#x02013;<lpage>2788</lpage>. <pub-id pub-id-type="doi">10.1145/3485447.3511998</pub-id></citation>
</ref>
<ref id="B33">
<label>33.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khashabi</surname> <given-names>D</given-names></name> <name><surname>Min</surname> <given-names>S</given-names></name> <name><surname>Khot</surname> <given-names>T</given-names></name> <name><surname>Sabharwal</surname> <given-names>A</given-names></name> <name><surname>Tafjord</surname> <given-names>O</given-names></name> <name><surname>Clark</surname> <given-names>P</given-names></name> <etal/></person-group>. <article-title>UNIFIEDQA: crossing format boundaries with a single QA system</article-title>. In: <source>Findings of the Association for Computational Linguistics (EMNLP)</source>. (<year>2020</year>). p. <fpage>1896</fpage>&#x02013;<lpage>1907</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2020.findings-emnlp.171</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B34">
<label>34.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>H</given-names></name> <name><surname>Song</surname> <given-names>D</given-names></name></person-group>. <article-title>DisCup: discriminator cooperative unlikelihood prompt-tuning for controllable text generation</article-title>. In: <source>Proceedings of the Conference on Empirical Methods in Natural Language Processing</source>. (<year>2022</year>). p. <fpage>3392</fpage>&#x02013;<lpage>3406</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.223</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B35">
<label>35.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>Qiang</surname> <given-names>J</given-names></name> <name><surname>Wu</surname> <given-names>X</given-names></name></person-group>. <article-title>Prompt-learning for short text classification</article-title>. <source>IEEE Trans Knowl Data Eng</source>. (<year>2024</year>) <volume>36</volume>:<fpage>5328</fpage>&#x02013;<lpage>39</lpage>. <pub-id pub-id-type="doi">10.1109/TKDE.2023.3332787</pub-id></citation>
</ref>
<ref id="B36">
<label>36.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>X</given-names></name> <name><surname>Zhao</surname> <given-names>W</given-names></name> <name><surname>Ding</surname> <given-names>N</given-names></name> <name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Sun</surname> <given-names>M</given-names></name></person-group>. <article-title>Ptr: prompt tuning with rules for text classification</article-title>. <source>AI Open</source>. (<year>2022</year>) <volume>3</volume>:<fpage>182</fpage>&#x02013;<lpage>92</lpage>. <pub-id pub-id-type="doi">10.1016/j.aiopen.2022.11.003</pub-id></citation>
</ref>
<ref id="B37">
<label>37.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wen</surname> <given-names>Y</given-names></name> <name><surname>Jain</surname> <given-names>N</given-names></name> <name><surname>Kirchenbauer</surname> <given-names>J</given-names></name> <name><surname>Goldblum</surname> <given-names>M</given-names></name> <name><surname>Geiping</surname> <given-names>J</given-names></name> <name><surname>Goldstein</surname> <given-names>T</given-names></name></person-group>. <article-title>Hard prompts made easy: gradient-based discrete optimization for prompt tuning and discovery</article-title>. In: <source>Thirty-seventh Conference on Neural Information Processing Systems</source>. (<year>2023</year>).</citation>
</ref>
<ref id="B38">
<label>38.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lester</surname> <given-names>B</given-names></name> <name><surname>Al-Rfou</surname> <given-names>R</given-names></name> <name><surname>Constant</surname> <given-names>N</given-names></name></person-group>. <article-title>The power of scale for parameter-efficient prompt tuning</article-title>. In: <source>Proceedings of the Conference on Empirical Methods in Natural Language Processing</source>. (<year>2021</year>). p. <fpage>3045</fpage>&#x02013;<lpage>3059</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-main.243</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B39">
<label>39.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qin</surname> <given-names>G</given-names></name> <name><surname>Eisner</surname> <given-names>J</given-names></name></person-group>. <article-title>Learning how to ask: querying LMs with mixtures of soft prompts</article-title>. In: <source>Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>. (<year>2021</year>). p. <fpage>5203</fpage>&#x02013;<lpage>5212</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.410</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B40">
<label>40.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>X</given-names></name> <name><surname>Ji</surname> <given-names>K</given-names></name> <name><surname>Fu</surname> <given-names>Y</given-names></name> <name><surname>Tam</surname> <given-names>W</given-names></name> <name><surname>Du</surname> <given-names>Z</given-names></name> <name><surname>Yang</surname> <given-names>Z</given-names></name> <etal/></person-group>. <article-title>P-tuning: prompt tuning can be comparable to fine-tuning across scales and tasks</article-title>. In: <source>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)</source>. (<year>2022</year>). p. <fpage>61</fpage>&#x02013;<lpage>68</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2022.acl-short.8</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B41">
<label>41.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vu</surname> <given-names>T</given-names></name> <name><surname>Lester</surname> <given-names>B</given-names></name> <name><surname>Constant</surname> <given-names>N</given-names></name> <name><surname>Al-Rfou&#x00027;</surname> <given-names>R</given-names></name> <name><surname>Cer</surname> <given-names>D</given-names></name></person-group>. <article-title>SPoT: better frozen model adaptation through soft prompt transfer</article-title>. In: <source>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>. (<year>2022</year>). p. <fpage>5039</fpage>&#x02013;<lpage>5059</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.346</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B42">
<label>42.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Schick</surname> <given-names>T</given-names></name> <name><surname>Sch&#x000FC;tze</surname> <given-names>H</given-names></name></person-group>. <article-title>Exploiting cloze-questions for few-shot text classification and natural language inference</article-title>. In: <source>Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume</source>. (<year>2021</year>). p. <fpage>255</fpage>&#x02013;<lpage>269</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2021.eacl-main.20</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B43">
<label>43.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gao</surname> <given-names>T</given-names></name> <name><surname>Fisch</surname> <given-names>A</given-names></name> <name><surname>Chen</surname> <given-names>D</given-names></name></person-group>. <article-title>Making pre-trained language models better few-shot learners</article-title>. In: <source>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</source>. (<year>2021</year>). p. <fpage>3816</fpage>&#x02013;<lpage>3830</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.295</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B44">
<label>44.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Meng</surname> <given-names>Y</given-names></name> <name><surname>Zhang</surname> <given-names>Y</given-names></name> <name><surname>Huang</surname> <given-names>J</given-names></name> <name><surname>Xiong</surname> <given-names>C</given-names></name> <name><surname>Ji</surname> <given-names>H</given-names></name> <name><surname>Zhang</surname> <given-names>C</given-names></name> <etal/></person-group>. <article-title>Text classification using label names only: a language model self-training approach</article-title>. In: <source>Proceedings of the Conference on Empirical Methods in Natural Language Processing</source>. (<year>2020</year>). p. <fpage>9006</fpage>&#x02013;<lpage>9017</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.724</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B45">
<label>45.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>S</given-names></name> <name><surname>Ding</surname> <given-names>N</given-names></name> <name><surname>Wang</surname> <given-names>H</given-names></name> <name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Wang</surname> <given-names>J</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name> <etal/></person-group>. <article-title>Knowledgeable prompt-tuning: incorporating knowledge into prompt verbalizer for text classification</article-title>. In: <source>Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</source>. (<year>2022</year>). p. <fpage>2225</fpage>&#x02013;<lpage>2240</lpage>. <pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.158</pub-id><pub-id pub-id-type="pmid">36568019</pub-id></citation></ref>
<ref id="B46">
<label>46.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>Y</given-names></name> <name><surname>Mo</surname> <given-names>T</given-names></name> <name><surname>Jiang</surname> <given-names>Y</given-names></name> <name><surname>Li</surname> <given-names>W</given-names></name> <name><surname>Zhao</surname> <given-names>W</given-names></name></person-group>. <article-title>Eliciting knowledge from pretrained language models for prototypical prompt verbalizer</article-title>. In: <source>International Conference on Artificial Neural Networks</source>. (<year>2022</year>). p. <fpage>222</fpage>&#x02013;<lpage>233</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-031-15931-2_19</pub-id></citation>
</ref>
<ref id="B47">
<label>47.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname> <given-names>Q</given-names></name> <name><surname>Wang</surname> <given-names>Y</given-names></name> <name><surname>You</surname> <given-names>T</given-names></name> <name><surname>Lu</surname> <given-names>Y</given-names></name></person-group>. <article-title>BioKnowPrompt: incorporating imprecise knowledge into prompt-tuning verbalizer with biomedical text for relation extraction</article-title>. <source>Inf Sci</source>. (<year>2022</year>) <volume>617</volume>:<fpage>346</fpage>&#x02013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2022.10.063</pub-id></citation>
</ref>
<ref id="B48">
<label>48.</label>
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>Y</given-names></name> <name><surname>Wang</surname> <given-names>S</given-names></name> <name><surname>Feng</surname> <given-names>S</given-names></name> <name><surname>Ding</surname> <given-names>S</given-names></name> <name><surname>Pang</surname> <given-names>C</given-names></name> <name><surname>Shang</surname> <given-names>J</given-names></name> <etal/></person-group>. <article-title>Ernie 3.0: Large-scale knowledge enhanced pre-training for language understanding and generation</article-title>. <source>arXiv preprint arXiv:2107.02137</source>. (<year>2021</year>).</citation>
</ref>
</ref-list>
</back>
</article>