<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2024.1514896</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Artificial Intelligence</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Evaluating accuracy and reproducibility of large language model performance on critical care assessments in pharmacy education</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Yang</surname> <given-names>Huibo</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2930104/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Hu</surname> <given-names>Mengxuan</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c002"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2874886/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Most</surname> <given-names>Amoreena</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Hawkins</surname> <given-names>W. Anthony</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2913914/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Murray</surname> <given-names>Brian</given-names></name>
<xref ref-type="aff" rid="aff5"><sup>5</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2929142/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Smith</surname> <given-names>Susan E.</given-names></name>
<xref ref-type="aff" rid="aff6"><sup>6</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname> <given-names>Sheng</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Sikora</surname> <given-names>Andrea</given-names></name>
<xref ref-type="aff" rid="aff7"><sup>7</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/1394948/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Computer Science, University of Virginia</institution>, <addr-line>Charlottesville, VA</addr-line>, <country>United States</country></aff>
<aff id="aff2"><sup>2</sup><institution>School of Data Science, University of Virginia</institution>, <addr-line>Charlottesville, VA</addr-line>, <country>United States</country></aff>
<aff id="aff3"><sup>3</sup><institution>University of Georgia College of Pharmacy</institution>, <addr-line>Augusta, GA</addr-line>, <country>United States</country></aff>
<aff id="aff4"><sup>4</sup><institution>Department of Clinical and Administrative Pharmacy, University of Georgia College of Pharmacy</institution>, <addr-line>Albany, GA</addr-line>, <country>United States</country></aff>
<aff id="aff5"><sup>5</sup><institution>University of Colorado Skaggs Schools of Pharmacy and Pharmaceutical Sciences</institution>, <addr-line>Aurora, CO</addr-line>, <country>United States</country></aff>
<aff id="aff6"><sup>6</sup><institution>Department of Clinical and Administrative Pharmacy, University of Georgia College of Pharmacy</institution>, <addr-line>Athens, GA</addr-line>, <country>United States</country></aff>
<aff id="aff7"><sup>7</sup><institution>Department of Clinical and Administrative Pharmacy, University of Georgia College of Pharmacy</institution>, <addr-line>Augusta, GA</addr-line>, <country>United States</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: Kezhi Li, University College London, United Kingdom</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: Jose Amilcar Rizzo Sierra, Polytechnic University of Quer&#x00E9;taro, Mexico</p>
<p>Murat Kirisci, Istanbul University-Cerrahpasa, T&#x00FC;rkiye</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Huibo Yang, <email>aqf7bf@virginia.edu</email></corresp>
<corresp id="c002">Mengxuan Hu, <email>qtq7su@virginia.edu</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>09</day>
<month>01</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>7</volume>
<elocation-id>1514896</elocation-id>
<history>
<date date-type="received">
<day>21</day>
<month>10</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>12</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2025 Yang, Hu, Most, Hawkins, Murray, Smith, Li and Sikora.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Yang, Hu, Most, Hawkins, Murray, Smith, Li and Sikora</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<sec id="sec1">
<title>Background</title>
<p>Large language models (LLMs) have demonstrated impressive performance on medical licensing and diagnosis-related exams. However, comparative evaluations to optimize LLM performance and ability in the domain of comprehensive medication management (CMM) are lacking. The purpose of this evaluation was to test various LLM performance optimization strategies and performance on critical care pharmacotherapy questions used in the assessment of Doctor of Pharmacy students.</p>
</sec>
<sec id="sec2">
<title>Methods</title>
<p>In a comparative analysis using 219 multiple-choice pharmacotherapy questions, five LLMs (GPT-3.5, GPT-4, Claude 2, Llama2-7b and 2-13b) were evaluated. Each LLM was queried five times to evaluate the primary outcome of accuracy (i.e., correctness). Secondary outcomes included variance, the impact of prompt engineering techniques (e.g., chain-of-thought, CoT) and training of a customized GPT on performance, and comparison to third-year Doctor of Pharmacy students on knowledge recall vs. knowledge application questions. Accuracy and variance under different model settings were compared using Student&#x2019;s t-test.</p>
</sec>
<sec id="sec3">
<title>Results</title>
<p>ChatGPT-4 exhibited the highest accuracy (71.6%), while Llama2-13b had the lowest variance (0.070). All LLMs performed more accurately on knowledge recall vs. knowledge application questions (e.g., ChatGPT-4: 87% vs. 67%). When applied to ChatGPT-4, few-shot CoT across five runs improved accuracy (77.4% vs. 71.5%) with no effect on variance. Self-consistency and the custom-trained GPT demonstrated similar accuracy to ChatGPT-4 with few-shot CoT. Overall pharmacy student accuracy was 81%, compared to an optimal overall LLM accuracy of 73%. Comparing question types, six of the LLMs demonstrated equivalent or higher accuracy than pharmacy students on knowledge recall questions (e.g., self-consistency vs. students: 93% vs. 84%), but pharmacy students achieved higher accuracy than all LLMs on knowledge application questions (e.g., self-consistency vs. students: 68% vs. 80%).</p>
</sec>
<sec id="sec4">
<title>Conclusion</title>
<p>ChatGPT-4 was the most accurate LLM on critical care pharmacy questions and few-shot CoT improved accuracy the most. Average student accuracy was similar to LLMs overall, and higher on knowledge application questions. These findings support the need for future assessment of customized training for the type of output needed. Reliance on LLMs is only supported with recall-based questions.</p>
</sec>
</abstract>
<kwd-group>
<kwd>large language model</kwd>
<kwd>artificial intelligence</kwd>
<kwd>pharmacy</kwd>
<kwd>education</kwd>
<kwd>critical care</kwd>
<kwd>medical education</kwd>
<kwd>higher education</kwd>
<kwd>machine learning</kwd>
</kwd-group>
<counts>
<fig-count count="1"/>
<table-count count="7"/>
<equation-count count="0"/>
<ref-count count="41"/>
<page-count count="8"/>
<word-count count="5469"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Medicine and Public Health</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec5">
<title>Introduction</title>
<p>Large language models (LLMs) have shown remarkable abilities in the medical domain, including diagnosing complex patient cases (<xref ref-type="bibr" rid="ref16">Kanjee et al., 2023</xref>), creating and summarizing patient notes (<xref ref-type="bibr" rid="ref15">Hu et al., 2024</xref>), and generating personalized treatment plans (<xref ref-type="bibr" rid="ref3">Benary et al., 2023</xref>); however, these tasks have largely focused on more structured diagnostic problems with clearly delineated correct and incorrect answers (<xref ref-type="bibr" rid="ref28">Sallam, 2023</xref>; <xref ref-type="bibr" rid="ref8">Chowdhery et al., 2022</xref>; <xref ref-type="bibr" rid="ref4">Bommasani et al., 2023</xref>; <xref ref-type="bibr" rid="ref39">Yang et al., 2023</xref>). Comprehensive medication management (CMM) poses a more unstructured problem where multiple appropriate courses of action may be available, requiring clinicians, including trainees, to weigh known risks and benefits of medications as a component of a shared decision making model (<xref ref-type="bibr" rid="ref16">Kanjee et al., 2023</xref>; <xref ref-type="bibr" rid="ref15">Hu et al., 2024</xref>). Importantly, the poly-prescribing of multiple medications (critically ill patients average 13&#x2013;20 medications at any given time) increases the risk of adverse drug events (ADEs) and medication errors (<xref ref-type="bibr" rid="ref27">Raffel et al., 2019</xref>; <xref ref-type="bibr" rid="ref29">Sikora, 2023</xref>). Given that each year it is estimated that 4 billion prescriptions are dispensed in the United States alone and that medication errors are thought to occur daily for critically ill patients, CMM performed by highly trained clinicians is essential for safe and efficacious medication use. Indeed, it has been shown that critical care pharmacists reduce adverse drug events (ADEs) by 70% (<xref ref-type="bibr" rid="ref32">Tariq et al., 2024</xref>).</p>
<p>LLMs may be an important tool towards making medication use safer; however, the testing of LLMs for CMM has only just begun (<xref ref-type="bibr" rid="ref28">Sallam, 2023</xref>; <xref ref-type="bibr" rid="ref8">Chowdhery et al., 2022</xref>; <xref ref-type="bibr" rid="ref4">Bommasani et al., 2023</xref>). Thus far, LLMs have been tested for deprescribing benzodiazepines, identifying drug-herb interactions, predicting outcomes from medication regimens, and performance on a national pharmacist examination, showing early promise but also concerning rates of hallucinations and inaccurate information (<xref ref-type="bibr" rid="ref7">Bu&#x017E;an&#x010D;i&#x0107; et al., 2024</xref>; <xref ref-type="bibr" rid="ref14">Hsu et al., 2023</xref>; <xref ref-type="bibr" rid="ref18">Kunitsu, 2023</xref>). Most LLMs were trained on a widely available corpus (e.g., the Internet), which creates the potential for problems in domains marked by highly technical language germane to healthcare and medication management (<xref ref-type="bibr" rid="ref19">Gu et al., 2024</xref>). Moreover, improving LLM reasoning abilities via prompt engineering (<xref ref-type="bibr" rid="ref24">Naveed et al., 2023</xref>; <xref ref-type="bibr" rid="ref36">Wei et al., 2022a</xref>, <xref ref-type="bibr" rid="ref37">2022b</xref>; <xref ref-type="bibr" rid="ref38">Wei et al., 2023</xref>; <xref ref-type="bibr" rid="ref26">Rae et al., 2022</xref>) and reasoning engine strategies in the pharmacy domain remains underexplored. Recent efforts to cultivate expert thinking skills among pharmacy trainees further underscore the need for specialized tools that support clinical decision-making in complex environments like CMM (<xref ref-type="bibr" rid="ref12">Hawkins and Palmer, 2024</xref>).</p>
<p>As a first step towards clinically characterizing the role of LLMs in CMM, this study aimed to compare the performance of several LLMs on case-based, multiple-choice questions focusing on critical care pharmacotherapy. Further, prompt engineering and reasoning engine techniques were explored.</p>
</sec>
<sec sec-type="methods" id="sec6">
<title>Methods</title>
<sec id="sec7">
<title>Study design</title>
<p>The performance of six LLMs based on pharmacy school course materials was evaluated across multiple-choice questions related to critical care pharmacotherapy. The primary outcome was model accuracy (i.e., correctness when compared to ground truth). A key secondary outcome was model variance (i.e., change over time). Additional secondary outcomes included evaluation of model performance by question type (knowledge recall vs. knowledge application), evaluation of the effect of different prompt engineering techniques on model performance, and performance of LLMs relative to pharmacy students for a subset of questions.</p>
</sec>
<sec id="sec8">
<title>Data source</title>
<p>A total of 219 multiple-choice questions focused on critical care pharmacotherapy topics used in Doctor of Pharmacy curricula from two accredited, four-year colleges of pharmacy were compiled for this study. Questions were written for students in their third professional year who participated in a critical care elective course (99 questions) and critical care module from the core pharmacotherapy series (120 questions). Questions were formatted to have four answer choices and images were converted to textual input. Additionally, questions were further categorized into knowledge-based (51 questions) and skill-based (168), with knowledge questions testing fact recall and application questions testing application of pharmacy knowledge to simple patient cases. Ground truth was established as the correct answer by the course coordinators/item writers of the respective Doctor of Pharmacy courses, who are all considered content experts.</p>
</sec>
<sec id="sec9">
<title>Models</title>
<p>A total of six LLMs were evaluated, including ChatGPT-3.5, ChatGPT-4, Claude2, Llama2-7b, Llama2-13b, and customized ChatGPT-4. ChatGPT-3.5 and ChatGPT-4 are models from OpenAI known for their advanced natural language understanding and generation capabilities. Claude2, developed by Anthropic, focuses on safety and alignment in artificial intelligence (AI) outputs, enhancing understanding and reasoning while prioritizing safe and reliable responses. Llama2-7b and Llama2-13b, part of Meta&#x2019;s LLaMA suite, are designed for efficiency and effectiveness in natural language tasks. Llama2-7b utilizes a smaller parameter count to achieve competitive performance while Llama2-13b offers improved performance and accuracy due to its increased parameter count, potentially making it more suitable for more complex and nuanced language processing tasks. Additionally, a Custom ChatGPT by OpenAI named PharmacyGPT was trained on a dataset of relevant pharmacy school course materials to serve as a proof-of-concept for domain-specific training. Performance metrics were compared to the ChatGPT-4 results with initialization prompt and CoT prompt.</p>
</sec>
<sec id="sec10">
<title>Initialization prompt</title>
<p>Input was standardized to generate output that provided correct answers and explanations. The following prompt served as a scaffold to orient the model to the specific task and context, with the goal of enhancing model performance by producing more accurate and structured answers: &#x201C;This is a midterm exam for the critical care elective course in pharmacy school. Please select the most correct answer from the following multiple-choice options and give your reason why you chose it. Please follow the following format to answer the question: The correct answer is: (fill in the blank). The reason is: (fill in the blank).&#x201D; Further prompt engineering methodology is provided in <xref ref-type="supplementary-material" rid="SM1">Appendix A</xref>.</p>
<p><italic>Prompt engineering</italic> is a set of methodologies centered on using prompts to perform in-context learning and instruct LLMs with the goal to adeptly tackle downstream tasks (<xref ref-type="bibr" rid="ref25">Pryzant et al., 2023</xref>; <xref ref-type="bibr" rid="ref31">Sun et al., 2023</xref>). Prompts provide specific instructions or cues to the models, which direct LLMs towards a specific task without necessitating time-consuming annotation of large amounts of data for fine-tuning (<xref ref-type="bibr" rid="ref15">Hu et al., 2024</xref>; <xref ref-type="bibr" rid="ref38">Wei et al., 2023</xref>; <xref ref-type="bibr" rid="ref35">Wang et al., 2022</xref>; <xref ref-type="bibr" rid="ref22">Liu et al., 2021</xref>; <xref ref-type="bibr" rid="ref23">Ma et al., 2024</xref>; <xref ref-type="bibr" rid="ref10">Guan et al., 2023</xref>). <italic>Reasoning engines</italic> like Chain-of-Thought, Tree-of-Thought, and Graph-of-Thought break up problems into steps from which logical inferences can be made (akin to showing a step by step process in answering an algebra problem). Reasoning engines are useful because they reduce hallucinations and support assessment for gaps in domain knowledge (<xref ref-type="bibr" rid="ref36">Wei et al., 2022a</xref>, <xref ref-type="bibr" rid="ref37">2022b</xref>; <xref ref-type="bibr" rid="ref13">Holmes et al., 2023</xref>). Both of these methodologies to improve LLM performance were evaluated. Specifically, the effect of prompt engineering (as a means of in-context learning) based on ChatGPT-4 was explored. This means it can better understand the prompt and improve the generation based on it. First, the zero-shot chain-of-thought (CoT) approach was applied by including &#x201C;Let us think step by step&#x201D; in the prompt and requesting the model to provide both the answer and the corresponding reasoning steps directly. Zero-Shot CoT was applied to ChatGPT-4 and was evaluated in five separate trials. 
Model performance parameters were compared to the LLM with an initialization prompt. Then, a few-shot CoT was applied by offering a set of examples including questions, intermediate steps, and answers, requesting the LLM to generate intermediate steps and arrive at the correct final answer for new problems. This was evaluated in five separate trials, and model performance was compared with the initialization prompt results and Zero-Shot CoT results. Chain of thought methodology is further summarized in <xref ref-type="supplementary-material" rid="SM1">Appendix B</xref>. In the self-consistency (SC) approach, the final result was determined by selecting the answer that obtains the highest number of votes among the five trials, thereby leveraging the model&#x2019;s ability to produce consistent responses across multiple iterations and potentially enhancing overall performance. The model performance generated by this approach was then compared to those of the initialization prompt, the Zero-Shot CoT, and the CoT results of ChatGPT-4.</p>
</sec>
<sec id="sec11">
<title>PharmacyGPT</title>
<p>In addition to the prompt engineering techniques, a ChatGPT was built based on a custom dataset of relevant pharmacy school course materials as a proof of concept to improve GPT-4 model performance. Performance metrics were compared to the ChatGPT-4 results with initialization prompt and CoT prompt.</p>
</sec>
<sec id="sec12">
<title>Recall vs. application based question analysis and comparison to pharmacy student performance</title>
<p>Response accuracy on recall- and application-based questions from the LLMs (ChatGPT-3.5, ChatGPT-4, Claude2, Llama2-7b, Llama2-13b) with the initialization prompt and GPT-4 engineered with few-shot CoT were compared to pharmacy student performance for 120 multiple-choice questions on which student performance was available. Student performance was available for the core pharmacotherapy course for one year (as questions are updated on a yearly basis).</p>
</sec>
<sec id="sec13">
<title>Statistical analysis</title>
<p>Model accuracy was evaluated by inputting the same prompt into each LLM five separate times and reporting the accuracy of each model for each run when compared to ground truth answer along with the overall average accuracy across all runs. Model variance was evaluated by assigning numeric values (1, 2, 3, 4) to the four answer choices in each question and calculating variance from the response accuracy and the assigned value for each LLM. To further examine the consistency of responses between humans and LLMs across various types of questions, heatmap visualization techniques were used to visualize the distribution of data.</p>
<p>All comparisons were evaluated by two-sided independent-sample t-tests with a significance threshold of 0.05. The analysis was performed using Python 3.11.3 and SciPy version 1.11.4, ensuring robust and reliable statistical computations.</p>
</sec>
</sec>
<sec sec-type="results" id="sec14">
<title>Results</title>
<sec id="sec15">
<title>Initialization prompt</title>
<p>The performance of the five LLMs evaluated in terms of accuracy of each of the five runs, average accuracy, and variance over the five runs are included in <xref ref-type="table" rid="tab1">Table 1</xref>. ChatGPT-4 achieved the highest average accuracy rate at 71.6% with a satisfactory variance of 0.14 among five LLMs. Conversely, Llama2-13b had the lowest variance (0.070) among the LLMs, but its accuracy was limited (41.5%). ChatGPT-4 significantly outperformed the other LLMs (<xref ref-type="table" rid="tab2">Table 2</xref>).</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Response accuracy and variance of LLMs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">LLM</th>
<th align="center" valign="top">Acc-Run1</th>
<th align="center" valign="top">Acc-Run2</th>
<th align="center" valign="top">Acc-Run3</th>
<th align="center" valign="top">Acc-Run4</th>
<th align="center" valign="top">Acc-Run5</th>
<th align="center" valign="top">Acc-Avg</th>
<th align="center" valign="top">Variance</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ChatGPT-3.5</td>
<td align="center" valign="top">0.55</td>
<td align="center" valign="top">0.53</td>
<td align="center" valign="top">0.51</td>
<td align="center" valign="top">0.55</td>
<td align="center" valign="top">0.54</td>
<td align="center" valign="top">0.54</td>
<td align="center" valign="top">0.30</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4</td>
<td align="center" valign="top">0.73</td>
<td align="center" valign="top">0.70</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.72</td>
<td align="center" valign="top">0.70</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.14</td>
</tr>
<tr>
<td align="left" valign="top">Claude2</td>
<td align="center" valign="top">0.60</td>
<td align="center" valign="top">0.60</td>
<td align="center" valign="top">0.62</td>
<td align="center" valign="top">0.61</td>
<td align="center" valign="top">0.61</td>
<td align="center" valign="top">0.61</td>
<td align="center" valign="top">0.09</td>
</tr>
<tr>
<td align="left" valign="top">Llama2-7b</td>
<td align="center" valign="top">0.36</td>
<td align="center" valign="top">0.38</td>
<td align="center" valign="top">0.36</td>
<td align="center" valign="top">0.35</td>
<td align="center" valign="top">0.35</td>
<td align="center" valign="top">0.36</td>
<td align="center" valign="top">0.21</td>
</tr>
<tr>
<td align="left" valign="top">Llama2-13b</td>
<td align="center" valign="top">0.40</td>
<td align="center" valign="top">0.40</td>
<td align="center" valign="top">0.44</td>
<td align="center" valign="top">0.41</td>
<td align="center" valign="top">0.41</td>
<td align="center" valign="top">0.41</td>
<td align="center" valign="top">0.07</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Acc-Run#: Accuracy of run number #. Acc-Avg: Average accuracy of five runs. Variance: Variance of five runs.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Comparison of average accuracy of five runs between ChatGPT-4 and other LLMs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Model comparison</th>
<th align="center" valign="top">ChatGPT-4 acc-avg</th>
<th align="center" valign="top">Other model acc-avg</th>
<th align="center" valign="top"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ChatGPT-4 vs. ChatGPT-3.5</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.54</td>
<td align="center" valign="top"><italic>p</italic>&#x202F;&#x003C;&#x202F;0.01</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4 vs. Claude2</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.61</td>
<td align="center" valign="top"><italic>p</italic>&#x202F;&#x003C;&#x202F;0.01</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4 vs. Llama2-7b</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.36</td>
<td align="center" valign="top"><italic>p</italic>&#x202F;&#x003C;&#x202F;0.01</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4 vs. Llama2-13b</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.41</td>
<td align="center" valign="top"><italic>p</italic>&#x202F;&#x003C;&#x202F;0.01</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>When comparing LLM performance on knowledge versus skill-based questions, all five LLMs demonstrated higher accuracy in knowledge-based questions as shown in <xref ref-type="table" rid="tab3">Table 3</xref>. An inverse pattern was reflected in variance, where all LLMs except for Llama2-7b showed lower variance when answering knowledge questions and higher variance in their responses to application-based questions. In particular, ChatGPT-4 achieved the highest accuracy for recall- and application-based questions, with an accuracy of 87% and 67%, respectively.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Average response accuracy and variance of LLMs answering skill-based vs. knowledge-based questions.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">LLM</th>
<th align="center" valign="top">Accuracy-recall</th>
<th align="center" valign="top">Accuracy-application</th>
<th align="center" valign="top">Variance-recall</th>
<th align="center" valign="top">Variance-application</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ChatGPT-3.5</td>
<td align="center" valign="top">0.69</td>
<td align="center" valign="top">0.50</td>
<td align="center" valign="top">0.22</td>
<td align="center" valign="top">0.33</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4</td>
<td align="center" valign="top">0.87</td>
<td align="center" valign="top">0.67</td>
<td align="center" valign="top">0.08</td>
<td align="center" valign="top">0.15</td>
</tr>
<tr>
<td align="left" valign="top">Claude2</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="top">0.57</td>
<td align="center" valign="top">0.09</td>
<td align="center" valign="top">0.09</td>
</tr>
<tr>
<td align="left" valign="top">Llama2-7b</td>
<td align="center" valign="top">0.41</td>
<td align="center" valign="top">0.34</td>
<td align="center" valign="top">0.22</td>
<td align="center" valign="top">0.21</td>
</tr>
<tr>
<td align="left" valign="top">Llama2-13b</td>
<td align="center" valign="top">0.51</td>
<td align="center" valign="top">0.39</td>
<td align="center" valign="top">0.06</td>
<td align="center" valign="top">0.07</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>All data in this table are averaged over five runs. Accuracy-recall: Average accuracy across knowledge-based (recall) questions. Accuracy-application: Average accuracy across skill-based (application) questions. Variance-recall: Variance of answers across knowledge-based (recall) questions. Variance-application: Variance of answers across skill-based (application) questions.</p>
</table-wrap-foot>
</table-wrap>
<p><xref ref-type="table" rid="tab4">Table 4</xref> shows the response accuracy and variance with a zero-shot CoT approach. All five LLMs performed similarly with a zero-shot CoT approach compared to the original initialization prompt used, showing minimal improvement with this approach.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Average response accuracy and variance of LLMs with zero-shot CoT.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">LLM</th>
<th align="center" valign="top">Acc-Run1</th>
<th align="center" valign="top">Acc-Run2</th>
<th align="center" valign="top">Acc-Run3</th>
<th align="center" valign="top">Acc-Run4</th>
<th align="center" valign="top">Acc-Run5</th>
<th align="center" valign="top">Acc-avg</th>
<th align="center" valign="top">Variance</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ChatGPT-3.5</td>
<td align="center" valign="top">0.55</td>
<td align="center" valign="top">0.53</td>
<td align="center" valign="top">0.53</td>
<td align="center" valign="top">0.56</td>
<td align="center" valign="top">0.54</td>
<td align="center" valign="top">0.54</td>
<td align="center" valign="top">0.32</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4</td>
<td align="center" valign="top">0.73</td>
<td align="center" valign="top">0.70</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.72</td>
<td align="center" valign="top">0.70</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.13</td>
</tr>
<tr>
<td align="left" valign="top">Claude2</td>
<td align="center" valign="top">0.59</td>
<td align="center" valign="top">0.59</td>
<td align="center" valign="top">0.61</td>
<td align="center" valign="top">0.60</td>
<td align="center" valign="top">0.60</td>
<td align="center" valign="top">0.60</td>
<td align="center" valign="top">0.08</td>
</tr>
<tr>
<td align="left" valign="top">Llama2-7b</td>
<td align="center" valign="top">0.35</td>
<td align="center" valign="top">0.34</td>
<td align="center" valign="top">0.33</td>
<td align="center" valign="top">0.33</td>
<td align="center" valign="top">0.35</td>
<td align="center" valign="top">0.34</td>
<td align="center" valign="top">0.13</td>
</tr>
<tr>
<td align="left" valign="top">Llama2-13b</td>
<td align="center" valign="top">0.38</td>
<td align="center" valign="top">0.42</td>
<td align="center" valign="top">0.40</td>
<td align="center" valign="top">0.42</td>
<td align="center" valign="top">0.41</td>
<td align="center" valign="top">0.41</td>
<td align="center" valign="top">0.09</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Acc-Run#: Accuracy of run number #. Acc-avg: Average accuracy of five runs. Variance: Average variance of answers across five runs.</p>
</table-wrap-foot>
</table-wrap>
<p>Few-shot CoT was explored, and the results for ChatGPT-4 are presented in <xref ref-type="table" rid="tab5">Table 5</xref>. This table presents a breakdown of ChatGPT-4&#x2019;s average accuracy and variance across different shot iterations, showcasing the incremental changes in performance with each additional shot. The results demonstrate that CoT could improve model performance from 71.5% to a maximum of 77.4% (<italic>p</italic>-value &#x003C;0.001). More CoT examples led to better performance, as evidenced by the highest accuracy achieved with five-shot CoT. However, the use of few-shot CoT did not lead to a reduction in the variance. Visualizations are provided in the <xref ref-type="supplementary-material" rid="SM1">Supplemental Figures in Appendix B</xref>.</p>
<table-wrap position="float" id="tab5">
<label>Table 5</label>
<caption>
<p>Average response accuracy and variance of ChatGPT-4 with few-shot CoT across five runs.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">ChatGPT-4</th>
<th align="center" valign="top">Accuracy-0-shot</th>
<th align="center" valign="top">Accuracy-1-shot</th>
<th align="center" valign="top">Accuracy-3-shot</th>
<th align="center" valign="top">Accuracy-5-shot</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">ChatGPT-4</td>
<td align="center" valign="top">0.71</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="top">0.77</td>
<td align="center" valign="top">0.77</td>
</tr>
<tr>
<td/>
<td align="center" valign="top">Variance-0-shot</td>
<td align="center" valign="top">Variance-1-shot</td>
<td align="center" valign="top">Variance-3-shot</td>
<td align="center" valign="top">Variance-5-shot</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4</td>
<td align="center" valign="top">0.13</td>
<td align="center" valign="top">0.0</td>
<td align="center" valign="top">0.12</td>
<td align="center" valign="top">0.17</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Furthermore, the results of the Self-Consistency approach, based on five-shot ChatGPT-4, have shown further promising outcomes. Self-Consistency led to a modest improvement in performance, resulting in a 2% increase in accuracy on the five-shot ChatGPT-4 (<xref ref-type="table" rid="tab6">Table 6</xref>).</p>
<table-wrap position="float" id="tab6">
<label>Table 6</label>
<caption>
<p>Accuracies of self-consistency and 5-shot CoT of ChatGPT-4.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th align="center" valign="top">Acc-Run1</th>
<th align="center" valign="top">Acc-Run2</th>
<th align="center" valign="top">Acc-Run3</th>
<th align="center" valign="top">Acc-Run4</th>
<th align="center" valign="top">Acc-Run5</th>
<th align="center" valign="top">Acc-Avg</th>
<th align="center" valign="top"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Self-consistency</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="top">0.74</td>
<td align="center" valign="top">0.74</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="middle" rowspan="2">0.03</td>
</tr>
<tr>
<td align="left" valign="top">5-shot CoT</td>
<td align="center" valign="top">0.70</td>
<td align="center" valign="top">0.74</td>
<td align="center" valign="top">0.70</td>
<td align="center" valign="top">0.73</td>
<td align="center" valign="top">0.75</td>
<td align="center" valign="top">0.726</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>Acc-Run#: Accuracy of run number #. Acc-Avg: Average Accuracy of five runs. <italic>p</italic>-value for independent two sample <italic>t</italic>-test.</p>
</table-wrap-foot>
</table-wrap>
<p>In a comparison of recall- and application-based questions, student performance was similar across both question types, whereas LLM performance was lower for application-based questions with all models. <xref ref-type="supplementary-material" rid="SM1">Supplemental Figure 3 in Appendix B</xref> shows the average response accuracy between LLMs and students on 120 questions. Students outperformed the best-performing LLM configuration (ChatGPT-4 with self-consistency based on 5-shot CoT) by 5%.</p>
<p>ChatGPT-4 with self-consistency achieved high accuracy for knowledge-based questions, outperforming the student average in this domain (93% vs. 84%, <italic>p</italic>-value&#x202F;=&#x202F;0.05) (<xref ref-type="table" rid="tab7">Table 7</xref>). However, the performance of the best model for application-based questions was lower than the student average (69% vs. 80%, <italic>p</italic>-value&#x202F;=&#x202F;0.024). Additionally, the response accuracy for both recall- and application-based questions improved as more CoT examples were provided. PharmacyGPT outperformed ChatGPT-4 when using the initialization prompt on both recall-based questions (90% vs. 84%, <italic>p</italic>-value&#x202F;=&#x202F;0.0310) and application-based questions (69% vs. 60%, <italic>p</italic>-value&#x202F;=&#x202F;0.0032). Specifically, PharmacyGPT outperformed the model with self-consistency, which was the best model developed via the prompt engineering approach, on application-based questions.</p>
<table-wrap position="float" id="tab7">
<label>Table 7</label>
<caption>
<p>Comparison of LLMs to student performance on 120 multiple-choice questions.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th/>
<th align="center" valign="top">Accuracy-overall</th>
<th align="center" valign="top">Accuracy-recall</th>
<th align="center" valign="top">Accuracy-application</th>
<th align="center" valign="top">Variance-recall</th>
<th align="center" valign="top">Variance-application</th>
<th align="center" valign="top"><italic>p</italic>-value</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Student</td>
<td align="center" valign="top">0.81</td>
<td align="center" valign="top">0.84</td>
<td align="center" valign="top">0.80</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">0.11</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-3.5</td>
<td align="center" valign="bottom">0.51</td>
<td align="center" valign="top">0.68</td>
<td align="center" valign="top">0.45</td>
<td align="center" valign="top">0.20</td>
<td align="center" valign="top">0.34</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4</td>
<td align="center" valign="bottom">0.65</td>
<td align="center" valign="top">0.84</td>
<td align="center" valign="top">0.60</td>
<td align="center" valign="top">0.08</td>
<td align="center" valign="top">0.19</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4-1S</td>
<td align="center" valign="bottom">0.70</td>
<td align="center" valign="top">0.87</td>
<td align="center" valign="top">0.65</td>
<td align="center" valign="top">0.01</td>
<td align="center" valign="top">0.10</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4-3S</td>
<td align="center" valign="bottom">0.73</td>
<td align="center" valign="top">0.90</td>
<td align="center" valign="top">0.68</td>
<td align="center" valign="top">0.01</td>
<td align="center" valign="top">0.18</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">ChatGPT-4-5S</td>
<td align="center" valign="bottom">0.73</td>
<td align="center" valign="top">0.91</td>
<td align="center" valign="top">0.67</td>
<td align="center" valign="top">0.09</td>
<td align="center" valign="top">0.24</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">Claude2</td>
<td align="center" valign="bottom">0.59</td>
<td align="center" valign="top">0.73</td>
<td align="center" valign="top">0.55</td>
<td align="center" valign="top">0.02</td>
<td align="center" valign="top">0.13</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">Llama2-7b</td>
<td align="center" valign="bottom">0.33</td>
<td align="center" valign="top">0.38</td>
<td align="center" valign="top">0.32</td>
<td align="center" valign="top">0.10</td>
<td align="center" valign="top">0.17</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">Llama2-13b</td>
<td align="center" valign="bottom">0.39</td>
<td align="center" valign="top">0.48</td>
<td align="center" valign="top">0.36</td>
<td align="center" valign="top">0.05</td>
<td align="center" valign="top">0.10</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">Self-consistency</td>
<td align="center" valign="top">0.74</td>
<td align="center" valign="top">0.93</td>
<td align="center" valign="top">0.69</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">&#x003C;0.01</td>
</tr>
<tr>
<td align="left" valign="top">PharmacyGPT</td>
<td align="center" valign="top">0.74</td>
<td align="center" valign="top">0.90</td>
<td align="center" valign="top">0.69</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">-</td>
<td align="center" valign="top">0.01</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>ChatGPT-4-1S, GPT-4 with 1 shot CoT; ChatGPT-4-3S, GPT-4 with 3 shot CoT; ChatGPT-4-5S, GPT-4 with 5 shot CoT.</p>
</table-wrap-foot>
</table-wrap>
<p>To further explore the performance across different questions, a heatmap of the average accuracy for each question was plotted in <xref ref-type="fig" rid="fig1">Figure 1</xref>. This revealed that challenging questions for humans were not necessarily difficult for LLMs, and vice versa, suggesting differences in expertise alignment between LLMs and humans.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Correctness heatmap across LLMs and humans on 120 questions. Color represents the average accuracy, where a deeper color indicates lower accuracy and a lighter color indicates higher accuracy.</p>
</caption>
<graphic xlink:href="frai-07-1514896-g001.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="discussion" id="sec16">
<title>Discussion</title>
<p>This study compared the performance of ChatGPT-3.5, ChatGPT-4, Claude2, Llama2-7b, Llama2-13b, and a customized GPT-4 on multiple-choice questions related to critical care pharmacotherapy. The findings demonstrate that specific prompt engineering techniques, particularly the few-shot CoT and self-consistency approaches, enhanced the response accuracy of these language models. ChatGPT-4, in particular, exhibited the highest accuracy across different prompts, outperforming human pharmacy students on knowledge-based questions when advanced prompting techniques were used. LLMs showed worse performance on application-based questions compared to pharmacy students, likely reflective of the difference between recall vs. application tasks.</p>
<p>Prompt engineering techniques enhanced performance. ChatGPT-4 showed marked improvements in response accuracy when using few-shot CoT prompting. This improvement underscores the importance of structured prompting in maximizing the utility of LLMs for complex question-answering tasks. The self-consistency approach also contributed to performance gains, albeit modestly. This incremental improvement highlights the potential for combining multiple advanced prompting techniques to optimize LLM outputs. Given that the variance in responses did not significantly decrease with CoT prompting, it is evident that while these techniques enhance accuracy, they do not necessarily stabilize the model&#x2019;s performance across different runs. These findings align with previous research suggesting that incorporating domain-specific prompts and examples can significantly enhance the reasoning capabilities of LLMs (<xref ref-type="bibr" rid="ref6">Brown et al., 2020</xref>). For example, ChatGPT-4 outperformed residents on the Family Medicine In-Training Exam (86.5%) but struggled on gastroenterology and pediatric subspecialty exams, showing variability in outcomes across medical domains (<xref ref-type="bibr" rid="ref20">Liu et al., 2024</xref>; <xref ref-type="bibr" rid="ref11">Hanna et al., 2024</xref>). Furthermore, GPT-4&#x2019;s performance was consistently higher in English-speaking environments, with 26 out of 29 passing cases globally, but it faced challenges in non-English settings, indicating the importance of language context in medical evaluations (<xref ref-type="bibr" rid="ref20">Liu et al., 2024</xref>). Currently, most LLMs are trained on general text datasets, with few designed specifically for medical applications. Consequently, even with advanced prompt engineering techniques, their performance remains limited due to bias and error propagation inherent in the training data (<xref ref-type="bibr" rid="ref34">Ullah et al., 2024</xref>). 
Previous studies have shown that ChatGPT&#x2019;s performance on medical exams varies by specialty; for instance, it achieved a passing grade on neurosurgery board finals but failed a gastroenterology board-like examination (<xref ref-type="bibr" rid="ref30">Smith et al., 2023</xref>). Similarly, GPT-4 excelled in psychiatry and general medicine on Israeli medical board exams, while performing less impressively in pediatrics and OB/GYN (<xref ref-type="bibr" rid="ref17">Katz et al., 2024</xref>). In ophthalmology exams, ChatGPT Plus showed better results in general medicine compared to subspecialties like neuro-ophthalmology, reflecting how performance varies across disciplines (<xref ref-type="bibr" rid="ref2">Antaki et al., 2023</xref>; <xref ref-type="bibr" rid="ref1">Abbas et al., 2024</xref>). Customized models like PharmacyGPT demonstrate initial potential for developing LLMs tailored specifically for pharmacy applications (<xref ref-type="bibr" rid="ref21">Liu et al., 2023</xref>). Thus, gathering relevant pharmacy training data and designing and training dedicated medical LLMs combined with prompt engineering could improve performance.</p>
<p>Although students performed similarly on both recall and application questions, LLMs struggled more with application-based questions, even with prompt-engineering techniques. Recall-based questions typically demand factual recall or recognition. In contrast, application-based questions often require nuanced understanding and reasoning abilities to apply knowledge in complex scenarios, posing greater challenges for LLMs (<xref ref-type="bibr" rid="ref40">Ye et al., 2024</xref>). Students are specifically trained to develop these practical skills, whereas LLMs have limited exposure to such application-based questions during training, contributing to the performance gap between humans and LLMs (<xref ref-type="bibr" rid="ref5">Branan et al., 2024</xref>). The superior performance on knowledge-based questions suggests that LLMs have a great ability to retrieve and synthesize information from their training data, a task well-suited to their design and capabilities. Previous research has shown the utility of AI in clinical decision support systems, particularly in areas requiring rapid and precise information retrieval (<xref ref-type="bibr" rid="ref33">Topol, 2019</xref>; <xref ref-type="bibr" rid="ref41">Yu et al., 2018</xref>). In contrast, pharmacy students, while knowledgeable, may not have the same depth and breadth of information readily accessible in their memory (<xref ref-type="bibr" rid="ref5">Branan et al., 2024</xref>). A possibility exists that lower performance (accuracy) and higher variance may exist over time for students, compared to a more stable level of performance for the LLMs.</p>
<p>This paper is the first to compare various prompt engineering techniques across different popular LLMs for answering pharmacy questions; however, this study has limitations. The focus was primarily on highly-structured, multiple-choice questions, whereas in real-world scenarios many questions remain open-ended and ill-structured. Furthermore, only popular decoder-based LLMs (Llama/ChatGPT) were included; while this maximizes some elements of generalizability, improvements in LLMs are being made at regular increments. LLMs with other architectures, such as encoder-decoder models [T5 (<xref ref-type="bibr" rid="ref27">Raffel et al., 2019</xref>)] and encoder-based models [BERT (<xref ref-type="bibr" rid="ref9">Devlin et al., 2018</xref>)], have not been evaluated. Moreover, while peer-reviewed custom training materials were used, it is known that clinical practice variability, seen in the form of expert judgement, is present in both the materials and exam question answers. Overall, this study provides important groundwork for understanding how to incorporate LLMs into the realm of comprehensive medication management.</p>
</sec>
<sec sec-type="conclusions" id="sec17">
<title>Conclusion</title>
<p>This study highlights the potential of LLMs, especially when equipped with advanced prompt engineering techniques, to support pharmacists in knowledge-based decision-making scenarios. These findings underscore the importance of developing and refining LLMs for specialized medical fields to enhance clinical decision support systems. These findings support the need for future assessment of customized training for the type of output needed and emphasize that reliability of LLMs is currently only supported with recall-based questions.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec18">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="author-contributions" id="sec19">
<title>Author contributions</title>
<p>HY: Formal analysis, Methodology, Writing &#x2013; original draft. MH: Data curation, Formal analysis, Methodology, Writing &#x2013; original draft. AM: Data curation, Methodology, Visualization, Writing &#x2013; original draft. WH: Supervision, Writing &#x2013; review &#x0026; editing. BM: Data curation, Methodology, Writing &#x2013; review &#x0026; editing. SS: Data curation, Methodology, Supervision, Writing &#x2013; review &#x0026; editing. SL: Supervision, Writing &#x2013; review &#x0026; editing. AS: Conceptualization, Formal analysis, Funding acquisition, Methodology, Supervision, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="funding-information" id="sec20">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. Funding through the Agency for Healthcare Research and Quality for Drs. Sikora, Smith, and Li was provided through R01HS029009.</p>
</sec>
<ack>
<p>The authors acknowledge William Hsieh for assistance with creating figures for this article.</p>
</ack>
<sec sec-type="COI-statement" id="sec21">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec22">
<title>Generative AI statement</title>
<p>The author(s) declare that no Gen AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="sec23">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec sec-type="supplementary-material" id="sec24">
<title>Supplementary material</title>
<p>The Supplementary material for this article can be found online at: <ext-link xlink:href="https://www.frontiersin.org/articles/10.3389/frai.2024.1514896/full#supplementary-material" ext-link-type="uri">https://www.frontiersin.org/articles/10.3389/frai.2024.1514896/full#supplementary-material</ext-link></p>
<supplementary-material xlink:href="Data_Sheet_1.docx" id="SM1" mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Abbas</surname> <given-names>A.</given-names></name> <name><surname>Rehman</surname> <given-names>M. S.</given-names></name> <name><surname>Rehman</surname> <given-names>S. S.</given-names></name></person-group> (<year>2024</year>). <article-title>Comparing the performance of popular large language models on the National Board of medical examiners sample questions</article-title>. <source>Cureus</source> <volume>16</volume>:<fpage>e55991</fpage>. doi: <pub-id pub-id-type="doi">10.7759/cureus.55991</pub-id>, PMID: <pub-id pub-id-type="pmid">38606229</pub-id></citation></ref>
<ref id="ref2"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Antaki</surname> <given-names>F.</given-names></name> <name><surname>Touma</surname> <given-names>S.</given-names></name> <name><surname>Milad</surname> <given-names>D.</given-names></name> <name><surname>El-Khoury</surname> <given-names>J.</given-names></name> <name><surname>Duval</surname> <given-names>R.</given-names></name></person-group> (<year>2023</year>). <article-title>Evaluating the performance of ChatGPT in ophthalmology: an analysis of its successes and shortcomings</article-title>. <source>Ophthalmol Sci.</source> <volume>3</volume>:<fpage>100324</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.xops.2023.100324</pub-id>, PMID: <pub-id pub-id-type="pmid">37334036</pub-id></citation></ref>
<ref id="ref3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Benary</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>X. D.</given-names></name> <name><surname>Schmidt</surname> <given-names>M.</given-names></name> <name><surname>Soll</surname> <given-names>D.</given-names></name> <name><surname>Hilfenhaus</surname> <given-names>G.</given-names></name> <name><surname>Nassir</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Leveraging large language models for decision support in personalized oncology</article-title>. <source>JAMA Netw. Open</source> <volume>6</volume>:<fpage>e2343689</fpage>. doi: <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.43689</pub-id>, PMID: <pub-id pub-id-type="pmid">37976064</pub-id></citation></ref>
<ref id="ref4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bommasani</surname> <given-names>R.</given-names></name> <name><surname>Liang</surname> <given-names>B.</given-names></name> <name><surname>Lee</surname> <given-names>T.</given-names></name></person-group> (<year>2023</year>). <article-title>Holistic evaluation of language models</article-title>. <source>Ann. N. Y. Acad. Sci.</source> <volume>1525</volume>, <fpage>140</fpage>&#x2013;<lpage>146</lpage>. doi: <pub-id pub-id-type="doi">10.1111/nyas.15007</pub-id>, PMID: <pub-id pub-id-type="pmid">37230490</pub-id></citation></ref>
<ref id="ref5"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Branan</surname> <given-names>T. N.</given-names></name> <name><surname>Darley</surname> <given-names>A.</given-names></name> <name><surname>Hawkins</surname> <given-names>W. A.</given-names></name></person-group> (<year>2024</year>). <article-title>How critical is it? Integrating critical care into the pharmacy didactic curriculum</article-title>. <source>Am. J. Health Syst. Pharm.</source> <volume>81</volume>, <fpage>871</fpage>&#x2013;<lpage>875</lpage>. doi: <pub-id pub-id-type="doi">10.1093/ajhp/zxae153</pub-id>, PMID: <pub-id pub-id-type="pmid">38874404</pub-id></citation></ref>
<ref id="ref6"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Brown</surname> <given-names>T. B.</given-names></name> <name><surname>Mann</surname> <given-names>B.</given-names></name> <name><surname>Ryder</surname> <given-names>N.</given-names></name> <name><surname>Subbiah</surname> <given-names>M.</given-names></name> <name><surname>Kaplan</surname> <given-names>J. D.</given-names></name> <name><surname>Dhariwal</surname> <given-names>P.</given-names></name> <etal/></person-group>. (<year>2020</year>). <source>Language models are few-shot learners</source>. <volume>arXiv</volume>:<fpage>2005.14165</fpage>.</citation></ref>
<ref id="ref7"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bu&#x017E;an&#x010D;i&#x0107;</surname> <given-names>I.</given-names></name> <name><surname>Belec</surname> <given-names>D.</given-names></name> <name><surname>Dr&#x017E;ai&#x0107;</surname> <given-names>M.</given-names></name> <name><surname>Kummer</surname> <given-names>I.</given-names></name> <name><surname>Brki&#x0107;</surname> <given-names>J.</given-names></name> <name><surname>Fialov&#x00E1;</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Clinical decision making in benzodiazepine deprescribing by healthcare providers vs AI-assisted approach</article-title>. <source>Br. J. Clin. Pharmacol.</source> <volume>90</volume>, <fpage>662</fpage>&#x2013;<lpage>674</lpage>. doi: <pub-id pub-id-type="doi">10.1111/bcp.15963</pub-id>, PMID: <pub-id pub-id-type="pmid">37949663</pub-id></citation></ref>
<ref id="ref8"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Chowdhery</surname> <given-names>A.</given-names></name> <name><surname>Narang</surname> <given-names>S.</given-names></name> <name><surname>Devlin</surname> <given-names>J.</given-names></name> <name><surname>Bosma</surname> <given-names>M.</given-names></name> <name><surname>Mishra</surname> <given-names>G.</given-names></name> <name><surname>Roberts</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2022</year>). <source>PaLM: scaling language modeling with pathways</source>. <volume>arXiv</volume>:<fpage>2204.02311</fpage>.</citation></ref>
<ref id="ref9"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Devlin</surname> <given-names>J.</given-names></name> <name><surname>Chang</surname> <given-names>M. W.</given-names></name> <name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Toutanova</surname> <given-names>K.</given-names></name></person-group> (<year>2018</year>). <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></citation></ref>
<ref id="ref10"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Guan</surname> <given-names>Z</given-names></name> <name><surname>Wu</surname> <given-names>Z</given-names></name> <name><surname>Liu</surname> <given-names>Z</given-names></name> <etal/></person-group>. <source>CohortGPT: An enhanced GPT for participant recruitment in clinical study</source>. (<year>2023</year>). <comment>Available at:</comment> <ext-link xlink:href="https://arxiv.org/abs/2307.11346v1" ext-link-type="uri">https://arxiv.org/abs/2307.11346v1</ext-link></citation></ref>
<ref id="ref19"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Gu</surname> <given-names>X.</given-names></name> <name><surname>Chen</surname> <given-names>M.</given-names></name> <name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Zhang</surname> <given-names>H.</given-names></name> <name><surname>Wan</surname> <given-names>C.</given-names></name> <etal/></person-group>. (<year>2024</year>). <source>On the effectiveness of large language models in domain-specific code generation</source>. <volume>arXiv</volume>:<fpage>2312.01639</fpage>.</citation></ref>
<ref id="ref11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hanna</surname> <given-names>R. E.</given-names></name> <name><surname>Smith</surname> <given-names>L. R.</given-names></name> <name><surname>Mhaskar</surname> <given-names>R.</given-names></name> <name><surname>Hanna</surname> <given-names>K.</given-names></name></person-group> (<year>2024</year>). <article-title>Performance of language models on the family medicine in-training exam</article-title>. <source>Fam. Med.</source> <volume>56</volume>, <fpage>555</fpage>&#x2013;<lpage>560</lpage>. doi: <pub-id pub-id-type="doi">10.22454/FamMed.2024.233738</pub-id>, PMID: <pub-id pub-id-type="pmid">39207788</pub-id></citation></ref>
<ref id="ref12"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hawkins</surname> <given-names>W. A.</given-names></name> <name><surname>Palmer</surname> <given-names>R.</given-names></name></person-group> (<year>2024</year>). <article-title>Cultivating expert thinking skills for experiential pharmacy trainees</article-title>. <source>Am. J. Health Syst. Pharm.</source> doi: <pub-id pub-id-type="doi">10.1093/ajhp/zxae366</pub-id>, PMID: <pub-id pub-id-type="pmid">39612914</pub-id></citation></ref>
<ref id="ref13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Holmes</surname> <given-names>J.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name> <name><surname>Ding</surname> <given-names>Y.</given-names></name> <name><surname>Sio</surname> <given-names>T. T.</given-names></name> <name><surname>McGee</surname> <given-names>L. A.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Evaluating large language models on a highly-specialized topic, radiation oncology physics</article-title>. <source>Front. Oncol.</source> <volume>13</volume>. doi: <pub-id pub-id-type="doi">10.3389/fonc.2023.1219326</pub-id>, PMID: <pub-id pub-id-type="pmid">37529688</pub-id></citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hsu</surname> <given-names>H. Y.</given-names></name> <name><surname>Hsu</surname> <given-names>K. C.</given-names></name> <name><surname>Hou</surname> <given-names>S. Y.</given-names></name> <name><surname>Wu</surname> <given-names>C. L.</given-names></name> <name><surname>Hsieh</surname> <given-names>Y. W.</given-names></name> <name><surname>Cheng</surname> <given-names>Y. D.</given-names></name></person-group> (<year>2023</year>). <article-title>Examining real-world medication consultations and drug-herb interactions: ChatGPT performance evaluation</article-title>. <source>JMIR Med. Educ.</source> <volume>9</volume>:<fpage>e48433</fpage>. doi: <pub-id pub-id-type="doi">10.2196/48433</pub-id>, PMID: <pub-id pub-id-type="pmid">37561097</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>Q.</given-names></name> <name><surname>Du</surname> <given-names>J.</given-names></name> <name><surname>Peng</surname> <given-names>X.</given-names></name> <name><surname>Keloth</surname> <given-names>V.</given-names></name> <name><surname>Zuo</surname> <given-names>X.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Improving large language models for clinical named entity recognition via prompt engineering</article-title>. <source>J. Am. Med. Inform. Assoc.</source> <volume>31</volume>, <fpage>1812</fpage>&#x2013;<lpage>1820</lpage>. doi: <pub-id pub-id-type="doi">10.1093/jamia/ocad259</pub-id>, PMID: <pub-id pub-id-type="pmid">38281112</pub-id></citation></ref>
<ref id="ref16"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kanjee</surname> <given-names>Z.</given-names></name> <name><surname>Crowe</surname> <given-names>B.</given-names></name> <name><surname>Rodman</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title>. <source>JAMA</source> <volume>330</volume>, <fpage>78</fpage>&#x2013;<lpage>80</lpage>. doi: <pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id>, PMID: <pub-id pub-id-type="pmid">37318797</pub-id></citation></ref>
<ref id="ref17"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Katz</surname> <given-names>U.</given-names></name> <name><surname>Cohen</surname> <given-names>E.</given-names></name> <name><surname>Shachar</surname> <given-names>E.</given-names></name> <name><surname>Somer</surname> <given-names>J.</given-names></name> <name><surname>Fink</surname> <given-names>A.</given-names></name> <name><surname>Morse</surname> <given-names>E.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>GPT versus resident physicians: a benchmark based on official board scores</article-title>. <source>NEJM AI</source> <volume>1</volume>. doi: <pub-id pub-id-type="doi">10.1056/AIdbp2300192</pub-id></citation></ref>
<ref id="ref18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kunitsu</surname> <given-names>Y.</given-names></name></person-group> (<year>2023</year>). <article-title>The potential of GPT-4 as a support tool for pharmacists: analytical study using the Japanese National Examination for pharmacists</article-title>. <source>JMIR Med. Educ.</source> <volume>9</volume>:<fpage>e48452</fpage>. doi: <pub-id pub-id-type="doi">10.2196/48452</pub-id>, PMID: <pub-id pub-id-type="pmid">37837968</pub-id></citation></ref>
<ref id="ref20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>M.</given-names></name> <name><surname>Okuhara</surname> <given-names>T.</given-names></name> <name><surname>Chang</surname> <given-names>X.</given-names></name> <name><surname>Shirabe</surname> <given-names>R.</given-names></name> <name><surname>Nishiie</surname> <given-names>Y.</given-names></name> <name><surname>Okada</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Performance of ChatGPT across different versions in medical licensing examinations worldwide: systematic review and meta-analysis</article-title>. <source>J. Med. Internet Res.</source> <volume>26</volume>:<fpage>e60807</fpage>. doi: <pub-id pub-id-type="doi">10.2196/60807</pub-id>, PMID: <pub-id pub-id-type="pmid">39052324</pub-id></citation></ref>
<ref id="ref21"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>Z</given-names></name> <name><surname>Wu</surname> <given-names>Z</given-names></name> <name><surname>Hu</surname> <given-names>M</given-names></name> <etal/></person-group>. <source>PharmacyGPT: The AI pharmacist</source>. <volume>20</volume>, (<year>2023</year>). Accessed May 27, 2024. <comment>Available at:</comment> <ext-link xlink:href="http://arxiv.org/abs/2307.10432" ext-link-type="uri">http://arxiv.org/abs/2307.10432</ext-link></citation></ref>
<ref id="ref22"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>P</given-names></name> <name><surname>Yuan</surname> <given-names>W</given-names></name> <name><surname>Fu</surname> <given-names>J</given-names></name> <name><surname>Jiang</surname> <given-names>Z</given-names></name> <name><surname>Hayashi</surname> <given-names>H</given-names></name> <name><surname>Neubig</surname> <given-names>G</given-names></name></person-group>. <source>Pre-train, prompt, and predict: A systematic survey of prompting methods in natural language processing</source>. (<year>2021</year>). <comment>Available at:</comment> <ext-link xlink:href="https://arxiv.org/abs/2107.13586v1" ext-link-type="uri">https://arxiv.org/abs/2107.13586v1</ext-link></citation></ref>
<ref id="ref23"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>C.</given-names></name> <name><surname>Wu</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>J.</given-names></name> <name><surname>Xu</surname> <given-names>S.</given-names></name> <name><surname>Wei</surname> <given-names>Y.</given-names></name> <name><surname>Liu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2024</year>). <source>An iterative optimizing framework for radiology report summarization with ChatGPT</source>. <volume>arXiv</volume>:<fpage>2304.08448</fpage>.</citation></ref>
<ref id="ref24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Naveed</surname> <given-names>H.</given-names></name> <name><surname>Khan</surname> <given-names>A. U.</given-names></name> <name><surname>Qiu</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>A comprehensive overview of large language models</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2307.06435</pub-id></citation></ref>
<ref id="ref25"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Pryzant</surname> <given-names>R</given-names></name> <name><surname>Iter</surname> <given-names>D</given-names></name> <name><surname>Li</surname> <given-names>J</given-names></name> <name><surname>Lee</surname> <given-names>YT</given-names></name> <name><surname>Zhu</surname> <given-names>C</given-names></name> <name><surname>Zeng</surname> <given-names>M</given-names></name></person-group>. <source>Automatic prompt optimization with &#x201C;gradient descent&#x201D; and beam search</source>. (<year>2023</year>). <comment>Available at:</comment> <ext-link xlink:href="https://arxiv.org/abs/2305.03495v2" ext-link-type="uri">https://arxiv.org/abs/2305.03495v2</ext-link></citation></ref>
<ref id="ref26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Rae</surname> <given-names>J. W.</given-names></name> <name><surname>Borgeaud</surname> <given-names>S.</given-names></name> <name><surname>Cai</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Scaling language models: methods, analysis &#x0026; insights from training Gopher</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2112.11446</pub-id></citation></ref>
<ref id="ref27"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Raffel</surname> <given-names>C.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Roberts</surname> <given-names>A.</given-names></name> <name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Narang</surname> <given-names>S.</given-names></name> <name><surname>Matena</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2019</year>). <source>Exploring the limits of transfer learning with a unified text-to-text transformer</source>. <volume>arXiv</volume>:<fpage>1910.10683</fpage>.</citation></ref>
<ref id="ref28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sallam</surname> <given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>. <source>Healthcare (Basel)</source> <volume>11</volume>:<fpage>887</fpage>. doi: <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>, PMID: <pub-id pub-id-type="pmid">36981544</pub-id></citation></ref>
<ref id="ref29"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sikora</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Critical care pharmacists: a focus on horizons</article-title>. <source>Crit. Care Clin.</source> <volume>39</volume>, <fpage>503</fpage>&#x2013;<lpage>527</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ccc.2023.01.006</pub-id>, PMID: <pub-id pub-id-type="pmid">37230553</pub-id></citation></ref>
<ref id="ref30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Smith</surname> <given-names>J.</given-names></name> <name><surname>Choi</surname> <given-names>P. M.</given-names></name> <name><surname>Buntine</surname> <given-names>P.</given-names></name></person-group> (<year>2023</year>). <article-title>Will code one day run a code? Performance of language models on ACEM primary examinations and implications</article-title>. <source>Emerg. Med. Australas.</source> <volume>35</volume>, <fpage>876</fpage>&#x2013;<lpage>878</lpage>. doi: <pub-id pub-id-type="doi">10.1111/1742-6723.14280</pub-id>, PMID: <pub-id pub-id-type="pmid">37414729</pub-id></citation></ref>
<ref id="ref31"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Sun</surname> <given-names>H</given-names></name> <name><surname>Li</surname> <given-names>X</given-names></name> <name><surname>Xu</surname> <given-names>Y</given-names></name> <etal/></person-group>. <source>AutoHint: automatic prompt optimization with hint generation</source>. (<year>2023</year>). <comment>Available at:</comment> <ext-link xlink:href="https://arxiv.org/abs/2307.07415v2" ext-link-type="uri">https://arxiv.org/abs/2307.07415v2</ext-link></citation></ref>
<ref id="ref32"><citation citation-type="book"><person-group person-group-type="author"><name><surname>Tariq</surname> <given-names>R. A.</given-names></name> <name><surname>Vashisht</surname> <given-names>R.</given-names></name> <name><surname>Sinha</surname> <given-names>A.</given-names></name> <name><surname>Scherbak</surname> <given-names>Y.</given-names></name></person-group> (<year>2024</year>). &#x201C;<article-title>Medication dispensing errors and prevention</article-title>&#x201D; in <source>StatPearls</source>, vol. <volume>73</volume> (<publisher-name>StatPearls Publishing</publisher-name>), <fpage>171</fpage>&#x2013;<lpage>184</lpage>. <comment>Available at:</comment> <ext-link xlink:href="http://www.ncbi.nlm.nih.gov/books/NBK519065/" ext-link-type="uri">http://www.ncbi.nlm.nih.gov/books/NBK519065/</ext-link></citation></ref>
<ref id="ref33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Topol</surname> <given-names>E. J.</given-names></name></person-group> (<year>2019</year>). <article-title>High-performance medicine: the convergence of human and artificial intelligence</article-title>. <source>Nat. Med.</source> <volume>25</volume>, <fpage>44</fpage>&#x2013;<lpage>56</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id>, PMID: <pub-id pub-id-type="pmid">30617339</pub-id></citation></ref>
<ref id="ref34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ullah</surname> <given-names>E.</given-names></name> <name><surname>Parwani</surname> <given-names>A.</given-names></name> <name><surname>Baig</surname> <given-names>M. M.</given-names></name> <name><surname>Singh</surname> <given-names>R.</given-names></name></person-group> (<year>2024</year>). <article-title>Challenges and barriers of using large language models (LLM) such as ChatGPT for diagnostic medicine with a focus on digital pathology &#x2013; a recent scoping review</article-title>. <source>Diagn. Pathol.</source> <volume>19</volume>:<fpage>43</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13000-024-01464-7</pub-id>, PMID: <pub-id pub-id-type="pmid">38414074</pub-id></citation></ref>
<ref id="ref35"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X</given-names></name> <name><surname>Wei</surname> <given-names>J</given-names></name> <name><surname>Schuurmans</surname> <given-names>D</given-names></name> <etal/></person-group>. <source>Self-consistency improves chain of thought reasoning in language models</source>. (<year>2022</year>). <comment>Available at:</comment> <ext-link xlink:href="https://arxiv.org/abs/2203.11171v4" ext-link-type="uri">https://arxiv.org/abs/2203.11171v4</ext-link></citation></ref>
<ref id="ref36"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>J.</given-names></name> <name><surname>Bosma</surname> <given-names>M.</given-names></name> <name><surname>Zhao</surname> <given-names>V. Y.</given-names></name> <etal/></person-group>. (<year>2022a</year>). <article-title>Finetuned language models are zero-shot learners</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2109.01652</pub-id></citation></ref>
<ref id="ref37"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>J</given-names></name> <name><surname>Tay</surname> <given-names>Y</given-names></name> <name><surname>Bommasani</surname> <given-names>R</given-names></name> <etal/></person-group>. <source>Emergent abilities of large language models</source>. (<year>2022b</year>). <comment>Available at:</comment> <ext-link xlink:href="https://arxiv.org/abs/2206.07682v2" ext-link-type="uri">https://arxiv.org/abs/2206.07682v2</ext-link></citation></ref>
<ref id="ref38"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Schuurmans</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title>. <source>arXiv</source>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2201.11903</pub-id></citation></ref>
<ref id="ref39"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Yang</surname> <given-names>J.</given-names></name> <name><surname>Jin</surname> <given-names>H.</given-names></name> <name><surname>Tang</surname> <given-names>R.</given-names></name> <name><surname>Han</surname> <given-names>X.</given-names></name> <name><surname>Feng</surname> <given-names>Q.</given-names></name> <name><surname>Jiang</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2023</year>). <source>Harnessing the power of LLMs in practice: a survey on ChatGPT and beyond</source>. <volume>arXiv</volume>:<fpage>2304.13712</fpage>.</citation></ref>
<ref id="ref40"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Ye</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>D.</given-names></name> <name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Hwang</surname> <given-names>H.</given-names></name> <name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Jo</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <source>FLASK: Fine-grained language model evaluation based on alignment skill sets</source>. <volume>arXiv</volume>:<fpage>2307.10928</fpage>.</citation></ref>
<ref id="ref41"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>K. H.</given-names></name> <name><surname>Beam</surname> <given-names>A. L.</given-names></name> <name><surname>Kohane</surname> <given-names>I. S.</given-names></name></person-group> (<year>2018</year>). <article-title>Artificial intelligence in healthcare</article-title>. <source>Nat. Biomed. Eng.</source> <volume>2</volume>, <fpage>719</fpage>&#x2013;<lpage>731</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41551-018-0305-z</pub-id></citation></ref>
</ref-list>
</back>
</article>