<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v2.3 20070202//EN" "archivearticle.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="systematic-review" dtd-version="2.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Comput. Sci.</journal-id>
<journal-title>Frontiers in Computer Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Comput. Sci.</abbrev-journal-title>
<issn pub-type="epub">2624-9898</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fcomp.2025.1523699</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Computer Science</subject>
<subj-group>
<subject>Systematic Review</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Evaluating large language models: a systematic review of efficiency, applications, and future directions</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Saleh</surname> <given-names>Yasmeen</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Abu Talib</surname> <given-names>Manar</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Nasir</surname> <given-names>Qassim</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Dakalbab</surname> <given-names>Fatima</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2645602/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>Department of Computer Science, College of Computing and Informatics, University of Sharjah</institution>, <addr-line>Sharjah</addr-line>, <country>United Arab Emirates</country></aff>
<aff id="aff2"><sup>2</sup><institution>Department of Computer Engineering, College of Computing and Informatics, University of Sharjah</institution>, <addr-line>Sharjah</addr-line>, <country>United Arab Emirates</country></aff>
<author-notes>
<fn fn-type="edited-by" id="fn0001">
<p>Edited by: Barkaoui Kamel, Conservatoire National des Arts et M&#x00E9;tiers (CNAM), France</p>
</fn>
<fn fn-type="edited-by" id="fn0002">
<p>Reviewed by: Adamantios Koumpis, University Hospital of Cologne, Germany</p>
<p>Sabina Rossi, Ca&#x2019; Foscari University of Venice, Italy</p>
<p>Xiaoding Wang, Fujian Normal University, China</p>
</fn>
<corresp id="c001">&#x002A;Correspondence: Manar Abu Talib, <email>mtalib@sharjah.ac.ae</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>27</day>
<month>05</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>7</volume>
<elocation-id>1523699</elocation-id>
<history>
<date date-type="received">
<day>06</day>
<month>11</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>05</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2025 Saleh, Abu Talib, Nasir and Dakalbab.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Saleh, Abu Talib, Nasir and Dakalbab</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Large language models, the innovative breakthrough taking the world by storm, have been applied in several fields, such as medicine, education, finance, and law. Moreover, large language models can integrate into those fields through their abilities in natural language processing, text generation, question answering, and several other use cases that benefit human interactions and decision-making. Furthermore, it is imperative to acknowledge the differences involved with large language models beyond their applications by considering aspects such as their types, setups, parameters, and performance. This could help us understand how each large language model could be utilized to its fullest extent for maximum benefit. In this systematic literature review, we explore each of these aspects in depth. Finally, we conclude with insights and future directions for advancing the efficiency and applicability of large language models.</p>
</abstract>
<kwd-group>
<kwd>large language models</kwd>
<kwd>LLMs</kwd>
<kwd>efficiency</kwd>
<kwd>performance</kwd>
<kwd>application</kwd>
</kwd-group>
<counts>
<fig-count count="6"/>
<table-count count="8"/>
<equation-count count="0"/>
<ref-count count="55"/>
<page-count count="13"/>
<word-count count="8963"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Theoretical Computer Science</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>In today&#x2019;s world, human interaction with artificial intelligence has significantly risen thanks to the recent advancements in large language models and natural language processing. The field of large language models, while still an emerging subfield of artificial intelligence, is a vast field with varying types and specifications of each large language model and the limitations and accuracies of each. To discover this vast field more, we must develop a basic understanding of large language models, their history, applications, and challenges. Furthermore, efficiency in large language models involves several aspects, including hardware and software requirements, sourcing, training, and output accuracy. Understanding and optimizing the efficiencies of these models is imperative, given the increasing reliance on such technology in various applications. To the best of our knowledge, there are very few Systematic Literature Reviews (SLR) on the efficiencies of large language models, which has motivated this work. Therefore, this systematic literature review aims to provide a comprehensive overview of the state-of-the-art research on the efficiencies of large language models.</p>
<p>Firstly, language models possess the skill of assigning probabilities to sequences of tokens by analyzing statistical patterns in the distribution of a sequence of tokens within data. Modern language models include multiple neural network layers representing tokens within a multidimensional feature space. Unlike early n-gram models that only learned transition probabilities between one-word sequences and the following, neural language models could utilize pre-trained representation of words, called embedding (<xref ref-type="bibr" rid="ref2">Bender et al., 2021</xref>; <xref ref-type="bibr" rid="ref45">Trott et al., 2023</xref>). Since language models cannot store or recall information, having a memory component, such as vector stores, is imperative. Vector stores help search and store embedded data. When data retrieval is required from the vector store, invoked by a user query, the documents could be passed to the large language models (LLMs) through multiple methods. One of the most used methods is called the stuff method. This method is most efficient when passing similar documents in a single prompt, whereas other methods can be used in processing documents that cannot be passed in a single prompt (<xref ref-type="bibr" rid="ref44">Topsakal and Akinci, 2023</xref>). Modern LLMs are thus sophisticated neural language models with billions of parameters that incorporate several deep learning techniques.</p>
<p>Moreover, it is no wonder that with such powerful internal workflow complexity and ease of user access and querying, LLMs will be able to solve a wide range of tasks with complexities while being user-friendly. This sparked the widespread usage and integration of LLMs across various areas and into multiple fields, such as medicine, education, finance, and law. LLMs are essential in disease prediction, diagnosis, and assessment of therapeutic targets in medicine. These include providing treatment guidelines for cancer patients based on their magnetic resonance imaging radionics and predicting aging-related diseases (<xref ref-type="bibr" rid="ref41">Singhal et al., 2023</xref>; <xref ref-type="bibr" rid="ref4">Cascella et al., 2023</xref>; <xref ref-type="bibr" rid="ref17">Jo et al., 2023</xref>). ChatGPT, an LLM, was used to write preauthorization requests for dental insurance companies. Tailored and fine-tuned applications based on LLMs can enhance dental telemedicine services when combined with dental health care personnel (<xref ref-type="bibr" rid="ref14">Huang et al., 2023</xref>; <xref ref-type="bibr" rid="ref8">Eggmann et al., 2023</xref>). In education, ChatGPT was used to evaluate student-generated answers in a learning environment and help students generate answers to their questions (<xref ref-type="bibr" rid="ref32">Porsdam Mann et al., 2023</xref>; <xref ref-type="bibr" rid="ref27">Meyer et al., 2023</xref>; <xref ref-type="bibr" rid="ref20">Kasneci et al., 2023</xref>; <xref ref-type="bibr" rid="ref28">Milano et al., 2023</xref>; <xref ref-type="bibr" rid="ref24">Lund et al., 2023</xref>). Financial applications include fraud detection, algorithmic trading, and risk assessment (<xref ref-type="bibr" rid="ref9">Fan, 2024</xref>). In legal settings, LLMs support document analysis, contract review, and automated legal reasoning (<xref ref-type="bibr" rid="ref39">Siino et al., 2025</xref>). 
In addition to their established uses in medicine, education, finance, and law, LLMs are being explored in emerging fields such as blockchain. Recent research has highlighted the use of LLMs to automate smart contract verification and improve security in decentralized systems (<xref ref-type="bibr" rid="ref34">Ressi et al., 2024</xref>). AI-enhanced blockchain technology provides new prospects for boosting trust and accuracy in contract execution, which is still an area for future research.</p>
<p>Unfortunately, despite their various applications, there are still many challenges relating to performance, ethics, and many more. On the ethical front, concerns grow regarding bias and integrity, as these models, developed on extensive data collections, may unknowingly perpetuate and reinforce existing biases present in the training data. This raises questions about the accuracy and fairness of the outputs generated by these models, especially in sensitive applications such as hiring processes or automated decision-making (<xref ref-type="bibr" rid="ref13">Head et al., 2023</xref>). The extensive knowledge these models acquire while training raises serious privacy problems, raising the possibility of inadvertently disclosing sensitive information and necessitating the implementation of strong privacy protections. Furthermore, the necessity of developing ethical standards to stop malicious use is emphasized by the possibility of manipulating and abusing massive language models to produce false information or participate in disinformation campaigns (<xref ref-type="bibr" rid="ref52">Wu et al., 2023</xref>).</p>
<p>Beyond ethical concerns, LLMs encounter other challenges. The computational resources required for training and fine-tuning are extensive, limiting access to these technologies for smaller organizations and researchers with constrained computing capabilities. The dependency on training data introduces challenges related to the diversity and quality of the data, potentially leading to difficulties in understanding specific contexts or generating appropriate responses for underrepresented topics. Adapting these models to domain-specific contexts requires careful consideration, as fine-tuning for specialized tasks may be resource-intensive and may only sometimes yield optimal results (<xref ref-type="bibr" rid="ref6">Deng et al., 2023</xref>). The delicate balance between human-machine collaboration presents a challenge, as it is crucial to ensure that these models augment human capabilities without replacing critical decision-making processes (<xref ref-type="bibr" rid="ref2">Bender et al., 2021</xref>; <xref ref-type="bibr" rid="ref45">Trott et al., 2023</xref>).</p>
<p>Continuous learning and updating pose challenges as well. LLMs need frequent updates to stay relevant and accurate, necessitating a robust infrastructure for managing model evolution and ensuring seamless integration with emerging information sources. These challenges underscore the importance of a collaborative effort involving researchers, policymakers, and industry stakeholders to establish ethical guidelines, develop governance mechanisms, and foster responsible use of LLMs. As these models play a transformative role in diverse domains, addressing these challenges is imperative for ensuring ethical and effective social integration (<xref ref-type="bibr" rid="ref45">Trott et al., 2023</xref>).</p>
<p>With this systematic literature review, we look forward to providing a comprehensive analysis and comparison of the efficiencies of different LLMs. We will contribute to presenting such a comparison by presenting the information we collected on the hardware and software requirements, sourcing, training, and output accuracy associated with these models. This represents a critical step in understanding the multifaceted dimensions of LLM efficiencies, enabling researchers, practitioners, and policymakers to make informed decisions about their utilization and development. By shedding light on the current state of knowledge in this domain, we aim to facilitate the development of accurate and optimal solutions in the era of LLMs.</p>
<p>The remainder of this paper is divided into six sections: Section 2 provides information on related work. Section 3 describes the methodology. Section 4 lists the results and discussions. Section 5 addresses the limitations of this review. Finally, Section 6 concludes and suggests directions for future work.</p>
</sec>
<sec id="sec2">
<label>2</label>
<title>Related work</title>
<p>During our research, we found a total of 7 survey papers that are related to our topic. These papers have been published in the last 5&#x202F;years, and most of the documents tackled the advantages, disadvantages, and ethical and legal issues associated with LLMs. Despite our paper discussing similar points, we have mainly focused on the efficiency aspect of LLMs, unlike the other papers. Furthermore, we developed a deeper understanding of LLMs, their efficiencies, application, and overall benefits to compare our work with others. All the papers mentioned below discuss LLMs.</p>
<p><xref ref-type="bibr" rid="ref11">Floridi (2023)</xref>, <xref ref-type="bibr" rid="ref30">M&#x00F6;kander et al. (2023)</xref>, and <xref ref-type="bibr" rid="ref43">Teubner et al. (2023)</xref> discuss topics of ethical and legal matters regarding LLMs. To be specific, <xref ref-type="bibr" rid="ref11">Floridi (2023)</xref> talks about intelligence regarding LLMs. The author provides information regarding LLMs&#x2019; possible implications and ethical, legal, and human costs. Floridi compares the spiritual, animal, and AI agents and how we interact with them (<xref ref-type="bibr" rid="ref11">Floridi, 2023</xref>). Secondly, <xref ref-type="bibr" rid="ref30">M&#x00F6;kander et al. (2023)</xref> delved deeper into implications and discussed auditing, its importance, methods, and limitations. As the author explained, auditing is the governing process used to recognize and alleviate issues with AI (artificial intelligence) technologies. Auditing LLMs can be done through a three-layered approach, which includes (governance, model, and application) (<xref ref-type="bibr" rid="ref30">M&#x00F6;kander et al., 2023</xref>). Thirdly, <xref ref-type="bibr" rid="ref43">Teubner et al. (2023)</xref> discussed the expectations and future involved with LLMs and their implications. Teubner defends LLMs by pointing out that acknowledging their power instead of banning them is a more reasonable action toward LLMs&#x2019; growth. He also discusses their effectiveness, legality, and threats, clarifying misconceptions and supporting integrating and adopting LLMs into society (<xref ref-type="bibr" rid="ref43">Teubner et al., 2023</xref>). Fourthly, PLMs (pre-trained language models) and NLP (natural language processing), two fields relating to LLMs, were explored by <xref ref-type="bibr" rid="ref29">Min et al. (2023)</xref>. 
The survey provides background information on PLMs and categorizes the utilization of PLMs for NLPs into three paradigms: pre-train then fine-tune, prompt-based learning, and NLP as text generation, each discussed in depth (<xref ref-type="bibr" rid="ref29">Min et al., 2023</xref>). Next, <xref ref-type="bibr" rid="ref23">Liu et al. (2023)</xref> discuss prompting and provide in-depth background information. The author also explains more complex ideas, such as multi-prompt learning methods and prompt engineering, and provides information on the topic&#x2019;s applications and challenges (<xref ref-type="bibr" rid="ref23">Liu et al., 2023</xref>). Furthermore, <xref ref-type="bibr" rid="ref18">Kamnis (2023)</xref> explores GPTs (generative pre-trained transformers) through surface engineering. However, the author&#x2019;s main idea is custom data indexing, which enables entities to organize and store data using AI tools for efficient data retrieval. The author compares GPT-4 and a fine-tuned data-indexed GPT-3 model, evaluating them on their query-answering performances (<xref ref-type="bibr" rid="ref18">Kamnis, 2023</xref>). Finally, <xref ref-type="bibr" rid="ref33">Qureshi et al. (2023)</xref> investigate LLMs&#x2019;, specifically ChatGPT&#x2019;s, ability to integrate into SRs (systematic reviews). The author tests ChatGPT&#x2019;s utility and applicability by quizzing it on language interpretation tasks related to systematic reviews. Although ChatGPT faced some challenges, it could still form responses according to what was requested (<xref ref-type="bibr" rid="ref33">Qureshi et al., 2023</xref>). <xref ref-type="bibr" rid="ref11">Floridi (2023)</xref>, <xref ref-type="bibr" rid="ref30">M&#x00F6;kander et al. (2023)</xref>, and <xref ref-type="bibr" rid="ref43">Teubner et al. (2023)</xref> all discuss similar topics of ethical and legal matters regarding LLMs. However, they seem to lack information on the efficiencies of LLMs. 
Except for <xref ref-type="bibr" rid="ref11">Floridi (2023)</xref>, the other two papers, <xref ref-type="bibr" rid="ref30">M&#x00F6;kander et al. (2023)</xref> and <xref ref-type="bibr" rid="ref43">Teubner et al. (2023)</xref>, did not include comparisons between LLMs. Similarly, <xref ref-type="bibr" rid="ref33">Qureshi et al. (2023)</xref> do not conduct comparisons, but they test and discuss topics related to ChatGPT. On the contrary, <xref ref-type="bibr" rid="ref18">Kamnis (2023)</xref> compares, but the topic is too specific.</p>
<p>In our work, we will conduct a systematic literature review comparing different LLMs focusing on efficiency. <xref ref-type="table" rid="tab1">Table 1</xref> shows the contributions of each paper. We have added a column describing the difference between our contribution and the others&#x2019;.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Summary of related work.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Ref.</th>
<th align="center" valign="top">Year</th>
<th align="left" valign="top">Contributions</th>
<th align="left" valign="top">Difference</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref11">Floridi (2023)</xref>
</td>
<td align="center" valign="top">2023</td>
<td align="left" valign="top">Talks about different LLMs, their pros and cons, and their ethical and legal issues.</td>
<td align="left" valign="top">Provides no information on the requirements or efficiency of the large language models.</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref30">M&#x00F6;kander et al. (2023)</xref>
</td>
<td align="center" valign="top">2023</td>
<td align="left" valign="top">This paper analyses and evaluates LLMs from technical, ethical, and legal perspectives. It talks about the opportunities and risks of LLMs, highlights the properties that undermine the feasibility and effectiveness of existing AI auditing procedures, and derives and defends seven claims about how LLM auditing procedures should be designed and how to structure such procedures.</td>
<td align="left" valign="top">It does not compare large language models or cover the main idea, efficiency.</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref43">Teubner et al. (2023)</xref>
</td>
<td align="center" valign="top">2023</td>
<td align="left" valign="top">This paper discusses the emergence of ChatGPT and LLMs in general and their limits, threats, and legality.</td>
<td align="left" valign="top">It does not directly compare the efficiencies of different large language models and mainly discusses ChatGPT.</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref29">Min et al. (2023)</xref>
</td>
<td align="center" valign="top">2023</td>
<td align="left" valign="top">This paper surveys the three trending paradigms that use pre-trained language models for natural language processing. The paper describes each of them in-depth, summarizes prior works whose applications have shown promise, and discusses limitations.</td>
<td align="left" valign="top">It compares large language models from a natural language processing perspective, but not generally.</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref23">Liu et al. (2023)</xref>
</td>
<td align="center" valign="top">2023</td>
<td align="left" valign="top">This paper summarizes and analyses several paradigms in developing statistical natural language processing techniques. It also highlights the commonalities and differences between the four paradigms of natural language processing.</td>
<td align="left" valign="top">Compares large language models from a prompting parameters perspective.</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref18">Kamnis (2023)</xref>
</td>
<td align="center" valign="top">2023</td>
<td align="left" valign="top">This paper demonstrates that a fine-tuned data-indexed GPT model can significantly improve query response performance compared to state-of-the-art GPT-4. This model can provide more accurate, coherent, and relevant responses, which have important implications for developing and applying natural language processing models in surface engineering domains by utilizing domain adaptation and data indexing techniques.</td>
<td align="left" valign="top">Focuses on large language models, specifically GPT, for surface engineering.</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref33">Qureshi et al. (2023)</xref>
</td>
<td align="center" valign="top">2023</td>
<td align="left" valign="top">This paper discusses the capability of ChatGPT and other LLMs and their limitations or reliability in being integrated into systematic reviews.</td>
<td align="left" valign="top">It only tests ChatGPT and does not test or compare it with other models.</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec sec-type="methods" id="sec3">
<label>3</label>
<title>Methodology</title>
<p>In this critical review, we used the methodology proposed by Kitchenham and Charters to implement our review. This approach comprises planning, conducting, and reporting phases, each consisting of various stages. During the planning phase, a review protocol was formulated, encompassing six stages: articulating research questions, devising the search strategy, delineating study selection procedures, specifying quality assessment rules, outlining the data extraction strategy, and combining the extracted data. <xref ref-type="fig" rid="fig1">Figure 1</xref> illustrates the six stages mentioned.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>The stages of conducting a systematic literature review.</p>
</caption>
<graphic xlink:href="fcomp-07-1523699-g001.tif"/>
</fig>
<p><xref ref-type="fig" rid="fig1">Figure 1</xref> illustrates our journey from identifying research questions to synthesizing extracted data. The stages involve identifying search terms, searching, initial results, filtering, acquiring the final papers, applying data extraction strategies, finalizing the extraction, and finally synthesizing the extracted data.</p>
<p><xref ref-type="fig" rid="fig2">Figure 2</xref>, shown below, illustrates the process we followed in narrowing down our research papers. It first starts with identifying our research questions and search terms. Secondly, we apply an initial search and filtration process. Lastly, we finalize the extraction and double-check whether the research technique needs to be repeated.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Applied research methodology.</p>
</caption>
<graphic xlink:href="fcomp-07-1523699-g002.tif"/>
</fig>
<sec id="sec4">
<label>3.1</label>
<title>Research questions</title>
<p>The formulation of research questions was as follows:</p>
<list list-type="bullet">
<list-item>
<p>RQ1: What is the large language model&#x2019;s application and use case deployed?</p>
</list-item>
</list>
<list list-type="simple">
<list-item>
<p>This question aims to understand the diverse range of applications where large language models are utilized, shedding light on the practical contexts in which they are deployed.</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p>RQ2: Which specific type of large language model is employed? Is the considered model open source?</p>
</list-item>
</list>
<list list-type="simple">
<list-item>
<p>This question seeks to identify the specific models used in different studies and assess whether they are open-source or proprietary, which can affect replicability and accessibility.</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p>RQ3: What prerequisites and resource demands are utilized in deploying a large language model? Which hardware specifications were used in the experiment? What were the model parameters employed in the experiment?</p>
</list-item>
</list>
<list list-type="simple">
<list-item>
<p>The sub-questions delve into the hardware and computational requirements and the model parameters, providing insights into the resource demands of deploying large language models.</p>
</list-item>
</list>
<list list-type="bullet">
<list-item>
<p>RQ4: What are the methodologies for assessing the performance metrics of the large language model deployed?</p>
</list-item>
</list>
<list list-type="simple">
<list-item>
<p>This question aims to understand the evaluation methods and metrics employed to assess the performance of large language models in various applications, offering insights into their effectiveness and limitations.</p>
</list-item>
</list>
</sec>
<sec id="sec5">
<label>3.2</label>
<title>Search strategy</title>
<p>Moving on to the subsequent stage, we provide the search strategy, aligning it with the initial stage to retrieve pertinent articles. Identifying search terms and the leading publishers used, essential for precision in the search, was also addressed.</p>
<sec id="sec6">
<label>3.2.1</label>
<title>Key search terms</title>
<p><xref ref-type="table" rid="tab2">Table 2</xref>, shown below, presents the key search terms used in the search process. These search expressions were identified based on three criteria: Firstly, the research questions were the main driver to guide the determination of the search phrases. Secondly, Boolean operators such as ANDs and ORs were utilized to aid in filtering the search results. Thirdly, new search terms were discovered by exploring relevant resources.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Display of key search terms.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">LLM keywords</th>
<th align="center" valign="top">Operator</th>
<th align="left" valign="top">Performance keywords</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">&#x201C;Large language model&#x201D; OR &#x201C;LLM&#x201D; OR &#x201C;Efficient language models&#x201D; OR &#x201C;Prompt-based language models&#x201D; OR &#x201C;Generative pre-trained transformer &#x201C;OR &#x201C;ChatGPT&#x201D; OR &#x201C;GPT-3&#x201D; OR &#x201C;GPT-4&#x201D; OR &#x201C;google BARD&#x201D; OR &#x201C;LLaMA&#x201D;</td>
<td align="center" valign="middle">AND</td>
<td align="left" valign="top">&#x201C;Efficiency&#x201D; OR &#x201C;Optimization&#x201D; OR &#x201C;Contextual prompts efficiency&#x201D; OR &#x201C;Prompt optimization transformer models.&#x201D;</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec7">
<label>3.2.2</label>
<title>Publishers</title>
<list list-type="order">
<list-item>
<p>ACM Digital Library</p>
</list-item>
<list-item>
<p>Springer</p>
</list-item>
<list-item>
<p>IEEE Xplore</p>
</list-item>
<list-item>
<p>Elsevier Science Direct</p>
</list-item>
<list-item>
<p>Google Scholar</p>
</list-item>
</list>
</sec>
</sec>
<sec id="sec8">
<label>3.3</label>
<title>Study selection</title>
<p>Stage three focused on selection criteria and establishing inclusion and exclusion rules, as shown in <xref ref-type="table" rid="tab3">Table 3</xref>.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Exclusion and inclusion criteria.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Inclusion rules</th>
<th align="left" valign="top">Exclusion rules</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">
<list list-type="bullet">
<list-item>
<p>Trusted source.</p>
</list-item>
<list-item>
<p>Published in the last 5&#x202F;years.</p>
</list-item>
<list-item>
<p>Direct mention of large language models.</p>
</list-item>
</list>
</td>
<td align="left" valign="top">
<list list-type="bullet">
<list-item>
<p>Weak or unknown source.</p>
</list-item>
<list-item>
<p>Archive (unpublished).</p>
</list-item>
<list-item>
<p>Papers that only talk about using large language models in passing, rather than specifically.</p>
</list-item>
<list-item>
<p>Papers that do not mention large language models.</p>
</list-item>
<list-item>
<p>Papers scoring less than four on the QAR total score.</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec9">
<label>3.4</label>
<title>Quality assessment rules (QARs)</title>
<p>In stage 4, we evaluate the collected research articles based on the following QAR set. The QAR utilized in this research are listed below:</p>
<p>QAR 1: Is the application and use case of the deployed large language models stated?</p>
<p>QAR 2: Are the types of large language models used identified and explained?</p>
<p>QAR 3: Are the requirements for deploying the large language model detailed?</p>
<p>QAR 4: Is there a comparison between the efficiency of different large language models?</p>
<p>QAR 5: Is the evaluation of the large language model/s well performed?</p>
<p>QAR 6: Is the method used for evaluating the large language model clear and accurate?</p>
<p>QAR 7: Are the performance metrics of large language models clearly defined and used?</p>
<p>QAR 8: Is the large language model&#x2019;s experimental setup stated and clear?</p>
<p>QAR 9: Are the large language models&#x2019; parameters described clearly and concisely?</p>
<p>QAR 10: Does this study provide enough information and evidence to be considered as related to our work?</p>
<p>Each QAR score is allocated based on the following scale. &#x2018;Not answered&#x2019; is assigned a score of 0, &#x2018;below average&#x2019; is valued at 0.25, &#x2018;average&#x2019; is given a score of 0.5, &#x2018;above average&#x2019; is designated 0.75, and &#x2018;fully answered&#x2019; is assigned a score of 1. Each study&#x2019;s overall QAR score was determined on a scale of 1 to 10. Studies that had a score of less than four were disqualified from further synthesis in accordance with our review process. The assessment adhered to a uniform and repeatable methodology that was established during the systematic review&#x2019;s preparation phase, even though the authoring team handled the scoring. This strategy aligns with Kitchenham and Charters&#x2019; suggestions, which highlight protocol-driven quality evaluation as a means of minimizing bias and improving transparency in software engineering reviews.</p>
</sec>
<sec id="sec10">
<label>3.5</label>
<title>Data extraction strategy</title>
<p>We created a sheet for articles that we found and collected. The sheet includes information regarding the large language model, paper number, paper URL, paper title, author/s, publisher, publisher source, publication type, year of publication, paper description, RQ1(field), RQ2(LLM type, source), RQ3(software requirements, hardware requirements, model parameters), and RQ4(performance metrics). It&#x2019;s imperative to note that not all research papers can answer the research questions.</p>
</sec>
<sec id="sec11">
<label>3.6</label>
<title>Synthesis of extracted data</title>
<p>As emphasized by Kitchenham and Charters, the review protocol holds significant importance in any SLR. Consequently, the authors have held regular meetings to mitigate researcher bias and uphold the quality of the review protocol. Due to the nature of our findings, our data synthesis technique is qualitative because our RQs do not involve numbers or calculations. In the results and discussions section below, we will organize the data in diagrams to the best of our ability.</p>
</sec>
</sec>
<sec sec-type="results|discussions" id="sec12">
<label>4</label>
<title>Results and discussions</title>
<p>This section will discuss the answers to the RQs and their subsections, enabling us to conclude our results for this SLR.</p>
<sec id="sec13">
<label>4.1</label>
<title>RQ1: LLM application and use cases</title>
<p>In this research question, we aim to understand in which field or area the large language model was utilized. Since each paper discussed a different topic in different fields, we created categories to help organize the collected research papers. After studying the papers carefully, we found that most papers covered four fields: data generation (image, text, code, etc.), prompting, modification or control of data (editing, deletion, retrieval, etc.), and prediction. We then illustrated the result of this categorization in <xref ref-type="fig" rid="fig3">Figure 3</xref>.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>Frequency histogram of the 4 LLM categories.</p>
</caption>
<graphic xlink:href="fcomp-07-1523699-g003.tif"/>
</fig>
<p>While prompt engineering and data generation may overlap in practice, they are fundamentally independent categories. Designing and organizing input prompts to elicit particular actions or enhance the quality of model output is the primary purpose of prompt engineering. This covers prompt adjustments, prompt templates, and zero&#x2212;/few-shot instances. Data generation, on the other hand, deals with the output process itself, when new artifacts like text passages, code snippets, or summaries are generated from the LLM. Studies were categorized according to their main goal as stated in each paper: tasks that focused on generating outputs were labeled as data generation, whereas operations that focused on modifying inputs were categorized as prompt engineering. Research papers on LLMs in data generation and prompting appeared most frequently. The references for the documents in each category are listed in <xref ref-type="table" rid="tab4">Table 4</xref>. Out of 27 papers, 13 were related to data generation, nine were prompting-related, four were related to the modification or control of data, and one was associated with data prediction.</p>
<table-wrap position="float" id="tab4">
<label>Table 4</label>
<caption>
<p>Paper reference numbers in each RQ1 category.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Reference</th>
<th align="left" valign="top">RQ1 Category</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle"><xref ref-type="bibr" rid="ref6">Deng et al. (2023)</xref>; <xref ref-type="bibr" rid="ref37">Sarsa et al. (2022)</xref>; <xref ref-type="bibr" rid="ref12">H&#x00E4;m&#x00E4;l&#x00E4;inen et al. (2023)</xref>; <xref ref-type="bibr" rid="ref36">Ross et al. (2023)</xref>; <xref ref-type="bibr" rid="ref1">Badini et al. (2023)</xref>; <xref ref-type="bibr" rid="ref26">Mahuli et al. (2023)</xref>; <xref ref-type="bibr" rid="ref25">Macneil et al. (2022)</xref>; <xref ref-type="bibr" rid="ref54">Xu et al. (2022)</xref>; <xref ref-type="bibr" rid="ref15">Jain et al. (2022)</xref>; <xref ref-type="bibr" rid="ref47">Vaithilingam et al. (2022)</xref>; <xref ref-type="bibr" rid="ref7">Di Fede et al. (2022)</xref>; <xref ref-type="bibr" rid="ref22">King (2023)</xref>; <xref ref-type="bibr" rid="ref19">Kang et al. (2023)</xref></td>
<td align="left" valign="middle">Generation of Data</td>
</tr>
<tr>
<td align="left" valign="middle"><xref ref-type="bibr" rid="ref42">Strobelt et al. (2023)</xref>; <xref ref-type="bibr" rid="ref51">Wang et al. (2023)</xref>; <xref ref-type="bibr" rid="ref5">Chang (2023)</xref>; <xref ref-type="bibr" rid="ref55">Zamfirescu-Pereira et al. (2023)</xref>; <xref ref-type="bibr" rid="ref53">Wu et al. (2022)</xref>; <xref ref-type="bibr" rid="ref16">Jiang et al. (2022)</xref>; <xref ref-type="bibr" rid="ref35">Reynolds and McDonell (2021)</xref>; <xref ref-type="bibr" rid="ref40">Singh et al. (2023)</xref>; <xref ref-type="bibr" rid="ref3">Beurer-Kellner et al. (2023)</xref></td>
<td align="left" valign="middle">Prompting Related</td>
</tr>
<tr>
<td align="left" valign="middle"><xref ref-type="bibr" rid="ref31">Pan and Ke (2023)</xref>; <xref ref-type="bibr" rid="ref38">Scells et al. (2023)</xref>; <xref ref-type="bibr" rid="ref10">Fan et al. (2023)</xref>; <xref ref-type="bibr" rid="ref46">Urban et al. (2023)</xref></td>
<td align="left" valign="middle">Modification or Control of Data</td>
</tr>
<tr>
<td align="left" valign="middle">
<xref ref-type="bibr" rid="ref21">Kim et al. (2021)</xref>
</td>
<td align="left" valign="middle">Prediction of Data</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="sec14">
<label>4.2</label>
<title>RQ2: LLM type and access</title>
<p>In this research question, we plan to investigate the type of large language model used in the research paper. We also explore whether the LLM is open source or closed source. <xref ref-type="fig" rid="fig4">Figure 4</xref> displays the LLM type along with the frequency; <xref ref-type="fig" rid="fig5">Figure 5</xref> shows whether the LLM type is open or closed source, while <xref ref-type="table" rid="tab5">Table 5</xref> provides a combination of references for papers involved with each LLM type and source.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Frequency histogram of the LLM types.</p>
</caption>
<graphic xlink:href="fcomp-07-1523699-g004.tif"/>
</fig>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Open source LLM frequency.</p>
</caption>
<graphic xlink:href="fcomp-07-1523699-g005.tif"/>
</fig>
<table-wrap position="float" id="tab5">
<label>Table 5</label>
<caption>
<p>LLM type for each paper.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top" colspan="4">LLM Frequency</th>
</tr>
<tr>
<th align="left" valign="top">Reference Number</th>
<th align="left" valign="top">Large Language Model</th>
<th align="center" valign="top">Frequency</th>
<th align="center" valign="top">Open-Source?</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top"><xref ref-type="bibr" rid="ref37">Sarsa et al. (2022)</xref>; <xref ref-type="bibr" rid="ref12">H&#x00E4;m&#x00E4;l&#x00E4;inen et al. (2023)</xref>; <xref ref-type="bibr" rid="ref25">Macneil et al. (2022)</xref>; <xref ref-type="bibr" rid="ref5">Chang (2023)</xref>; <xref ref-type="bibr" rid="ref15">Jain et al. (2022)</xref>; <xref ref-type="bibr" rid="ref7">Di Fede et al. (2022)</xref>; <xref ref-type="bibr" rid="ref35">Reynolds and McDonell (2021)</xref>; <xref ref-type="bibr" rid="ref40">Singh et al. (2023)</xref>; <xref ref-type="bibr" rid="ref3">Beurer-Kellner et al. (2023)</xref>; <xref ref-type="bibr" rid="ref46">Urban et al. (2023)</xref></td>
<td align="left" valign="top">GPT-3</td>
<td align="center" valign="top">10</td>
<td align="center" valign="top">No</td>
</tr>
<tr>
<td align="left" valign="top"><xref ref-type="bibr" rid="ref6">Deng et al. (2023)</xref>; <xref ref-type="bibr" rid="ref37">Sarsa et al. (2022)</xref>; <xref ref-type="bibr" rid="ref36">Ross et al. (2023)</xref>; <xref ref-type="bibr" rid="ref54">Xu et al. (2022)</xref>; <xref ref-type="bibr" rid="ref47">Vaithilingam et al. (2022)</xref>; <xref ref-type="bibr" rid="ref19">Kang et al. (2023)</xref>; <xref ref-type="bibr" rid="ref40">Singh et al. (2023)</xref>; <xref ref-type="bibr" rid="ref10">Fan et al. (2023)</xref></td>
<td align="left" valign="top">Codex</td>
<td align="center" valign="top">8</td>
<td align="center" valign="top">Yes</td>
</tr>
<tr>
<td align="left" valign="top"><xref ref-type="bibr" rid="ref53">Wu et al. (2022)</xref>; <xref ref-type="bibr" rid="ref31">Pan and Ke (2023)</xref>; <xref ref-type="bibr" rid="ref22">King (2023)</xref></td>
<td align="left" valign="top">LaMDA</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">Yes</td>
</tr>
<tr>
<td align="left" valign="top"><xref ref-type="bibr" rid="ref1">Badini et al. (2023)</xref>; <xref ref-type="bibr" rid="ref26">Mahuli et al. (2023)</xref>; <xref ref-type="bibr" rid="ref55">Zamfirescu-Pereira et al. (2023)</xref></td>
<td align="left" valign="top">GPT-3.5</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">No</td>
</tr>
<tr>
<td align="left" valign="top"><xref ref-type="bibr" rid="ref54">Xu et al. (2022)</xref>; <xref ref-type="bibr" rid="ref3">Beurer-Kellner et al. (2023)</xref>; <xref ref-type="bibr" rid="ref21">Kim et al. (2021)</xref></td>
<td align="left" valign="top">GPT-2</td>
<td align="center" valign="top">3</td>
<td align="center" valign="top">Yes</td>
</tr>
<tr>
<td align="left" valign="top"><xref ref-type="bibr" rid="ref54">Xu et al. (2022)</xref>; <xref ref-type="bibr" rid="ref3">Beurer-Kellner et al. (2023)</xref></td>
<td align="left" valign="top">GPT-J</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">Yes</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref42">Strobelt et al. (2023)</xref>
</td>
<td align="left" valign="top">T0</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">Yes</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref51">Wang et al. (2023)</xref>
</td>
<td align="left" valign="top">PaLM</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">No</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref31">Pan and Ke (2023)</xref>
</td>
<td align="left" valign="top">Stytr2</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">Yes</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref38">Scells et al. (2023)</xref>
</td>
<td align="left" valign="top">PubMed-BERT, BERT, DistilBERT</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">Yes</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref54">Xu et al. (2022)</xref>
</td>
<td align="left" valign="top">GPT-Neo, GPT-NeoX, CodeParrot</td>
<td align="center" valign="top">1</td>
<td align="center" valign="top">Yes</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>From the results in <xref ref-type="fig" rid="fig4">Figure 4</xref> above, we infer that GPT-3 was the most utilized LLM in the research papers studied, with a frequency of 10, meaning it has been used or mentioned by 10 papers. Next comes Codex, with a frequency of 8, making it the second most used or mentioned LLM amongst the papers investigated. LaMDA, GPT-3.5, and GPT-2 are all tied with a frequency of 3. Papers (<xref ref-type="bibr" rid="ref6">Deng et al., 2023</xref>; <xref ref-type="bibr" rid="ref37">Sarsa et al., 2022</xref>; <xref ref-type="bibr" rid="ref12">H&#x00E4;m&#x00E4;l&#x00E4;inen et al., 2023</xref>; <xref ref-type="bibr" rid="ref36">Ross et al., 2023</xref>; <xref ref-type="bibr" rid="ref1">Badini et al., 2023</xref>; <xref ref-type="bibr" rid="ref26">Mahuli et al., 2023</xref>; <xref ref-type="bibr" rid="ref25">Macneil et al., 2022</xref>; <xref ref-type="bibr" rid="ref54">Xu et al., 2022</xref>; <xref ref-type="bibr" rid="ref42">Strobelt et al., 2023</xref>; <xref ref-type="bibr" rid="ref51">Wang et al., 2023</xref>; <xref ref-type="bibr" rid="ref5">Chang, 2023</xref>; <xref ref-type="bibr" rid="ref55">Zamfirescu-Pereira et al., 2023</xref>; <xref ref-type="bibr" rid="ref53">Wu et al., 2022</xref>; <xref ref-type="bibr" rid="ref31">Pan and Ke, 2023</xref>; <xref ref-type="bibr" rid="ref38">Scells et al., 2023</xref>) were papers that either decided to deploy their models or have used models that no other paper used, making the LLM, when represented in a table or illustrated, have a frequency of 1.</p>
<p><xref ref-type="fig" rid="fig5">Figure 5</xref> answers the question of whether the LLM is open source or closed source. The results display the percentage of papers that utilized open-source LLMs contrasted with those that accessed closed-source LLMs instead. With 59% against 41%, we conclude that most papers used open-source LLMs. This means that out of the 34 LLMs studied carefully in each paper, 20 LLMs were open source, while 14 were closed source. Although a number of articles stated that they utilized open-source models, certain fine-tuning techniques were frequently overlooked or only briefly mentioned. Therefore, it is still difficult to replicate the experimental conditions outlined in those studies. This draws attention to a more general problem in the literature and emphasizes the importance of identifying methods for supporting reproducibility in future research.</p>
<p><xref ref-type="table" rid="tab5">Table 5</xref> combines the two previous figures, <xref ref-type="fig" rid="fig4">Figure 4</xref> and <xref ref-type="fig" rid="fig5">5</xref>, and details which papers utilized what specific LLM and whether it was open source.</p>
</sec>
<sec id="sec15">
<label>4.3</label>
<title>RQ3: Setup approach and LLM parameters</title>
<p>Through this research question, we seek to provide information on the hardware setup utilized for operating the LLM as well as the LLM&#x2019;s parameters. For this question, we considered the variety of LLMs deployed by each research paper and the different resources used by various researchers. Therefore, we will organize the hardware information we collected into smaller components. The results are reflected in <xref ref-type="table" rid="tab6">Table 6</xref>.</p>
<table-wrap position="float" id="tab6">
<label>Table 6</label>
<caption>
<p>The hardware components of each setup.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Ref</th>
<th align="left" valign="top">LLM</th>
<th align="left" valign="top">CPU</th>
<th align="left" valign="top">RAM</th>
<th align="left" valign="top">GPUs</th>
<th align="left" valign="top">Notes</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref10">Fan et al. (2023)</xref>
</td>
<td align="left" valign="top">Codex</td>
<td align="left" valign="top">Intel Xeon E5-2660</td>
<td align="left" valign="top">64GB</td>
<td align="left" valign="top">1 x NVIDIA Titan V</td>
<td align="left" valign="top">Single GPU, likely development/testing setup</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref3">Beurer-Kellner et al. (2023)</xref>
</td>
<td align="left" valign="top">GPT-3, GPT-2, GPT-J</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">1 x NVIDIA A100 (40GB/80GB)</td>
<td align="left" valign="top">Single A100 GPU, likely inference/smaller models</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref6">Deng et al. (2023)</xref>
</td>
<td align="left" valign="top">Codex</td>
<td align="left" valign="top">High-end workstation</td>
<td align="left" valign="top">256GB</td>
<td align="left" valign="top">4 x NVIDIA RTX A6000</td>
<td align="left" valign="top">Powerful multi-GPU setup</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref31">Pan and Ke (2023)</xref>
</td>
<td align="left" valign="top">Stytr2</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">2 x NVIDIA Tesla P100&#x202F;+&#x202F;2 x NVIDIA RTX 3090</td>
<td align="left" valign="top">Mixed older/newer GPUs, research-specific setup</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref38">Scells et al. (2023)</xref>
</td>
<td align="left" valign="top">PubMed-BERT, BERT, DistilBERT</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">No hardware information</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref54">Xu et al. (2022)</xref>
</td>
<td align="left" valign="top">Codex, GPT-2, GPT-J, GPT-Neo, GPT-NeoX, CodeParrot</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">8 x NVIDIA RTX 8000</td>
<td align="left" valign="top">Most powerful setup, likely large/complex models/training</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref21">Kim et al. (2021)</xref>
</td>
<td align="left" valign="top">GPT-2</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">4 x NVIDIA Tesla V100</td>
<td align="left" valign="top">Multi-GPU setup with older GPUs, research/development</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref19">Kang et al. (2023)</xref>
</td>
<td align="left" valign="top">Codex</td>
<td align="left" valign="top">Intel Core i7-7700</td>
<td align="left" valign="top">32GB</td>
<td align="left" valign="top">Not available</td>
<td align="left" valign="top">Low-end setup, likely small models/testing</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="tab6">Table 6</xref> provides information on the specific hardware components mentioned in the research papers. Although the hardware configurations utilized in the examined research are shown, many of the articles did not provide full system specifications. A variety of configurations are observed among the remaining studies, ranging from high-end multi-GPU systems to more affordable single-GPU or CPU-only setups. This diversity reflects the varying resource capacities of researchers and use cases, and it underscores the need for more consistent reporting in future studies to support improved documentation and analysis of model performance. The most powerful setup among the reviewed papers was reported by the researchers in (<xref ref-type="bibr" rid="ref54">Xu et al., 2022</xref>). The rest have mixed to lower-end setups, yet they could still deploy powerful LLMs despite that. Most papers, however, did not provide information regarding the hardware setup.</p>
<p>For the section about LLM parameter sizes, a figure was generated to show the scale of the models described in the examined research, from largest to smallest. The findings are shown in <xref ref-type="fig" rid="fig6">Figure 6</xref>. CuBERT has the fewest parameters (about 345 million), while PaLM has the largest, with 540 billion parameters.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>Frequency histogram of the LLMs&#x2019; parameters.</p>
</caption>
<graphic xlink:href="fcomp-07-1523699-g006.tif"/>
</fig>
<p>A comprehensive evaluation of deployment feasibility cannot be obtained from parameter count independently; however, it does give a broad idea of model complexity and possible resource requirements. Due to a lack of consistency in the studied literature, important factors, including energy usage during training and inference, as well as the economic cost per inference step, were excluded from our study. In order to provide more comprehensive and useful assessments of model efficiency, this limitation highlights the significance of including energy and cost-related indicators in further studies.</p>
</sec>
<sec id="sec16">
<label>4.4</label>
<title>RQ4: Performance metrics and evaluation</title>
<p>With this research question, we aim to identify the metrics used to evaluate the performance of different LLMs. It is important to note that the metrics vary from one LLM to another because of the use case or application of the LLM. For example, PaLM (<xref ref-type="bibr" rid="ref51">Wang et al., 2023</xref>) was evaluated on grammar correctness because the paper is prompting-related. In contrast, Codex (<xref ref-type="bibr" rid="ref6">Deng et al., 2023</xref>) was evaluated on the number of detected bugs because the paper is related to code generation.</p>
<p>To provide a more structured overview, performance metrics were grouped into six categories: Translation Evaluation Metrics, Code Analysis Metrics, NLP Output Quality Metrics, User Interaction and Feedback Metrics, Model Evaluation Benchmarks, and Domain-Specific Evaluation Metrics. This restructuring addresses differences in evaluation criteria across domains and ensures a more balanced representation, especially for fields like healthcare and education. While code-related metrics remain prominent due to the number of studies in programming contexts, domain-specific metrics have been explicitly highlighted to mitigate cross-domain bias and promote greater clarity. <xref ref-type="table" rid="tab7">Table 7</xref> presents the reorganized metric categories along with representative evaluation aspects.</p>
<table-wrap position="float" id="tab7">
<label>Table 7</label>
<caption>
<p>Categories of performance metrics and their aspects.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Category</th>
<th align="left" valign="top">Aspect</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">Translation Evaluation Metrics</td>
<td align="left" valign="top">
<list list-type="bullet">
<list-item>
<p>BLEU (Bilingual Evaluation Understudy): Measures the quality of machine-generated translations by comparing them to reference translations.</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left" valign="top">Code Analysis Metrics</td>
<td align="left" valign="top">Program Analysis Metrics:<break/><list list-type="bullet">
<list-item>
<p>LOC (Lines of Code): Measures the number of lines of code in a program.</p>
</list-item>
<list-item>
<p>Number of model queries, Number of Decoder calls, Billable tokens: Metrics related to the usage and efficiency of language models in code-related tasks.</p>
</list-item>
<list-item>
<p>Statistical analyses, Frechet Distance, precision, recall, topic similarities and differences, answer consistency, game frequencies: Metrics for evaluating code generation models&#x2019; statistical properties and performance.</p>
</list-item>
<list-item>
<p>Number of detected bugs, Code coverage, Number of covered APIs, Number of unique, valid programs generated, Execution time: Metrics related to the quality, coverage, and efficiency of generated code.</p>
</list-item>
</list></td>
</tr>
<tr>
<td align="left" valign="top">NLP Output Quality Metrics</td>
<td align="left" valign="top">
<list list-type="bullet">
<list-item>
<p>Grammar Correctness, UI Relevance, Question Coverage, BLEU, CIDEr, ROUGE-L, METEOR.</p>
</list-item>
<list-item>
<p>Exact Matches, Contains GT, Sub-String of GT, Micro-F1: Metrics related to the quality and relevance of natural language outputs, especially in conversational contexts.</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left" valign="top">User Interaction and Feedback Metrics</td>
<td align="left" valign="top">
<list list-type="bullet">
<list-item>
<p>Success rate (SR), goal conditions recall (GCR), Executability (Exec): Metrics measuring the success and effectiveness of user interactions with language models.</p>
</list-item>
<list-item>
<p>Quantitative and qualitative participant feedback, Surveys: Metrics involving user feedback, satisfaction, and perception.</p>
</list-item>
<list-item>
<p>Number of errors encountered during task completion, Number of retries required to complete a task, Time taken to complete a task, Perceived ease of use, and usefulness of the tool: Metrics assessing the user experience, efficiency, and usability of language models.</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left" valign="top">Model Evaluation Benchmarks</td>
<td align="left" valign="top">
<list list-type="bullet">
<list-item>
<p>GLUE (General Language Understanding Evaluation Benchmark): Evaluates a model&#x2019;s performance on various NLP tasks.</p>
</list-item>
<list-item>
<p>CRIT (Critical Reading Inquisitive Template): Evaluates models based on critical reading comprehension.</p>
</list-item>
<list-item>
<p>Perplexity: Measures how well a language model predicts the next token in a sequence of code or text.</p>
</list-item>
<list-item>
<p>Recall, precision, f-measure: Standard metrics for evaluating the performance of models in information retrieval or classification tasks.</p>
</list-item>
</list>
</td>
</tr>
<tr>
<td align="left" valign="top">Domain-Specific Evaluation Metrics</td>
<td align="left" valign="top">
<list list-type="bullet">
<list-item>
<p>Using ROBINS-I tool and Risk of Bias analysis, Data extraction from a randomized controlled trial: Metrics related to evaluating research studies and experiments.</p>
</list-item>
<list-item>
<p>Likert Scale: Measures attitudes or opinions using a scale of responses.</p>
</list-item>
</list>
</td>
</tr>
</tbody>
</table>
</table-wrap>
<p><xref ref-type="table" rid="tab8">Table 8</xref> reveals the categorization result, presenting each paper with its language model and the information it provided regarding the metrics and category. The table concludes that most papers belong to the &#x201C;Code Analysis Metrics&#x201D; category, to be specific, 11 papers evaluated their LLMs on program analysis metrics and natural language processing (NLP) metrics. Next were specific model evaluation metrics, user interaction and feedback metrics, and translation evaluation metrics, with 7, 4, and 1 research papers related to each category in that order.</p>
<table-wrap position="float" id="tab8">
<label>Table 8</label>
<caption>
<p>LLM performance metrics.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Ref</th>
<th align="left" valign="top">Model</th>
<th align="left" valign="top">Metrics</th>
<th align="left" valign="top">Category</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref35">Reynolds and McDonell (2021)</xref>
</td>
<td align="left" valign="top">GPT-3</td>
<td align="left" valign="top">BLEU (French-to-English translations)</td>
<td align="left" valign="top">Translation Evaluation Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref10">Fan et al. (2023)</xref>
</td>
<td align="left" valign="top">Codex</td>
<td align="left" valign="top">Manual analysis, codex-e, TBar, and Recorder</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref3">Beurer-Kellner et al. (2023)</xref>
</td>
<td align="left" valign="top">GPT-3, GPT-2, GPT-J</td>
<td align="left" valign="top">LOC, Number of model queries, Number of Decoder calls, Billable tokens</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref37">Sarsa et al. (2022)</xref>
</td>
<td align="left" valign="top">GPT-3, Codex</td>
<td align="left" valign="top">Programmatic analysis</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref12">H&#x00E4;m&#x00E4;l&#x00E4;inen et al. (2023)</xref>
</td>
<td align="left" valign="top">GPT-3</td>
<td align="left" valign="top">Statistical analyses, Frechet Distance, precision, recall, topic similarities and differences, answer consistency, game frequencies</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref6">Deng et al. (2023)</xref>
</td>
<td align="left" valign="top">Codex</td>
<td align="left" valign="top">Number of detected bugs, Code coverage, Number of covered APIs, Number of unique, valid programs generated, Execution time</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref51">Wang et al. (2023)</xref>
</td>
<td align="left" valign="top">PaLM</td>
<td align="left" valign="top">Grammar Correctness, UI Relevance, Question Coverage, BLEU, CIDEr, ROUGE-L, and METEOR, Exact Matches, Contains GT, Sub-String of GT, Micro-F1</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref38">Scells et al. (2023)</xref>
</td>
<td align="left" valign="top">PubMed-BERT, BERT, DistilBERT</td>
<td align="left" valign="top">Recall, precision, f-measure</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref55">Zamfirescu-Pereira et al. (2023)</xref>
</td>
<td align="left" valign="top">GPT-3.5</td>
<td align="left" valign="top">Number of errors encountered during task completion, Number of retries required to complete a task, Time taken to complete a task, Perceived ease of use, and usefulness of the tool</td>
<td align="left" valign="top">User Interaction and Feedback Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref1">Badini et al. (2023)</xref>
</td>
<td align="left" valign="top">GPT-3.5</td>
<td align="left" valign="top">Resolution of specific 3D printing issues considering filament material and other conditions</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref53">Wu et al. (2022)</xref>
</td>
<td align="left" valign="top">LaMDA</td>
<td align="left" valign="top">Likert Scale, Interaction mechanisms and behaviors, Consecutive run, Edited, Curated, Created, Undone</td>
<td align="left" valign="top">User Interaction and Feedback Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref54">Xu et al. (2022)</xref>
</td>
<td align="left" valign="top">Codex, GPT-2, GPT-J, GPT-Neo, GPT-NeoX, CodeParrot</td>
<td align="left" valign="top">Perplexity, Code completion accuracy, Human evaluation</td>
<td align="left" valign="top">Code Analysis Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref40">Singh et al. (2023)</xref>
</td>
<td align="left" valign="top">Codex</td>
<td align="left" valign="top">Success rate (SR), goal conditions recall (GCR), Executability (Exec)</td>
<td align="left" valign="top">User Interaction and Feedback Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref36">Ross et al. (2023)</xref>
</td>
<td align="left" valign="top">Codex</td>
<td align="left" valign="top">Quantitative and qualitative feedback from 42 participants, Surveys (pre-study, pre-task, post-task)</td>
<td align="left" valign="top">User Interaction and Feedback Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref26">Mahuli et al. (2023)</xref>
</td>
<td align="left" valign="top">GPT-3.5</td>
<td align="left" valign="top">Using the ROBINS-I tool and Risk of Bias analysis, Data extraction from a randomized controlled trial</td>
<td align="left" valign="top">User Interaction and Feedback Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref42">Strobelt et al. (2023)</xref>
</td>
<td align="left" valign="top">T0</td>
<td align="left" valign="top">GLUE (General Language Understanding Evaluation Benchmark)</td>
<td align="left" valign="top">Specific Model Evaluation Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref5">Chang (2023)</xref>
</td>
<td align="left" valign="top">GPT-3</td>
<td align="left" valign="top">CRIT (Critical Reading Inquisitive Template)</td>
<td align="left" valign="top">Specific Model Evaluation Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref25">Macneil et al. (2022)</xref>
</td>
<td align="left" valign="top">GPT-3</td>
<td align="left" valign="top">Tracing the execution of code, Fixing bugs, Explaining how they were fixed, Generating analogies, Listing relevant programming concepts, Predicting the console output</td>
<td align="left" valign="top">Specific Model Evaluation Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref22">King (2023)</xref>
</td>
<td align="left" valign="top">LaMDA</td>
<td align="left" valign="top">Analysis of the accuracy of scientific references generated by Google&#x2019;s Bard chatbot</td>
<td align="left" valign="top">Specific Model Evaluation Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref46">Urban et al. (2023)</xref>
</td>
<td align="left" valign="top">GPT-3</td>
<td align="left" valign="top">The accuracy of natural language prompts and structured prompts</td>
<td align="left" valign="top">Specific Model Evaluation Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref21">Kim et al. (2021)</xref>
</td>
<td align="left" valign="top">GPT-2</td>
<td align="left" valign="top">Evaluation of next token prediction for leaf tokens</td>
<td align="left" valign="top">Specific Model Evaluation Metrics</td>
</tr>
<tr>
<td align="left" valign="top">
<xref ref-type="bibr" rid="ref19">Kang et al. (2023)</xref>
</td>
<td align="left" valign="top">Codex</td>
<td align="left" valign="top">&#x201C;-acc@n,&#x201D; precision, wef, wef@n</td>
<td align="left" valign="top">Specific Model Evaluation Metrics</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="sec17">
<label>5</label>
<title>Challenges and recommendations</title>
<p>Over a broad spectrum of applications, LLMs have shown considerable promise. Nonetheless, there are several major challenges to overcome, especially in developing fields like smart contract validation, Internet of Things (IoT) integration, and privacy-preserving implementations. To guarantee the safe, open, and responsible application of LLMs in high-stakes situations, these challenges require further consideration.</p>
<p>The use of LLMs to support smart contract verification is an emerging area of interest. Although LLMs can assist in developing, summarizing, or analyzing smart contracts, their integration within blockchain systems presents unique difficulties. Smart contracts demand precise and verifiable logic, where even minor errors can lead to significant financial consequences. The limited interpretability of LLM-generated outputs further complicates efforts to trace and validate contract code, particularly in security-critical scenarios. Significant privacy and security issues also arise from the integration of LLMs into edge computing and industrial IoT environments. One study, <xref ref-type="bibr" rid="ref48">Wang et al. (2022)</xref>, emphasizes the importance of secure data aggregation techniques in blockchain-enabled IoT systems to protect user privacy. Another work, <xref ref-type="bibr" rid="ref50">Wang et al. (2022)</xref>, highlights the complexity of developing hierarchical trust evaluation models in 5G-enabled intelligent transportation systems, especially when incorporating AI-driven components like LLMs. Additionally, hierarchical federated learning has demonstrated the potential to enhance both privacy and anomaly detection in industrial settings (<xref ref-type="bibr" rid="ref49">Wang et al., 2023</xref>). Collectively, these studies underscore the urgency of adapting LLM deployments to privacy-aware architectures, particularly when dealing with real-time, sensitive, or decentralized data.</p>
<p>Future LLM applications in smart contracts should include formal verification tools that confirm logic soundness and identify potential vulnerabilities in order to address these problems. To reduce data exposure during model training and inference, techniques like differential privacy and federated learning should be further investigated in privacy-sensitive domains like the IoT and transportation. Furthermore, policy and regulatory frameworks need to change to take into account the increasing role that LLMs play in operational, financial, and legal decision-making. Lastly, to improve reproducibility and ease cross-domain benchmarking, researchers are encouraged to accept stronger reporting standards, especially with regard to fine-tuning methods, evaluation processes, and deployment.</p>
</sec>
<sec id="sec18">
<label>6</label>
<title>Conclusion and future work</title>
<p>In our systematic literature review, we researched a comparison between large language models, with our focus on their efficiency. We reviewed 27 research papers published between 2019 and 2023. We also crafted four research questions that we believed would be relevant in helping with our comparison. RQ1 covered the field the LLM was used in, RQ2 covered the type of LLM as well as whether it is open source or not, RQ3 covered hardware requirements as well as the LLMs&#x2019; parameters, and finally, RQ4 covered the metrics used for the evaluation of the LLM. We collected research papers and evaluated them based on the above research questions.</p>
<p>Our findings revealed that most studies leveraged LLMs for data generation tasks, followed by prompting-related applications. GPT-3 was the most widely used model, appearing in 10 studies, followed by Codex. A majority of studies utilized open-source LLMs, while others employed proprietary models. Our analysis of hardware setups highlighted a lack of detailed reporting on computational resources, though one study utilized an 8 x NVIDIA RTX 8000 GPU setup for high-performance LLM deployment. Regarding evaluation, we observed a strong emphasis on code analysis metrics, followed by model-specific evaluations and user interaction feedback.</p>
<p>Future studies should investigate more detailed efficiency indicators, including delay inference, energy usage, and model resilience, alongside computing cost and accuracy. Comparative studies in fields such as law, finance, and scientific research could provide further insights into the specialized performance of LLMs. Addressing biases, data privacy challenges, and adversarial robustness will require more systematic evaluations. Additionally, advancements in model optimization techniques, such as pruning, quantization, and efficient fine-tuning, can help mitigate the computational burden of large-scale deployment. Beyond efficiency, future research should emphasize interpretability and usability, as these factors are crucial for real-world adoption. Transparency in decision-making, bias reduction, and explainability in high-risk applications, particularly in healthcare and finance, remain critical research areas. Exploring LLM applications in novel domains, such as blockchain-based smart contract verification, could further reveal insights into their adaptability and security implications. By addressing these research gaps, the continued evolution of LLMs can be guided toward maximizing efficiency while mitigating potential risks associated with widespread deployment.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec19">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="sec20">
<title>Author contributions</title>
<p>YS: Data curation, Formal analysis, Methodology, Resources, Visualization, Writing &#x2013; original draft. MA: Data curation, Funding acquisition, Project administration, Resources, Software, Supervision, Writing &#x2013; review &#x0026; editing. QN: Conceptualization, Formal analysis, Investigation, Project administration, Resources, Supervision, Writing &#x2013; review &#x0026; editing. FD: Conceptualization, Data curation, Formal analysis, Investigation, Resources, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec sec-type="funding-information" id="sec21">
<title>Funding</title>
<p>The author(s) declare that no financial support was received for the research and/or publication of this article.</p>
</sec>
<ack>
<p>We would like to convey our sincere appreciation to the General Civil Aviation Authority (GCAA) of the UAE for founding the Aerospace Centre of Excellence and executing this research study. We express our gratitude to our supervisors and colleagues from the OpenUAE Research and Development Group at the University of Sharjah for their invaluable insights and knowledge that significantly contributed to the research.</p>
</ack>
<sec sec-type="COI-statement" id="sec22">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec23">
<title>Generative AI statement</title>
<p>The author(s) declare that no Gen AI was used in the creation of this manuscript.</p>
</sec>
<sec sec-type="disclaimer" id="sec24">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="ref1"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Badini</surname> <given-names>S.</given-names></name> <name><surname>Regondi</surname> <given-names>S.</given-names></name> <name><surname>Frontoni</surname> <given-names>E.</given-names></name> <name><surname>Pugliese</surname> <given-names>R.</given-names></name></person-group> (<year>2023</year>). <article-title>Assessing the capabilities of ChatGPT to improve additive manufacturing troubleshooting</article-title>. <source>Adv. Indust. Eng. Polymer Res.</source> <volume>6</volume>, <fpage>278</fpage>&#x2013;<lpage>287</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.aiepr.2023.03.003</pub-id></citation></ref>
<ref id="ref2"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Bender</surname> <given-names>E. M.</given-names></name> <name><surname>Gebru</surname> <given-names>T.</given-names></name> <name><surname>McMillan-Major</surname> <given-names>A.</given-names></name> <name><surname>Shmitchell</surname> <given-names>S.</given-names></name></person-group>, (<year>2021</year>). &#x201C;On the dangers of stochastic parrots: can language models be too big?,&#x201D; in <italic>FAccT 2021 - Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency</italic>. pp. 610&#x2013;623.</citation></ref>
<ref id="ref3"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Beurer-Kellner</surname> <given-names>L.</given-names></name> <name><surname>Fischer</surname> <given-names>M.</given-names></name> <name><surname>Vechev</surname> <given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>Prompting is programming: a query language for large language models</article-title>. <source>Proc. ACM Program. Lang.</source> <volume>7</volume>, <fpage>1946</fpage>&#x2013;<lpage>1969</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3591300</pub-id></citation></ref>
<ref id="ref4"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cascella</surname> <given-names>M.</given-names></name> <name><surname>Montomoli</surname> <given-names>J.</given-names></name> <name><surname>Bellini</surname> <given-names>V.</given-names></name> <name><surname>Bignami</surname> <given-names>E.</given-names></name></person-group> (<year>2023</year>). <article-title>Evaluating the feasibility of ChatGPT in healthcare: an analysis of multiple clinical and research scenarios</article-title>. <source>J. Med. Syst.</source> <volume>47</volume>, <fpage>1</fpage>&#x2013;<lpage>5</lpage>. doi: <pub-id pub-id-type="doi">10.1007/S10916-023-01925-4/TABLES/2</pub-id></citation></ref>
<ref id="ref5"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Chang</surname> <given-names>E. Y.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Prompting large language models with the Socratic method,&#x201D; In <italic>2023 IEEE 13th Annual Computing and Communication Workshop and Conference, CCWC 2023</italic>, Institute of Electrical and Electronics Engineers Inc., pp. 351&#x2013;360.</citation></ref>
<ref id="ref6"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Deng</surname> <given-names>Y.</given-names></name> <name><surname>Xia</surname> <given-names>C. S.</given-names></name> <name><surname>Peng</surname> <given-names>H.</given-names></name> <name><surname>Yang</surname> <given-names>C.</given-names></name> <name><surname>Zhang</surname> <given-names>L.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Large language models are zero-shot Fuzzers: fuzzing deep-learning libraries via large language models,&#x201D; <italic>ISSTA 2023 - Proceedings of the 32nd ACM SIGSOFT International Symposium on Software Testing and Analysis</italic>. pp. 423&#x2013;435.</citation></ref>
<ref id="ref7"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Di Fede</surname> <given-names>G.</given-names></name> <name><surname>Rocchesso</surname> <given-names>D.</given-names></name> <name><surname>Dow</surname> <given-names>S. P.</given-names></name> <name><surname>Andolina</surname> <given-names>S.</given-names></name></person-group>, (<year>2022</year>). &#x201C;The idea machine: LLM-based expansion, rewriting, combination, and suggestion of ideas,&#x201D; in <italic>ACM International Conference Proceeding Series</italic>. pp. 623&#x2013;627.</citation></ref>
<ref id="ref8"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Eggmann</surname> <given-names>F.</given-names></name> <name><surname>Weiger</surname> <given-names>R.</given-names></name> <name><surname>Zitzmann</surname> <given-names>N. U.</given-names></name> <name><surname>Blatz</surname> <given-names>M. B.</given-names></name></person-group> (<year>2023</year>). <article-title>Implications of large language models such as ChatGPT for dental medicine</article-title>. <source>J. Esthet. Restor. Dent.</source> <volume>35</volume>, <fpage>1098</fpage>&#x2013;<lpage>1102</lpage>. doi: <pub-id pub-id-type="doi">10.1111/jerd.13046</pub-id>, PMID: <pub-id pub-id-type="pmid">37017291</pub-id></citation></ref>
<ref id="ref9"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>M.</given-names></name></person-group>, (<year>2024</year>). &#x201C;LLMs in banking: applications, challenges, and approaches,&#x201D; in <italic>Proceedings of the International Conference on Digital Economy, Blockchain and Artificial Intelligence</italic>, in DEBAI &#x2018;24. New York, NY, USA: Association for Computing Machinery. pp. 314&#x2013;321.</citation></ref>
<ref id="ref10"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Fan</surname> <given-names>Z.</given-names></name> <name><surname>Gao</surname> <given-names>X.</given-names></name> <name><surname>Mirchev</surname> <given-names>M.</given-names></name> <name><surname>Roychoudhury</surname> <given-names>A.</given-names></name> <name><surname>Tan</surname> <given-names>S. H.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Automated repair of programs from large language models,&#x201D; Institute of Electrical and Electronics Engineers (IEEE). pp. 1469&#x2013;1481.</citation></ref>
<ref id="ref11"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Floridi</surname> <given-names>L.</given-names></name></person-group> (<year>2023</year>). <article-title>AI as agency without intelligence: on ChatGPT, large language models, and other generative models</article-title>. <source>Philos. Technol.</source> <volume>36</volume>, <fpage>1</fpage>&#x2013;<lpage>7</lpage>. doi: <pub-id pub-id-type="doi">10.1007/S13347-023-00621-Y/FIGURES/3</pub-id></citation></ref>
<ref id="ref12"><citation citation-type="other"><person-group person-group-type="author"><name><surname>H&#x00E4;m&#x00E4;l&#x00E4;inen</surname> <given-names>P.</given-names></name> <name><surname>Tavast</surname> <given-names>M.</given-names></name> <name><surname>Kunnari</surname> <given-names>A.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Evaluating large language models in generating synthetic HCI research data: a case study,&#x201D; in <italic>Conference on Human Factors in Computing Systems - Proceedings</italic>.</citation></ref>
<ref id="ref13"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Head</surname> <given-names>C. B.</given-names></name> <name><surname>Jasper</surname> <given-names>P.</given-names></name> <name><surname>McConnachie</surname> <given-names>M.</given-names></name> <name><surname>Raftree</surname> <given-names>L.</given-names></name> <name><surname>Higdon</surname> <given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>Large language model applications for evaluation: opportunities and ethical implications</article-title>. <source>N. Dir. Eval.</source> <volume>2023</volume>, <fpage>33</fpage>&#x2013;<lpage>46</lpage>. doi: <pub-id pub-id-type="doi">10.1002/EV.20556</pub-id>, PMID: <pub-id pub-id-type="pmid">40365921</pub-id></citation></ref>
<ref id="ref14"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>H.</given-names></name> <name><surname>Zheng</surname> <given-names>O.</given-names></name> <name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Yin</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name> <name><surname>Ding</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>ChatGPT for shaping the future of dentistry: the potential of multi-modal large language model</article-title>. <source>Int. J. Oral Sci.</source> <volume>15</volume>, <fpage>29</fpage>&#x2013;<lpage>13</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41368-023-00239-y</pub-id>, PMID: <pub-id pub-id-type="pmid">37507396</pub-id></citation></ref>
<ref id="ref15"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Jain</surname> <given-names>N.</given-names></name> <name><surname>Vaidyanath</surname> <given-names>S.</given-names></name> <name><surname>Iyer</surname> <given-names>A.</given-names></name> <name><surname>Natarajan</surname> <given-names>N.</given-names></name> <name><surname>Parthasarathy</surname> <given-names>S.</given-names></name> <name><surname>Rajamani</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2022</year>). &#x201C;Jigsaw: large language models meet program synthesis,&#x201D; in <italic>Proceedings - International Conference on Software Engineering, IEEE Computer Society</italic>. pp. 1219&#x2013;1231.</citation></ref>
<ref id="ref16"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Jiang</surname> <given-names>E.</given-names></name> <name><surname>Olson</surname> <given-names>K.</given-names></name> <name><surname>Toh</surname> <given-names>E.</given-names></name> <name><surname>Molina</surname> <given-names>A.</given-names></name> <name><surname>Donsbach</surname> <given-names>A.</given-names></name> <name><surname>Terry</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2022</year>). &#x201C;PromptMaker: prompt-based prototyping with large language models,&#x201D; in <italic>Conference on Human Factors in Computing Systems - Proceedings</italic>. Association for Computing Machinery.</citation></ref>
<ref id="ref17"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Jo</surname> <given-names>E.</given-names></name> <name><surname>Epstein</surname> <given-names>D. A.</given-names></name> <name><surname>Jung</surname> <given-names>H.</given-names></name> <name><surname>Kim</surname> <given-names>Y. H.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Understanding the benefits and challenges of deploying conversational AI leveraging large language models for public health intervention,&#x201D; <italic>Conference on Human Factors in Computing Systems - Proceedings</italic>. p. 16.</citation></ref>
<ref id="ref18"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kamnis</surname> <given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>Generative pre-trained transformers (GPT) for surface engineering</article-title>. <source>Surf. Coat. Technol.</source> <volume>466</volume>:<fpage>129680</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.surfcoat.2023.129680</pub-id></citation></ref>
<ref id="ref19"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Kang</surname> <given-names>S.</given-names></name> <name><surname>Yoon</surname> <given-names>J.</given-names></name> <name><surname>Yoo</surname> <given-names>S.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Large language models are few-shot testers: exploring LLM-based general bug reproduction,&#x201D; in <italic>Proceedings - International Conference on Software Engineering</italic>. pp. 2312&#x2013;2323.</citation></ref>
<ref id="ref20"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kasneci</surname> <given-names>E.</given-names></name> <name><surname>Sessler</surname> <given-names>K.</given-names></name> <name><surname>K&#x00FC;chemann</surname> <given-names>S.</given-names></name> <name><surname>Bannert</surname> <given-names>M.</given-names></name> <name><surname>Dementieva</surname> <given-names>D.</given-names></name> <name><surname>Fischer</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>ChatGPT for good? On opportunities and challenges of large language models for education</article-title>. <source>Learn. Individ. Differ.</source> <volume>103</volume>:<fpage>102274</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.lindif.2023.102274</pub-id>, PMID: <pub-id pub-id-type="pmid">40368751</pub-id></citation></ref>
<ref id="ref21"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Zhao</surname> <given-names>J.</given-names></name> <name><surname>Tian</surname> <given-names>Y.</given-names></name> <name><surname>Chandra</surname> <given-names>S.</given-names></name></person-group>, (<year>2021</year>). &#x201C;Code prediction by feeding trees to transformers,&#x201D; <italic>Proceedings - International Conference on Software Engineering</italic>, pp. 150&#x2013;162.</citation></ref>
<ref id="ref22"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>King</surname> <given-names>M. R.</given-names></name></person-group> (<year>2023</year>). <article-title>Can Bard, Google&#x2019;s experimental Chatbot based on the LaMDA large language model, help to analyze the gender and racial diversity of authors in your cited scientific references?</article-title> <source>Cell. Mol. Bioeng.</source> <volume>16</volume>, <fpage>175</fpage>&#x2013;<lpage>179</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s12195-023-00761-3</pub-id>, PMID: <pub-id pub-id-type="pmid">37096072</pub-id></citation></ref>
<ref id="ref23"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname> <given-names>P.</given-names></name> <name><surname>Yuan</surname> <given-names>W.</given-names></name> <name><surname>Fu</surname> <given-names>J.</given-names></name> <name><surname>Jiang</surname> <given-names>Z.</given-names></name> <name><surname>Hayashi</surname> <given-names>H.</given-names></name> <name><surname>Neubig</surname> <given-names>G.</given-names></name></person-group> (<year>2023</year>). <article-title>Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing</article-title>. <source>ACM Comput. Surv.</source> <volume>55</volume>, <fpage>1</fpage>&#x2013;<lpage>35</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3560815</pub-id></citation></ref>
<ref id="ref24"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Lund</surname> <given-names>B. D.</given-names></name> <name><surname>Wang</surname> <given-names>T.</given-names></name> <name><surname>Mannuru</surname> <given-names>N. R.</given-names></name> <name><surname>Nie</surname> <given-names>B.</given-names></name> <name><surname>Shimray</surname> <given-names>S.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name></person-group> (<year>2023</year>). <article-title>ChatGPT and a new academic reality: artificial intelligence-written research papers and the ethics of the large language models in scholarly publishing</article-title>. <source>J. Assoc. Inf. Sci. Technol.</source> <volume>74</volume>, <fpage>570</fpage>&#x2013;<lpage>581</lpage>. doi: <pub-id pub-id-type="doi">10.1002/ASI.24750</pub-id></citation></ref>
<ref id="ref25"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Macneil</surname> <given-names>S.</given-names></name> <name><surname>Tran</surname> <given-names>A.</given-names></name> <name><surname>Mogil</surname> <given-names>D.</given-names></name> <name><surname>Bernstein</surname> <given-names>S.</given-names></name> <name><surname>Ross</surname> <given-names>E.</given-names></name> <name><surname>Huang</surname> <given-names>Z.</given-names></name></person-group>, (<year>2022</year>). &#x201C;Generating diverse code explanations using the GPT-3 large language model,&#x201D; in <italic>Proceedings of the 2022 ACM Conference on International Computing Education Research - Volume 2</italic>. p. 3.</citation></ref>
<ref id="ref26"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mahuli</surname> <given-names>S. A.</given-names></name> <name><surname>Rai</surname> <given-names>A.</given-names></name> <name><surname>Mahuli</surname> <given-names>A. V.</given-names></name> <name><surname>Kumar</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>Application ChatGPT in conducting systematic reviews and meta-analyses</article-title>. <source>Br. Dent. J.</source> <volume>235</volume>, <fpage>90</fpage>&#x2013;<lpage>92</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41415-023-6132-y</pub-id>, PMID: <pub-id pub-id-type="pmid">37500847</pub-id></citation></ref>
<ref id="ref27"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Meyer</surname> <given-names>J. G.</given-names></name> <name><surname>Urbanowicz</surname> <given-names>R. J.</given-names></name> <name><surname>Martin</surname> <given-names>P. C. N.</given-names></name> <name><surname>O&#x2019;Connor</surname> <given-names>K.</given-names></name> <name><surname>Li</surname> <given-names>R.</given-names></name> <name><surname>Peng</surname> <given-names>P. C.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>ChatGPT and large language models in academia: opportunities and challenges</article-title>. <source>BioData Mining</source> <volume>16</volume>:<fpage>20</fpage>. doi: <pub-id pub-id-type="doi">10.1186/s13040-023-00339-9</pub-id>, PMID: <pub-id pub-id-type="pmid">37443040</pub-id></citation></ref>
<ref id="ref28"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Milano</surname> <given-names>S.</given-names></name> <name><surname>McGrane</surname> <given-names>J. A.</given-names></name> <name><surname>Leonelli</surname> <given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>Large language models challenge the future of higher education</article-title>. <source>Nat. Mach. Intellig.</source> <volume>5</volume>, <fpage>333</fpage>&#x2013;<lpage>334</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s42256-023-00644-2</pub-id></citation></ref>
<ref id="ref29"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Min</surname> <given-names>B.</given-names></name> <name><surname>Ross</surname> <given-names>H.</given-names></name> <name><surname>Sulem</surname> <given-names>E.</given-names></name> <name><surname>Veyseh</surname> <given-names>A. P. B.</given-names></name> <name><surname>Nguyen</surname> <given-names>T. H.</given-names></name> <name><surname>Sainz</surname> <given-names>O.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Recent advances in natural language processing via large pre-trained language models: a survey</article-title>. <source>ACM Comput. Surv.</source> <volume>56</volume>, <fpage>1</fpage>&#x2013;<lpage>40</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3605943</pub-id></citation></ref>
<ref id="ref30"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>M&#x00F6;kander</surname> <given-names>J.</given-names></name> <name><surname>Schuett</surname> <given-names>J.</given-names></name> <name><surname>Kirk</surname> <given-names>H. R.</given-names></name> <name><surname>Floridi</surname> <given-names>L.</given-names></name></person-group> (<year>2023</year>). <article-title>Auditing large language models: a three-layered approach</article-title>. <source>AI Ethics</source> <volume>1</volume>, <fpage>1</fpage>&#x2013;<lpage>31</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s43681-023-00289-2</pub-id></citation></ref>
<ref id="ref31"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Pan</surname> <given-names>B.</given-names></name> <name><surname>Ke</surname> <given-names>Y. K.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Efficient artistic image style transfer with large language model (LLM): a new perspective,&#x201D; in <italic>Proceedings of the 8th International Conference on Communication and Electronics systems, ICCES 2023</italic>, Institute of Electrical and Electronics Engineers Inc. pp. 1729&#x2013;1732.</citation></ref>
<ref id="ref32"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Porsdam Mann</surname> <given-names>S.</given-names></name> <name><surname>Earp</surname> <given-names>B. D.</given-names></name> <name><surname>M&#x00F8;ller</surname> <given-names>N.</given-names></name> <name><surname>Vynn</surname> <given-names>S.</given-names></name> <name><surname>Savulescu</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>AUTOGEN: a personalized large language model for academic enhancement&#x2014;ethics and proof of principle</article-title>. <source>Am. J. Bioeth.</source> <volume>23</volume>, <fpage>28</fpage>&#x2013;<lpage>41</lpage>. doi: <pub-id pub-id-type="doi">10.1080/15265161.2023.2233356</pub-id>, PMID: <pub-id pub-id-type="pmid">37487183</pub-id></citation></ref>
<ref id="ref33"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Qureshi</surname> <given-names>R.</given-names></name> <name><surname>Shaughnessy</surname> <given-names>D.</given-names></name> <name><surname>Gill</surname> <given-names>K. A. R.</given-names></name> <name><surname>Robinson</surname> <given-names>K. A.</given-names></name> <name><surname>Li</surname> <given-names>T.</given-names></name> <name><surname>Agai</surname> <given-names>E.</given-names></name></person-group> (<year>2023</year>). <article-title>Are ChatGPT and large language models &#x2018;the answer&#x2019; to bringing us closer to systematic review automation?</article-title> <source>Syst. Rev.</source> <volume>12</volume>, <fpage>1</fpage>&#x2013;<lpage>4</lpage>. doi: <pub-id pub-id-type="doi">10.1186/s13643-023-02243-z</pub-id></citation></ref>
<ref id="ref34"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ressi</surname> <given-names>D.</given-names></name> <name><surname>Romanello</surname> <given-names>R.</given-names></name> <name><surname>Piazza</surname> <given-names>C.</given-names></name> <name><surname>Rossi</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>AI-enhanced blockchain technology: a review of advancements and opportunities</article-title>. <source>J. Netw. Comput. Appl.</source> <volume>225</volume>:<fpage>103858</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jnca.2024.103858</pub-id></citation></ref>
<ref id="ref35"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Reynolds</surname> <given-names>L.</given-names></name> <name><surname>McDonell</surname> <given-names>K.</given-names></name></person-group>, (<year>2021</year>). &#x201C;Prompt programming for large language models: beyond the few-shot paradigm,&#x201D; in <italic>Conference on Human Factors in Computing Systems - Proceedings</italic>. Association for Computing Machinery.</citation></ref>
<ref id="ref36"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Ross</surname> <given-names>S. I.</given-names></name> <name><surname>Martinez</surname> <given-names>F.</given-names></name> <name><surname>Houde</surname> <given-names>S.</given-names></name> <name><surname>Muller</surname> <given-names>M.</given-names></name> <name><surname>Weisz</surname> <given-names>J. D.</given-names></name></person-group>, (<year>2023</year>). &#x201C;The Programmer&#x2019;s assistant: conversational interaction with a large language model for software development,&#x201D; in <italic>International Conference on Intelligent User Interfaces, Proceedings IUI</italic>, Association for Computing Machinery. pp. 491&#x2013;514.</citation></ref>
<ref id="ref37"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Sarsa</surname> <given-names>S.</given-names></name> <name><surname>Denny</surname> <given-names>P.</given-names></name> <name><surname>Hellas</surname> <given-names>A.</given-names></name> <name><surname>Leinonen</surname> <given-names>J.</given-names></name></person-group>, (<year>2022</year>). &#x201C;Automatic generation of programming exercises and code explanations using large language models,&#x201D; in <italic>ICER 2022 - Proceedings of the 2022 ACM Conference on International Computing Education Research</italic>. Association for Computing Machinery, Inc. pp. 27&#x2013;43.</citation></ref>
<ref id="ref38"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Scells</surname> <given-names>H.</given-names></name> <name><surname>Schlatt</surname> <given-names>F.</given-names></name> <name><surname>Potthast</surname> <given-names>M.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Smooth operators for effective systematic review queries,&#x201D; in <italic>SIGIR 2023 - Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval</italic>, Association for Computing Machinery, Inc pp. 580&#x2013;590.</citation></ref>
<ref id="ref39"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Siino</surname> <given-names>M.</given-names></name> <name><surname>Falco</surname> <given-names>M.</given-names></name> <name><surname>Croce</surname> <given-names>D.</given-names></name> <name><surname>Rosso</surname> <given-names>P.</given-names></name></person-group> (<year>2025</year>). <article-title>Exploring LLMs applications in law: a literature review on current legal NLP approaches</article-title>. <source>IEEE Access</source> <volume>13</volume>, <fpage>18253</fpage>&#x2013;<lpage>18276</lpage>. doi: <pub-id pub-id-type="doi">10.1109/ACCESS.2025.3533217</pub-id></citation></ref>
<ref id="ref40"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Singh</surname> <given-names>I.</given-names></name> <name><surname>Blukis</surname> <given-names>V.</given-names></name> <name><surname>Mousavian</surname> <given-names>A.</given-names></name> <name><surname>Goyal</surname> <given-names>A.</given-names></name> <name><surname>Xu</surname> <given-names>D.</given-names></name> <name><surname>Tremblay</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2023</year>). &#x201C;ProgPrompt: generating situated robot task plans using large language models,&#x201D; in <italic>Proceedings - IEEE International Conference on Robotics and Automation</italic>, Institute of Electrical and Electronics Engineers Inc. pp. 11523&#x2013;11530.</citation></ref>
<ref id="ref41"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Singhal</surname> <given-names>K.</given-names></name> <name><surname>Azizi</surname> <given-names>S.</given-names></name> <name><surname>Tu</surname> <given-names>T.</given-names></name> <name><surname>Mahdavi</surname> <given-names>S. S.</given-names></name> <name><surname>Wei</surname> <given-names>J.</given-names></name> <name><surname>Chung</surname> <given-names>H. W.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Large language models encode clinical knowledge</article-title>. <source>Nature</source> <volume>620</volume>, <fpage>172</fpage>&#x2013;<lpage>180</lpage>. doi: <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>, PMID: <pub-id pub-id-type="pmid">37438534</pub-id></citation></ref>
<ref id="ref42"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Strobelt</surname> <given-names>H.</given-names></name> <name><surname>Webson</surname> <given-names>A.</given-names></name> <name><surname>Sanh</surname> <given-names>V.</given-names></name> <name><surname>Hoover</surname> <given-names>B.</given-names></name> <name><surname>Beyer</surname> <given-names>J.</given-names></name> <name><surname>Pfister</surname> <given-names>H.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Interactive and visual prompt engineering for ad-hoc task adaptation with large language models</article-title>. <source>IEEE Trans. Vis. Comput. Graph.</source> <volume>29</volume>, <fpage>1146</fpage>&#x2013;<lpage>1156</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TVCG.2022.3209479</pub-id>, PMID: <pub-id pub-id-type="pmid">36191099</pub-id></citation></ref>
<ref id="ref43"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Teubner</surname> <given-names>T.</given-names></name> <name><surname>Flath</surname> <given-names>C. M.</given-names></name> <name><surname>Weinhardt</surname> <given-names>C.</given-names></name> <name><surname>van der Aalst</surname> <given-names>W.</given-names></name> <name><surname>Hinz</surname> <given-names>O.</given-names></name></person-group> (<year>2023</year>). <article-title>Welcome to the era of ChatGPT et al.: the prospects of large language models</article-title>. <source>Bus. Inf. Syst. Eng.</source> <volume>65</volume>, <fpage>95</fpage>&#x2013;<lpage>101</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s12599-023-00795-x</pub-id></citation></ref>
<ref id="ref44"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Topsakal</surname> <given-names>O.</given-names></name> <name><surname>Akinci</surname> <given-names>T. C.</given-names></name></person-group> (<year>2023</year>). <article-title>Creating large language model applications utilizing LangChain: a primer on developing LLM apps fast</article-title>. <source>Int. Conf. Appl. Eng. Nat. Sci.</source> <volume>1</volume>, <fpage>1050</fpage>&#x2013;<lpage>1056</lpage>. doi: <pub-id pub-id-type="doi">10.59287/icaens.1127</pub-id></citation></ref>
<ref id="ref45"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Trott</surname> <given-names>S.</given-names></name> <name><surname>Jones</surname> <given-names>C.</given-names></name> <name><surname>Chang</surname> <given-names>T.</given-names></name> <name><surname>Michaelov</surname> <given-names>J.</given-names></name> <name><surname>Bergen</surname> <given-names>B.</given-names></name></person-group> (<year>2023</year>). <article-title>Do large language models know what humans know?</article-title> <source>Cogn. Sci.</source> <volume>47</volume>:<fpage>13309</fpage>. doi: <pub-id pub-id-type="doi">10.1111/cogs.13309</pub-id>, PMID: <pub-id pub-id-type="pmid">37401923</pub-id></citation></ref>
<ref id="ref46"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Urban</surname> <given-names>M.</given-names></name> <name><surname>Nguyen</surname> <given-names>D. D.</given-names></name> <name><surname>Binnig</surname> <given-names>C.</given-names></name></person-group>, (<year>2023</year>). &#x201C;OmniscientDB: a large language model-augmented DBMS that knows what other DBMSs do not know,&#x201D; <italic>Proceedings of the 6th International Workshop on Exploiting Artificial Intelligence Techniques for Data Management, aiDM 2023 - in conjunction with the 2023 ACM SIGMOD/PODS Conference</italic>.</citation></ref>
<ref id="ref47"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Vaithilingam</surname> <given-names>P.</given-names></name> <name><surname>Zhang</surname> <given-names>T.</given-names></name> <name><surname>Glassman</surname> <given-names>E. L.</given-names></name></person-group>, (<year>2022</year>). &#x201C;Expectation vs. experience: evaluating the usability of code generation tools powered by large language models,&#x201D; in <italic>Conference on Human Factors in Computing Systems - Proceedings.</italic> Association for Computing Machinery.</citation></ref>
<ref id="ref48"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Garg</surname> <given-names>S.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Kaddoum</surname> <given-names>G.</given-names></name> <name><surname>Jalil Piran</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>Toward accurate anomaly detection in industrial internet of things using hierarchical federated learning</article-title>. <source>IEEE Internet Things J.</source> <volume>9</volume>, <fpage>7110</fpage>&#x2013;<lpage>7119</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JIOT.2021.3074382</pub-id></citation></ref>
<ref id="ref49"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Garg</surname> <given-names>S.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Kaddoum</surname> <given-names>G.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Hassan</surname> <given-names>M. M.</given-names></name></person-group> (<year>2023</year>). <article-title>Heterogeneous Blockchain and AI-driven hierarchical trust evaluation for 5G-enabled intelligent transportation systems</article-title>. <source>IEEE Trans. Intell. Transp. Syst.</source> <volume>24</volume>, <fpage>1</fpage>&#x2013;<lpage>10</lpage>. doi: <pub-id pub-id-type="doi">10.1109/TITS.2021.3129417</pub-id>, PMID: <pub-id pub-id-type="pmid">40358844</pub-id></citation></ref>
<ref id="ref50"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Garg</surname> <given-names>S.</given-names></name> <name><surname>Lin</surname> <given-names>H.</given-names></name> <name><surname>Kaddoum</surname> <given-names>G.</given-names></name> <name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Hossain</surname> <given-names>M. S.</given-names></name></person-group> (<year>2022</year>). <article-title>A secure data aggregation strategy in edge computing and Blockchain-empowered internet of things</article-title>. <source>IEEE Internet Things J.</source> <volume>9</volume>, <fpage>14237</fpage>&#x2013;<lpage>14246</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JIOT.2020.3023588</pub-id></citation></ref>
<ref id="ref51"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>B.</given-names></name> <name><surname>Li</surname> <given-names>G.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Enabling conversational interaction with Mobile UI using large language models,&#x201D; in <italic>Conference on Human Factors in Computing Systems - Proceedings.</italic> Association for Computing Machinery.</citation></ref>
<ref id="ref52"><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>X.</given-names></name> <name><surname>Duan</surname> <given-names>R.</given-names></name> <name><surname>Ni</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>Unveiling security, privacy, and ethical concerns of ChatGPT</article-title>. <source>J. Inform. Intellig.</source> <volume>2</volume>, <fpage>102</fpage>&#x2013;<lpage>115</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jiixd.2023.10.007</pub-id>, PMID: <pub-id pub-id-type="pmid">40368751</pub-id></citation></ref>
<ref id="ref53"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Wu</surname> <given-names>T.</given-names></name> <name><surname>Terry</surname> <given-names>M.</given-names></name> <name><surname>Cai</surname> <given-names>C. J.</given-names></name></person-group>, (<year>2022</year>). &#x201C;AI chains: transparent and controllable human-AI interaction by chaining large language model prompts,&#x201D; in <italic>Conference on Human Factors in Computing Systems - Proceedings</italic>. Association for Computing Machinery.</citation></ref>
<ref id="ref54"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Xu</surname> <given-names>F. F.</given-names></name> <name><surname>Alon</surname> <given-names>U.</given-names></name> <name><surname>Neubig</surname> <given-names>G.</given-names></name> <name><surname>Hellendoorn</surname> <given-names>V. J.</given-names></name></person-group>, (<year>2022</year>). &#x201C;A systematic evaluation of large language models of code,&#x201D; in <italic>Association for Computing Machinery (ACM)</italic>. pp. 1&#x2013;10.</citation></ref>
<ref id="ref55"><citation citation-type="other"><person-group person-group-type="author"><name><surname>Zamfirescu-Pereira</surname> <given-names>J. D.</given-names></name> <name><surname>Wong</surname> <given-names>R. Y.</given-names></name> <name><surname>Hartmann</surname> <given-names>B.</given-names></name> <name><surname>Yang</surname> <given-names>Q.</given-names></name></person-group>, (<year>2023</year>). &#x201C;Why Johnny Can&#x2019;t prompt: how non-AI experts try (and fail) to design LLM prompts,&#x201D; in <italic>Conference on Human Factors in Computing Systems - Proceedings.</italic> Association for Computing Machinery.</citation></ref>
</ref-list>
</back>
</article>