<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="en" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Artif. Intell.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Artificial Intelligence</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Artif. Intell.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2624-8212</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/frai.2026.1665992</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>An efficient strategy for fine-tuning large language models</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Marsh</surname> <given-names>Benjamin</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<uri xlink:href="https://loop.frontiersin.org/people/3121046"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Michaleas</surname> <given-names>Adam</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<uri xlink:href="https://loop.frontiersin.org/people/2716285"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Ricke</surname> <given-names>Darrell O.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<uri xlink:href="https://loop.frontiersin.org/people/266322"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Monera</surname> <given-names>Shaun</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<uri xlink:href="https://loop.frontiersin.org/people/3376437"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Zembruski</surname> <given-names>Shriya</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<uri xlink:href="https://loop.frontiersin.org/people/3376441"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Marine Corps Tactical Systems Support Activity, United States Marine Corps</institution>, <city>Camp Pendleton, CA</city>, <country country="US">United States</country></aff>
<aff id="aff2"><label>2</label><institution>MIT Lincoln Laboratory, Artificial Intelligence Technology</institution>, <city>Lexington, MA</city>, <country country="US">United States</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Benjamin Marsh, <email xlink:href="mailto:brmarsh76@gmail.com">brmarsh76@gmail.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-17">
<day>17</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>9</volume>
<elocation-id>1665992</elocation-id>
<history>
<date date-type="received">
<day>14</day>
<month>07</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>19</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>26</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Marsh, Michaleas, Ricke, Monera and Zembruski.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Marsh, Michaleas, Ricke, Monera and Zembruski</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-17">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>Large Language Models (LLMs) achieve strong performance on many Natural Language Processing tasks, but adapting them to domain-specific applications is resource-intensive due to the cost of curating task-specific datasets and the compute required for fine-tuning. This work proposes an end-to-end strategy for rapidly fine-tuning LLMs for domain-specific tasks when both data and compute are limited.</p></sec>
<sec>
<title>Methods</title>
<p>The strategy uses Distilling Step-by-Step (DSS) for dataset development and model training, where a teacher model generates task labels and intermediate rationales via Chain-of-Thought prompting for a natural-language-to-Query-DSL structured generation task. Using the resulting supervision, we benchmark three fine-tuning modalities through hyperparameter sweeps: full-precision fine-tuning, Low-Rank Adaptation (LoRA), and Quantized LoRA (QLoRA). To isolate the effect of rationale supervision, we additionally conduct an ablation study comparing DSS training (label &#x0002B; rationale supervision) against a label-only configuration.</p></sec>
<sec>
<title>Results</title>
<p>Across the evaluated configurations, DSS combined with full-precision fine-tuning yields the strongest overall performance. Under resource constraints, DSS with LoRA provides an effective performance-efficiency tradeoff, and DSS with QLoRA enables training under tighter GPU memory budgets while maintaining competitive performance. In the parameter-efficient regimes, an alpha-to-rank ratio of 4:1 provides a consistent balance of performance and compute consumption across the explored settings.</p></sec>
<sec>
<title>Discussion</title>
<p>These findings support a practical process for resource-constrained domain adaptation: use DSS to efficiently construct datasets, then select the fine-tuning modality based on available compute (full-precision when feasible; LoRA or QLoRA when memory-limited). The proposed workflow offers a general guide for efficiently fine-tuning LLMs for domain-specific tasks with limited data availability.</p></sec></abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>distributed computing</kwd>
<kwd>fine-tuning</kwd>
<kwd>large language models</kwd>
<kwd>neural networks</kwd>
<kwd>NLP</kwd>
</kwd-group>
<funding-group>
  <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. Approved for public release. Distribution is unlimited. This material is based upon work supported by the Department of the Air Force under Air Force Contract No. FA8702-15-D-0001.</funding-statement>
</funding-group>
<counts>
<fig-count count="7"/>
<table-count count="4"/>
<equation-count count="3"/>
<ref-count count="33"/>
<page-count count="15"/>
<word-count count="9967"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Natural Language Processing</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>Large Language Models (LLMs) are state-of-the-art NLP tools based on the transformer neural network architecture (<xref ref-type="bibr" rid="B28">Vaswani et al., 2023</xref>). Classical Natural Language Processing (NLP) methods rely on mechanical translation and statistical methods for text prediction (<xref ref-type="bibr" rid="B1">Bengio et al., 2000</xref>). Classical fine-tuning methods for LLMs require significant resources and time to develop datasets and to execute the fine-tuning procedure. This paper introduces an approach to developing datasets and fine-tuning LLMs for domain-specific tasks with limited datasets in resource-constrained compute environments, namely, applying Distilling Step-by-Step (DSS) for dataset development in a distributed computing regime. This approach is then used as a base for a comparative analysis of established fine-tuning methods to determine the most efficient fine-tuning method for rapidly developing models for domain-specific tasks.</p>
<p>Although LLMs have been repeatedly shown to have excellent performance on text-based tasks, there are limitations that constrain their application for specialized use cases. One limitation is that LLMs are trained using large datasets that are composed of general-purpose text and often do not contain information on specialized domains such as law or medicine (<xref ref-type="bibr" rid="B15">Nori et al., 2023</xref>). This lack of specialized domain knowledge often results in LLM-generated text that appears coherent but is incorrect (<xref ref-type="bibr" rid="B32">Zhang et al., 2023</xref>). LLMs often consist of hundreds of millions to hundreds of billions of parameters. The sheer size of these models requires the use of systems with large amounts of compute capacity, which incurs a major financial cost for both the on-premises and cloud computing usage paradigms (<xref ref-type="bibr" rid="B2">Chen et al., 2023</xref>).</p>
<p>Fine-tuning and related strategies remain a standard approach for aligning pretrained LLMs to specialized tasks and constraints. We summarize the relevant literature on fine-tuning in Section 2. In this study, we benchmark full-precision fine-tuning against parameter-efficient alternatives, Low Rank Adaptation (LoRA) and Quantized Low Rank Adaptation (QLoRA) under GPU memory constraints consistent with limited compute environments. To reduce the cost of dataset development, we use Distilling Step-by-Step (DSS) (<xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>) to construct a task-specific dataset that contains both output labels and intermediate rationales, and we explicitly ablate rationale supervision versus label-only training. We use FLAN-T5 (Small/Base/Large/XL) as an instruction-tuned sequence-to-sequence translation exemplar to study scaling behavior across fine-tuning modalities on a structured-generation task. FLAN-T5 is selected due to its strong performance on translation tasks and its suitability for text-to-structured generation tasks (<xref ref-type="bibr" rid="B29">Wei et al., 2022</xref>).</p>
<p>In this work, we detail the extension of the DSS method to the FLAN-T5 language models and benchmark DSS performance across standard fine-tuning, LoRA, and QLoRA fine-tuning methods while using the novel dataset created via DSS: natural language questions to Query Domain-Specific Language (DSL). Additionally, while the DSS method provides an efficient mechanism for developing fine-tuning datasets that include both labels and intermediate reasoning chains, the contribution of rationales relative to label-only supervision remains an open methodological question. Previous research has shown that DSS improves model learning efficiency on open-source benchmark datasets (<xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>). However, a systematic comparison of DSS fine-tuning versus label-only fine-tuning has not been reported on task-specific fine-tuning datasets, especially in the context of parameter-efficient fine-tuning approaches and quantized optimization regimes.</p>
<p>To address this gap, this study includes an ablation experiment designed to isolate the effect of rationale supervision by comparing DSS against a label-only training configuration. The ablation spans multiple model sizes and fine-tuning methods to assess whether the contribution of rationales varies with model capacity or training constraints.</p>
<p>Large language models are a marked advance in the fields of deep learning and natural language processing. In this work, we emphasize the importance of efficiently fine-tuning these models for domain-specific tasks that fall outside of their original training data. This work provides a guide for efficiently fine-tuning LLMs for domain-specific tasks with limited data availability. DSS combined with full-precision fine-tuning provided better results. For resource-constrained environments, use Low Rank Adaptation fine-tuning with the Alpha to Rank ratio of 4:1 to balance performance and computation consumption. For GPU-limited environments, use Quantized Low Rank Adaptation fine-tuning with an Alpha to Rank ratio of 4:1. The code and instructions are available at the following Git Repository link: <ext-link ext-link-type="uri" xlink:href="https://github.com/brmarsh23/An-Efficient-Strategy-for-Fine-Tuning-Large-Language-Models">https://github.com/brmarsh23/An-Efficient-Strategy-for-Fine-Tuning-Large-Language-Models</ext-link>.</p></sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec>
<label>2.1</label>
<title>Domain adaptation and instruction-tuned sequence-to-sequence models</title>
<p>Modern LLMs are predominantly transformer-based (<xref ref-type="bibr" rid="B28">Vaswani et al., 2023</xref>) and achieve strong performance on general-purpose tasks via pretraining on large datasets. The text-to-text framing method introduced by the T5 family of encoder-decoder models (<xref ref-type="fig" rid="F1">Figure 1</xref>) provides a sequence-to-sequence translation process for a wide range of downstream tasks (<xref ref-type="bibr" rid="B19">Raffel et al., 2023</xref>). Building on this foundation, instruction tuning has been shown to substantially improve zero-shot generalization for T5 models, including FLAN-T5 (<xref ref-type="bibr" rid="B29">Wei et al., 2022</xref>). These instruction-tuned sequence-to-sequence models are therefore a practical starting point for domain-specific text generation tasks, particularly when the target can be expressed as a structured mapping from an input text prompt to an output text sequence.</p>
<fig position="float" id="F1">
<label>Figure 1</label>
<caption><p>Encoder-decoder architecture adapted from <xref ref-type="bibr" rid="B19">Raffel et al. (2023</xref>, <xref ref-type="fig" rid="F4">Figure 4</xref>, CC BY 4.0; changes: redrawn). T5 models use an encoder-decoder architecture with all input tokens fully visible in the encoder and encoder-decoder attention, and causal masking in the decoder. The encoder consists of identical layers of self-attention mechanisms, feedforward networks, and layer normalizations. The decoder consists of identical layers of self-attention mechanisms, feedforward networks, layer normalizations, and causal masking.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1665992-g0001.tif">
<alt-text content-type="machine-generated">Diagram showing an encoder-decoder neural network architecture with four green encoder blocks labeled x1 through x4 and three pink decoder blocks labeled y1, y2, and a dash, with interconnecting arrows between all layers.</alt-text>
</graphic>
</fig>
<p>However, general-purpose pretraining does not guarantee reliable performance in specialized domains. Empirical studies have documented failures such as confident but incorrect model outputs, described as hallucinations (<xref ref-type="bibr" rid="B32">Zhang et al., 2023</xref>), and poor performance on domain-specific tasks when relevant knowledge is underrepresented in pretraining data (<xref ref-type="bibr" rid="B15">Nori et al., 2023</xref>). More broadly, domain and task mismatch has motivated continued emphasis on model adaptation methods, including domain-specific pretraining and fine-tuning, as ways to align model behavior with a target downstream task (<xref ref-type="bibr" rid="B5">Gururangan et al., 2020</xref>; <xref ref-type="bibr" rid="B27">Tinn et al., 2023</xref>).</p>
</sec>
<sec>
<label>2.2</label>
<title>Full-precision and parameter-efficient fine-tuning</title>
<p>Full-precision fine-tuning adapts a pretrained model by updating a large fraction of parameters, often yielding strong downstream performance but incurring substantial computational and memory costs for transformer-scale models. These compute and memory costs arise from storing model parameters, gradients, and optimizer states and the distributed training overhead that scales with model size and sequence length. Large-scale optimization techniques such as ZeRO reduce memory redundancy and improve feasibility for large model training, but practical constraints remain for resource-limited environments (<xref ref-type="bibr" rid="B21">Rajbhandari et al., 2020</xref>).</p>
<p>Parameter-efficient fine-tuning (PEFT) approaches reduce fine-tuning compute and memory costs by restricting trainable parameters. PEFT approaches include LoRA (Low-Rank Adaptation) for injecting low-rank matrices, adapters for adding small bottleneck layers, prefix tuning/prompt tuning for learning continuous prompts, and QLoRA/DoRA for enhanced efficiency (quantization/decomposition). Herein, the PEFT methods LoRA and QLoRA are compared with full-precision fine-tuning. LoRA injects low-rank trainable adapter weight matrices into targeted model layers while freezing the base weights, enabling competitive downstream performance with fewer trainable parameters than the full-precision method (<xref ref-type="bibr" rid="B10">Hu et al., 2021</xref>). QLoRA extends this regime by quantizing the base model weights while training adapter weight matrices, improving memory efficiency and enabling fine-tuning of larger models under higher compute constraints (<xref ref-type="bibr" rid="B4">Dettmers et al., 2023</xref>). Surveys and unifying perspectives on PEFT further emphasize that adapter-based methods can provide strong performance-efficiency tradeoffs across tasks and model classes (<xref ref-type="bibr" rid="B6">Han et al., 2024</xref>; <xref ref-type="bibr" rid="B7">He et al., 2022</xref>).</p>
</sec>
<sec>
<label>2.3</label>
<title>Knowledge distillation, chain-of-thought, and rationale supervision</title>
<p>Knowledge distillation is the transfer of capability from a larger teacher model to a smaller student model by using teacher-produced targets as supervision for the student (<xref ref-type="bibr" rid="B8">Hinton et al., 2015</xref>). Distilling Step-by-Step (DSS) augments this paradigm by training on both the targets and intermediate rationales produced by the teacher model, typically elicited through chain-of-thought prompting (<xref ref-type="bibr" rid="B30">Wei et al., 2023</xref>; <xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>). DSS has been shown to improve data efficiency on benchmark reasoning tasks, enabling smaller models to approach or exceed the performance of larger models trained with conventional supervision under comparable compute and dataset constraints (<xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>).</p>
<p>However, the extent to which rationale supervision improves downstream performance may be task dependent. This open question motivates ablation studies that compare rationale-augmented training against label-only supervision under controlled training conditions, especially when combined with PEFT and quantized optimization regimes where model capacity and optimization dynamics differ from full-precision training (<xref ref-type="bibr" rid="B10">Hu et al., 2021</xref>; <xref ref-type="bibr" rid="B4">Dettmers et al., 2023</xref>).</p>
</sec>
<sec>
<label>2.4</label>
<title>Natural language to structured query generation</title>
<p>Mapping natural language questions to executable structured queries is a classic semantic parsing problem, widely studied in text-to-SQL and related benchmarks. Large-scale datasets such as Spider emphasize cross-domain generalization and complex compositional query generation (<xref ref-type="bibr" rid="B31">Yu et al., 2018</xref>), while earlier approaches established sequence-to-sequence structured query generation as a viable modeling strategy (<xref ref-type="bibr" rid="B33">Zhong et al., 2017</xref>). A persistent practical challenge in this area is output validity: syntactically invalid structured outputs are unusable even when semantically close to the target. Constrained decoding methods such as PICARD explicitly enforce admissible structure during autoregressive decoding and have been shown to improve validity and performance with models such as T5 (<xref ref-type="bibr" rid="B23">Scholak et al., 2021</xref>).</p>
<p>In operational systems, structured query representations also extend beyond SQL to domain-specific query languages. Query DSL, as used in OpenSearch, is one such representation and is commonly expressed as JSON (<xref ref-type="bibr" rid="B3">Contributors, 2024</xref>). This motivates evaluation setups that consider both semantic correctness and structural validity when fine-tuning LLMs for natural-language to Query-DSL translation.</p>
</sec>
<sec>
<label>2.5</label>
<title>Positioning of this work</title>
<p>Building on instruction-tuned sequence-to-sequence modeling (<xref ref-type="bibr" rid="B29">Wei et al., 2022</xref>; <xref ref-type="bibr" rid="B19">Raffel et al., 2023</xref>), PEFT methods (LoRA/QLoRA) (<xref ref-type="bibr" rid="B10">Hu et al., 2021</xref>; <xref ref-type="bibr" rid="B4">Dettmers et al., 2023</xref>), and DSS rationale-augmented fine-tuning (<xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>), this work proposes and evaluates a strategy for constructing a domain-specific dataset via DSS and benchmarks full-precision fine-tuning against LoRA and QLoRA under realistic resource constraints. Additionally, we include an ablation study comparing rationale-augmented training to label-only supervision to isolate the contribution of rationale supervision within this task setting.</p></sec>
</sec>
<sec sec-type="materials|methods" id="s3">
<label>3</label>
<title>Materials and methods</title>
<sec>
<label>3.1</label>
<title>Distilling step-by-step</title>
<p>Distilling Step-by-Step is a method by which a teacher LLM is used to generate feature labels and label rationales in order to train a smaller model to match the teacher model&#x00027;s performance on the task. DSS fine-tuned models have been shown to achieve superior performance, using smaller fine-tuning datasets, than larger models trained with standard fine-tuning methods on benchmark datasets and on few-shot learning tasks (<xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>).</p>
<p>The generation of label rationales was done by prompting teacher models using chain-of-thought (<xref ref-type="bibr" rid="B30">Wei et al., 2023</xref>). This technique provided intermediate rationales to explain the connections between the input and the label (<xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>). An example input prompt is shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. The prompt consists of three distinct parts that are designed to provide information on the DSL interface and dataset of the downstream task while inducing a chain-of-thought reasoning process in the model. The final section, <bold>Question</bold>, contains an example input question from the user. Using this prompt schema, the rationales and DSL JSON output labels were extracted from the teacher model.</p>
<fig position="float" id="F2">
<label>Figure 2</label>
<caption><p>Example input prompt. The prompt contains instructions for properly formatting DSL queries, descriptions of the downstream task dataset, and Chain-of-Thought prompting.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1665992-g0002.tif">
<alt-text content-type="machine-generated">Screenshot of text instructions and field descriptions guiding the creation of an OpenSearch DSL query in JSON format, including field definitions for sales reports, transaction areas, and locations, followed by a question about sales records trends in Europe over the last ten years.</alt-text>
</graphic>
</fig>
<p>Once rationales and labels were extracted for all inputs, the rationales were incorporated into the fine-tuning dataset by setting up the model fine-tuning as a multi-task problem. For each input, a task prefix was prepended to the input features. Each task prefix corresponds to the resulting output (label or rationale) and enables the student model to differentiate between each task (label generation or rationale generation). Thus, the student model was trained to generate the correct rationales and labels for each input.</p>
</sec>
<sec>
<label>3.2</label>
<title>Dataset description</title>
<p>This work utilized a fine-tuning dataset designed around the conversion of natural language questions to Query Domain-Specific Language, or Query DSL, for use by practitioners to query organizational data without requiring knowledge of database querying languages. Fine-tuning a language model to perform this translation task would enable practitioners to interact with databases in a more conversational way, leading to broader use. Query DSL is a flexible search language used by the OpenSearch data-analysis platform to parse through databases (<xref ref-type="bibr" rid="B3">Contributors, 2024</xref>). The fine-tuning dataset input features consisted of 1000 natural language questions to be translated to Query DSL for searching through the OpenSearch database. Each of these input questions was matched to an output label that consisted of a properly formatted Query DSL JavaScript Object Notation (JSON) object that provided the desired OpenSearch results based upon the input question and an output rationale that described how to generate the corresponding Query DSL JSON in a concise, step-by-step fashion. <xref ref-type="fig" rid="F3">Figure 3</xref> gives an example rationale and label generation process for a given input. <xref ref-type="fig" rid="F4">Figure 4</xref> shows an example training step using labels and rationales. Both the Query DSL output labels and rationales were generated as described in the previous subsection, using the open-source model Mixtral 8x22B, a Sparse Mixture of Experts (MOE) LLM (<xref ref-type="bibr" rid="B25">Team, 2024</xref>) as the teacher model. In total, the dataset consisted of 1000 input questions, along with 1000 output labels and rationale pairs. The full dataset was used for fine-tuning.</p>
<fig position="float" id="F3">
<label>Figure 3</label>
<caption><p>Overview of dataset creation using DSS. Chain-of-Thought prompting is used to extract rationales and labels from the teacher model. In this work, the Mixtral 8x22B LLM was used as the teacher model.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1665992-g0003.tif">
<alt-text content-type="machine-generated">Flowchart showing the process of generating OpenSearch DSL queries from a prompt. A user question about changes in Europe sales records over ten years is input into a &#x0201C;Teacher Model,&#x0201D; which outputs a JSON query as the Generated DSL Label and reasoning steps as the Generated Rationale Label, including identifying fields and building an aggregation query.</alt-text>
</graphic>
</fig>
<fig position="float" id="F4">
<label>Figure 4</label>
<caption><p>Overview of an example fine-tuning training step using a dataset generated by DSS. An input tag (rationale or label) was pre-pended to the input, and the student model was trained to generate the label that matches the input tag.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1665992-g0004.tif">
<alt-text content-type="machine-generated">Flowchart illustrating an input question about changes in sales records in Europe over the past 10 years being processed by a student model, which outputs a DSL query label and a rationale label describing query construction steps.</alt-text>
</graphic>
</fig>
</sec>
<sec>
<label>3.3</label>
<title>Model architecture</title>
<p>The multiple architectures of the FLAN T5 model that were used in this work are shown in <xref ref-type="table" rid="T1">Table 1</xref>. There are several key differences between each model variant. Most notably, the total number of parameters is governed by the number of encoder and decoder layers, the dimensionality (i.e., size) of the feedforward layers, and the number of attention heads per attention block within each encoder and decoder layer. Between the FLAN T5 Small, T5 Base, and T5 Large, there is a consistent increase in the dimensionality of the feedforward layers, the number of encoder and decoder layers, and the number of attention heads, which results in a consistent increase in the total number of parameters as the model size increases. However, the FLAN T5 XL has feedforward layers with four times as many parameters as the FLAN T5 Large and twice as many attention heads per block, resulting in a large increase in the total number of parameters in the FLAN T5 XL over the FLAN T5 Large. In this work, we utilized FLAN T5 Small, Base, Large, and XL architectures to determine the performance and efficiency of the three fine-tuning strategies (full-precision, LoRA, and QLoRA) using the Query DSL dataset.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>FLAN T5 model variants used in this work.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="center"><bold>Parameters</bold></th>
<th valign="top" align="center"><bold>Layers</bold></th>
<th valign="top" align="center"><bold>Size</bold></th>
<th valign="top" align="center"><bold>Heads</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">FLAN T5 Small</td>
<td valign="top" align="center">76.956</td>
<td valign="top" align="center">6</td>
<td valign="top" align="center">2,048</td>
<td valign="top" align="center">8</td>
</tr>
<tr>
<td valign="top" align="left">FLAN T5 Base</td>
<td valign="top" align="center">247.577</td>
<td valign="top" align="center">12</td>
<td valign="top" align="center">3,072</td>
<td valign="top" align="center">12</td>
</tr>
<tr>
<td valign="top" align="left">FLAN T5 Large</td>
<td valign="top" align="center">770.567</td>
<td valign="top" align="center">24</td>
<td valign="top" align="center">4,096</td>
<td valign="top" align="center">16</td>
</tr>
<tr>
<td valign="top" align="left">FLAN T5 XL</td>
<td valign="top" align="center">2,884.497</td>
<td valign="top" align="center">24</td>
<td valign="top" align="center">16,384</td>
<td valign="top" align="center">32</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Model size is measured in millions of parameters. The Layers column gives the number of encoder and decoder layers per model. The Size column gives the dimension of the feedforward layers. The number of attention heads is given per attention block.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<label>3.4</label>
<title>Fine-tuning methods</title>
<p>Fine-tuning is the process by which a pre-trained language model is trained on additional tasks using a smaller dataset than the pre-training corpus. The classical fine-tuning method is to make some or all of the language model parameters available for training using full floating-point precision model weights, then train the model weights using a lower learning rate than the learning rate used during pre-training. In this work, this classical fine-tuning method is referred to as full-precision fine-tuning. Although the full-precision method can be very effective in terms of performance, it comes at a high computational cost, especially in terms of GPU memory consumption (<xref ref-type="bibr" rid="B18">Quentin and Stella Biderman, 2023</xref>). Due to this high cost, efficient methods have been developed to provide a balance between cost and performance. Two such methods, along with full-precision fine-tuning, were investigated: LoRA and QLoRA fine-tuning (see <xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<fig position="float" id="F5">
<label>Figure 5</label>
<caption><p>Comparison of fine-tuning methods from <xref ref-type="bibr" rid="B4">Dettmers et al. (2023</xref>, <xref ref-type="fig" rid="F1">Figure 1</xref>, CC BY 4.0). Full-precision fine-tuning directly updates the model weights. LoRA fine-tuning trains rank decomposition weight matrices while holding full-precision model weights constant. QLoRA quantizes the full-precision model weights and utilizes memory-efficient paged optimizers.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1665992-g0005.tif">
<alt-text content-type="machine-generated">Diagram compares Full Finetuning, LoRA, and QLoRA approaches for transformer models, indicating connections between optimizer state, adapters, and base model with separate gradient flow, parameter updates, and paging flow symbols, showing Full Finetuning uses a 16-bit transformer without adapters, LoRA adds 16-bit adapters, while QLoRA uses 4-bit transformers, adapters, and offloads to CPU for efficient computation.</alt-text>
</graphic>
</fig>
<p>LoRA fine-tuning, originally demonstrated in <xref ref-type="bibr" rid="B10">Hu et al. (2021)</xref>, freezes the pre-trained model weights and injects trainable rank decomposition matrices into specified layers of the transformer architecture, which, due to reducing the number of trainable parameters for downstream tasks, theoretically reduces the computational cost of fine-tuning the model. As shown in <xref ref-type="bibr" rid="B10">Hu et al. (2021)</xref>, a GPT-3 175B model fine-tuned with the Adam optimizer and LoRA fine-tuning reduced the number of trainable parameters by 10,000 times and the GPU memory requirements by 3 times while performing on par or better than standard fine-tuning on several benchmarks (<xref ref-type="bibr" rid="B10">Hu et al., 2021</xref>).</p>
<p>QLoRA fine-tuning is designed for further reductions in computational cost without sacrificing model performance. Originally shown in <xref ref-type="bibr" rid="B4">Dettmers et al. (2023)</xref>, QLoRA quantizes the pre-trained model weights to 4-bit NormalFloat, a theoretically optimal quantization data type (<xref ref-type="bibr" rid="B4">Dettmers et al., 2023</xref>) and utilizes paged optimizers to avoid memory spikes that occur when processing mini-batches with long sequence lengths (<xref ref-type="bibr" rid="B4">Dettmers et al., 2023</xref>). QLoRA has been shown to match full-precision fine-tuning and LoRA fine-tuning performance on multiple benchmarks (<xref ref-type="bibr" rid="B4">Dettmers et al., 2023</xref>).</p>
</sec>
<sec>
<label>3.5</label>
<title>Distributed computing and training</title>
<p>To provide scalable distributed computing capabilities for model training, a two-node, GPU-enabled Rancher 2.0 Community Edition (<xref ref-type="bibr" rid="B22">Rancher, 2024</xref>) Kubernetes cluster was utilized for the hyperparameter search. The compute nodes were provisioned with two Intel Xeon Platinum 8480&#x0002B; processors, 2 TB RAM, four NVIDIA 80 GB H100 GPUs, and non-volatile memory express (NVME) solid-state storage.</p>
<p>Each variant of FLAN T5 was trained on the Query DSL dataset developed using DSS. Fine-tuning jobs were submitted using Ray Train configured for Distributed Data Parallel (DDP) training through the Accelerator API (<xref ref-type="bibr" rid="B26">Team, 2023</xref>).</p>
<p>The fine-tuning dataset was split in an 80/20 fashion into training and evaluation datasets, and all hyperparameter sweeps utilized the same split. The models were trained for 100 epochs with evaluation taking place after each epoch, and the evaluation loss was monitored in order to conduct learning rate reduction after 10 epochs of no improvement. Loss was calculated in accordance with (<xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>):</p>
<disp-formula id="EQ1"><mml:math id="M1"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(1)</label></disp-formula>
<p>where <italic>L</italic><sub><italic>label</italic></sub> is the label prediction loss and <italic>l</italic> is the cross-entropy loss between the predicted tokens <italic>x</italic><sub><italic>i</italic></sub> and label tokens <italic>y</italic><sub><italic>i</italic></sub>. Additionally, we framed the learning as a multi-task problem in accordance with <xref ref-type="bibr" rid="B9">Hsieh et al. (2023)</xref>, where we train the model to predict not only the task labels but also the rationales given an input:</p>
<disp-formula id="EQ2"><mml:math id="M2"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mi>o</mml:mi><mml:mi>t</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x003B1;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>-</mml:mo><mml:mi>&#x003B1;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(2)</label></disp-formula>
<p>where <italic>L</italic><sub><italic>total</italic></sub> is the combined loss between <italic>L</italic><sub><italic>label</italic></sub>, described above, with <italic>L</italic><sub><italic>rationale</italic></sub></p>
<disp-formula id="EQ3"><mml:math id="M3"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>L</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:mfrac><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mi>l</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math><label>(3)</label></disp-formula>
<p>this is the cross-entropy loss between the predicted tokens <italic>x</italic><sub><italic>i</italic></sub> and the rationale for the label <italic>r</italic><sub><italic>i</italic></sub>. In <xref ref-type="disp-formula" rid="EQ2">Equation 2</xref>, &#x003B1; was used as an additional training hyperparameter to tune how much loss the model accumulates from predicting rationales. For this work, &#x003B1; was set to 0.5 to balance the loss between predicted rationales and predicted labels.</p>
</sec>
<sec>
<label>3.6</label>
<title>Hyperparameter search</title>
<p>The hyperparameters that were utilized across all of the hyperparameter sweeps are shown in <xref ref-type="table" rid="T2">Table 2</xref>. The learning rate was based upon optimal results found in <xref ref-type="bibr" rid="B9">Hsieh et al. (2023)</xref> for the T5 neural network architectures, and the batch size was given as the number of training samples per batch across all nodes. In this work, one training sample per batch for each of the 8 GPU workers in the cluster results in a total batch size of 8 training samples per batch. As shown in <xref ref-type="disp-formula" rid="EQ2">Equation 2</xref>, Alpha was set to 0.5 to balance the loss resulting from rationales and labels.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Hyperparameters that were utilized across all of the fine-tuning runs, excluding model architecture.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Hyperparameter</bold></th>
<th valign="top" align="center"><bold>Value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Learning rate</td>
<td valign="top" align="center">5e-5</td>
</tr>
<tr>
<td valign="top" align="left">Learning rate patience</td>
<td valign="top" align="center">10 epochs</td>
</tr>
<tr>
<td valign="top" align="left">Learning rate factor</td>
<td valign="top" align="center">1e-1</td>
</tr>
<tr>
<td valign="top" align="left">Number of epochs</td>
<td valign="top" align="center">100</td>
</tr>
<tr>
<td valign="top" align="left">Total batch size</td>
<td valign="top" align="center">8</td>
</tr>
<tr>
<td valign="top" align="left">Alpha</td>
<td valign="top" align="center">0.5</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Total batch size is given as the number of training samples per batch across all nodes.</p>
</table-wrap-foot>
</table-wrap>
<p>Selected LoRA and QLoRA-specific hyperparameters (Target Modules and Dropout Rate) were held consistent throughout the hyperparameter sweeps for both methods. The target modules were selected based upon performance detailed in <xref ref-type="bibr" rid="B4">Dettmers et al. (2023)</xref>, and dropout rate (<xref ref-type="bibr" rid="B24">Srivastava et al., 2014</xref>) was selected based upon performance considerations given in <xref ref-type="bibr" rid="B14">Lin et al. (2024)</xref>. The other hyperparameter spaces searched during this work were the LoRA and QLoRA Rank and Alpha hyperparameters. The Rank hyperparameter determines the dimensionality of the LoRA/QLoRA matrices (<xref ref-type="bibr" rid="B10">Hu et al., 2021</xref>), and the Alpha hyperparameter serves as a scaling factor for the updates made by the LoRA/QLoRA matrices to the original model weights (<xref ref-type="bibr" rid="B10">Hu et al., 2021</xref>). For a given value of Rank, a series of Alpha hyperparameters were tested. The Rank values tested in this work were 32, 64, and 128. Alpha values paired with Rank 32 were 32, 64, and 128. Alpha values paired with Rank 64 were 32, 64, 128, and 256. Alpha values paired with Rank 128 were 32, 64, 128, 256, and 512. In this way, Alpha to Rank ratios of 1:4, 1:2, 1:1, 2:1, 3:1, and 4:1 were represented in the hyperparameter search to determine if an optimal Alpha to Rank ratio exists.</p>
<p>The hyperparameter search consisted of 86 hyperparameter sweeps spread across all three methods and for the four model types examined. For a given set of hyperparameters, a fine-tuning run for each model architecture and fine-tuning method was attempted, beginning with the smallest model. If the run was successful, the next model architecture, in order of increasing size, was utilized for the next run attempt. This process was continued for each successive model architecture until the models would not run due to GPU memory limitations on the cluster. After this failure point was reached, the next fine-tuning method would be selected, and the process would be repeated with the same hyperparameters, beginning with the smallest model. When all fine-tuning methods and model architecture sizes were attempted for a given set of hyperparameters, the hyperparameter set would be changed, and the entire process would begin anew. In total, 3 runs were performed using full-precision fine-tuning, 39 runs were performed with the LoRA fine-tuning method, and 44 runs with the QLoRA fine-tuning method. Twenty-nine hyperparameter sweeps were performed using the FLAN T5 Small model architecture, 29 using the FLAN T5 Base architecture, 29 using the FLAN T5 Large architecture, and 5 sweeps using the FLAN T5 XL architecture. Due to GPU memory usage constraints, all of the FLAN T5 XL runs were executed using the QLoRA fine-tuning method. The total compute time for the hyperparameter search was 499.6 hours.</p>
</sec>
<sec>
<label>3.7</label>
<title>Ablation study design</title>
<p>To quantify the contribution of DSS rationales to the performance of the downstream task, we performed an ablation comparing two variants of the loss function described in <xref ref-type="disp-formula" rid="EQ2">Equation 2</xref>: rationale-augmented supervision (Alpha = 0.5), where both label and rationale predictions contribute equally to the total loss, and label-only supervision (Alpha = 1.0), where loss due to rationales is ignored during optimization. All other hyperparameters were kept constant throughout to ensure that differences in performance could be attributed exclusively to the presence or absence of rationale supervision.</p>
<p>The ablation spanned all three fine-tuning modalities and three model sizes: FLAN-T5 Small, Base, and Large. These models were selected to capture the variation in the model size while allowing all modalities to be compared, given the computational constraints. Each ablation run used the same data set and train-validation split as the full experiments, and the results were averaged over two random seeds to reduce the variance due to initialization effects.</p>
</sec>
<sec>
<label>3.8</label>
<title>Evaluation strategy and metrics</title>
<p>For the final evaluation, the best evaluation loss achieved for each hyperparameter sweep was used to perform the final comparison. The fine-tuned student models across all modalities were tested using the same evaluation data. Additionally, the computational cost of each fine-tuning method was compared by using training samples per second and total training time metrics for each model and fine-tuning modality that were captured by the Rancher Kubernetes Cluster.</p></sec>
</sec>
<sec sec-type="results" id="s4">
<label>4</label>
<title>Results</title>
<p>First, the average metrics of the hyperparameter search for each model and fine-tuning method are shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. The bottom left chart shows the mean GPU memory usage per model type and fine-tuning method. For the FLAN-T5 Small and FLAN-T5 Base model types, the full precision method utilized the most memory on average, followed by the LoRA and QLoRA methods. For the FLAN-T5 Large model type, the full-precision method uses, on average, the least amount of GPU memory, followed by the QLoRA method. The LoRA method, on average, used the most GPU memory. The average memory usage in each fine-tuning method increased with increasing model size, but a marked difference was observed in the rate of increase of memory usage between the LoRA and QLoRA methods and the full precision method. The full-precision method had a relatively small increase in memory usage between the FLAN-T5 Base and FLAN-T5 Large model types, whereas the LoRA and QLoRA methods demonstrated over twice the average memory consumption during the FLAN-T5 Large model runs over the FLAN-T5 Base model runs.</p>
<fig position="float" id="F6">
<label>Figure 6</label>
<caption><p>Quad chart details the average computational cost per model for each fine-tuning method, along with the mean evaluation loss per model for each fine-tuning method.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1665992-g0006.tif">
<alt-text content-type="machine-generated">Four bar charts compare FLAN-T5 model sizes (Small, Base, Large, XL) and finetuning methods (FP, LoRA, QLoRA). Metrics shown are mean evaluation loss, training samples per second, GPU memory usage, and mean training time. QLoRA generally yields higher evaluation loss and memory usage, while LoRA reduces training time and memory needs. A legend identifies FP in purple, LoRA in brown, and QLoRA in light blue.</alt-text>
</graphic>
</fig>
<p>Next, computational metrics aside from GPU memory consumption are shown in the top right chart in <xref ref-type="fig" rid="F6">Figure 6</xref>; this chart shows the average training samples per second for each model and fine-tuning method. The training samples per second metric represents the total number of training samples processed across all workers in the cluster per second, not per individual worker. The full-precision method demonstrates the highest average training samples per second with the FLAN-T5 Small architecture, with the LoRA and QLoRA methods demonstrating the second-highest and third-highest averages, respectively. All fine-tuning methods demonstrate reduced training samples per second averages as the model architectures increase in size, but the full-precision method experiences the largest drop across architectures. The LoRA method demonstrates the highest average training samples per second with the FLAN-T5 Base, while the full-precision method comes in second, and QLoRA in third. For the FLAN-T5 Large model architecture, the full-precision method demonstrates the lowest average training samples per second, while LoRA has the highest, and QLoRA is in the middle. Notably, the FLAN-T5 XL model run with the QLoRA method demonstrates a small improvement in average training samples per second over the FLAN-T5 Large model runs. While average training samples per second are useful for showing a close look at training efficiency, the average total training time for an individual hyperparameter sweep gives an estimate of the total computational cost of training a model.</p>
<p>Third, the average total time of a hyperparameter sweep is shown in <xref ref-type="fig" rid="F6">Figure 6</xref> bottom right chart; this chart compares the average total time of a hyperparameter sweep, given in seconds, for each model architecture and fine-tuning method. The full-precision model method demonstrates a rapid increase in average total training time between the FLAN-T5 Small, Base and Large model architectures, going from the fastest method on average from the FLAN-T5 Small, to the second-fastest with FLAN-T5 Base, to the slowest method with FLAN-T5 Large. The LoRA and QLoRA methods, by comparison, demonstrate smaller, more consistent increases in average training time. Both methods demonstrate, roughly, a doubling of average training time between the FLAN-T5 Base and FLAN-T5 Large architectures. The QLoRA method shows a minor increase in average training time between the FLAN-T5 Large and FLAN-T5 XL architectures. The observation that the LoRA and QLoRA methods, on average, required more time to train than the full-precision method on smaller model sizes can be explained by the additional overhead required by these methods to compute forward and backward passes on the adapter matrices and, for QLoRA, to dequantize the 4-bit model during forward and backward passes. The full-precision method does not require this additional overhead, but the overall cost of tuning the full model leads to greater training times at the larger model sizes.</p>
<p>Fourth, the average evaluation loss per model architecture and fine-tuning method is shown in <xref ref-type="fig" rid="F6">Figure 6</xref>, top left chart. These averages take into account the entire hyperparameter search for each model architecture and fine-tuning method. The fine-tuning methods are consistent in their performance ranking across all of the model architectures. The full-precision fine-tuned models all perform better, on average, than the LoRA and QLoRA fine-tuned models, with the QLoRA fine-tuned models performing, on average, worse than the LoRA fine-tuned models. Observing all four charts together, the full-precision method, on average, has better performance and faster training times at the cost of higher GPU memory usage than the other fine-tuning methods for the FLAN-T5 Small and FLAN-T5 Base model architectures.</p>
<p>For the FLAN-T5 Large architecture, however, the full-precision fine-tuning method used less GPU memory, on average, than the LoRA and QLoRA methods. Although this finding may seem counterintuitive given the theoretical memory efficiencies of the LoRA and QLoRA methods, the result aligns with previous research (<xref ref-type="bibr" rid="B7">He et al., 2022</xref>). Specifically, the implementation of these methods introduces additional memory overhead in several different ways. First, while full-precision only updates the original model parameters, LoRA adds extra trainable matrices that, while small in comparison to the size of the original model, introduce additional parameters and optimizer states that must be stored in memory. This memory penalty increases as the number of layers in the original model increases. In this case, there are twice as many layers in the FLAN T5 Large and XL models as in the FLAN T5 Base model (see <xref ref-type="table" rid="T1">Table 1</xref>). These LoRA parameters and optimizer states are held in memory in addition to the original model during forward and backward passes, leading to additional memory consumption over full-precision fine-tuning. Next, the QLoRA method, while using a quantized original model during the training process, requires that the 4-bit model weights be dequantized to half-precision (FP16) during forward and backward passes (<xref ref-type="bibr" rid="B4">Dettmers et al., 2023</xref>), which, when combined with the aforementioned LoRA penalty, may help explain the increased memory consumption for the FLAN T5 Large and XL model sizes. Additionally, implementation differences between the full-precision and LoRA/QLoRA methods are another potential source of additional memory consumption. 
Full-precision utilizes standard PyTorch libraries (<xref ref-type="bibr" rid="B17">PyTorchFoundation, 2025</xref>) for training, whereas the other methods utilize Hugging Face Parameter-Efficient Fine-Tuning (PEFT) (<xref ref-type="bibr" rid="B13">HuggingFace, 2025b</xref>) and bitsandbytes (<xref ref-type="bibr" rid="B12">HuggingFace, 2025a</xref>) libraries, which may not be optimized for memory in the same way as the standard PyTorch libraries.</p>
<p>Shifting from hyperparameter search computational costs to performance results, beginning with the performance variations pertaining to the LoRA Alpha and Rank hyperparameters, <xref ref-type="fig" rid="F7">Figure 7</xref> shows the average evaluation loss, including all model architectures, for each combination of LoRA Rank and Alpha for the LoRA and QLoRA fine-tuning methods. On average, the LoRA fine-tuning method has better performance than the QLoRA fine-tuning method at higher Rank and Alpha, except Rank 64 Alpha 256. Interesting to note is that the average evaluation loss decreases with increasing Rank and increasing Alpha, and peak average performance for a given Rank parameter value occurs when Alpha is maximized. Additionally, peak average performance for any given Rank is achieved when the Alpha parameter value is maximized at four times the Rank value. These observations indicate that, for the LoRA and QLoRA fine-tuning methods, performance is strongly correlated with increasing Rank and Alpha, and, for QLoRA in particular, increasing the ratio of Alpha to Rank. Furthermore, there is no crossover point observed where increasing Rank, Alpha, and the ratio of Alpha to Rank results in a worse average performance than lower values for those hyperparameters.</p>
<fig position="float" id="F7">
<label>Figure 7</label>
<caption><p>Average evaluation loss of each rank and alpha combination for the LoRA and QLoRA fine-tuning methodologies across all model architectures.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="frai-09-1665992-g0007.tif">
<alt-text content-type="machine-generated">Two side-by-side heatmaps present mean evaluation loss values for LoRA and QLoRA runs, respectively, with axes labeled Alpha and Rank. Lower values, indicated by darker green shading, are found at higher Alpha and Rank levels.</alt-text>
</graphic>
</fig>
<p>The eight best fine-tuned models were ranked in terms of evaluation loss (see <xref ref-type="table" rid="T3">Table 3</xref>). The model architecture, LoRA Alpha and Rank parameters, if applicable, GPU memory usage, and total training time are also shown. The top-performing model, the full-precision fine-tuned FLAN-T5 Large, utilized the second-least amount of GPU memory during training but required the longest training time. While the LoRA and QLoRA methods are specifically designed to reduce GPU memory consumption, the opposite was observed in <xref ref-type="table" rid="T3">Table 3</xref> and <xref ref-type="fig" rid="F6">Figure 6</xref>; the LoRA and QLoRA methods required significantly more memory to train the highest-performing model type, the FLAN-T5 Large, than the full-precision method. The exception to this observed trend is the FLAN-T5 Large model fine-tuned with the LoRA method with a LoRA Rank of 128 and a LoRA Alpha of 512. This model required the least amount of GPU memory to train among the top eight models, with the second-highest training samples processed per second and the second-lowest total training time. An observed trend with the LoRA and QLoRA methods is that the ratio of LoRA Alpha to LoRA Rank among the top-performing models is, with two exceptions, either a 2:1 or 4:1 ratio. Another trend with the LoRA and QLoRA fine-tuned models is that the total training times for the LoRA method are all significantly less than the QLoRA method, while the GPU memory usage for the QLoRA method trends lower than the LoRA method. This trend is consistent with the implementation differences between LoRA and QLoRA, as QLoRA dequantizes the base model during forward and backward passes, incurring additional training time penalties. Also notable is that the top eight models are dominated by the FLAN-T5 Large model architecture, with the FLAN-T5 XL making a sole appearance in third place. 
Also notable is that the LoRA fine-tuning method appears five times in the top eight, the QLoRA method appears twice, and the full-precision method appears only once.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Top eight models with lowest evaluation loss.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Loss</bold></th>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>LoRA Alpha</bold></th>
<th valign="top" align="center"><bold>LoRA rank</bold></th>
<th valign="top" align="center"><bold>GPU usage</bold></th>
<th valign="top" align="center"><bold>Training time (s)</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">0.06384</td>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">Full-precision</td>
<td valign="top" align="center">X</td>
<td valign="top" align="center">X</td>
<td valign="top" align="center">341</td>
<td valign="top" align="center">67,419</td>
</tr>
<tr>
<td valign="top" align="left">0.06870</td>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">LoRA</td>
<td valign="top" align="center">512</td>
<td valign="top" align="center">128</td>
<td valign="top" align="center">291</td>
<td valign="top" align="center">28,680</td>
</tr>
<tr>
<td valign="top" align="left">0.06874</td>
<td valign="top" align="left">FLAN-T5 XL</td>
<td valign="top" align="left">QLoRA</td>
<td valign="top" align="center">256</td>
<td valign="top" align="center">64</td>
<td valign="top" align="center">497</td>
<td valign="top" align="center">31,297</td>
</tr>
<tr>
<td valign="top" align="left">0.06959</td>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">QLoRA</td>
<td valign="top" align="center">512</td>
<td valign="top" align="center">128</td>
<td valign="top" align="center">499</td>
<td valign="top" align="center">34,208</td>
</tr>
<tr>
<td valign="top" align="left">0.07039</td>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">LoRA</td>
<td valign="top" align="center">64</td>
<td valign="top" align="center">64</td>
<td valign="top" align="center">580</td>
<td valign="top" align="center">27,745</td>
</tr>
<tr>
<td valign="top" align="left">0.07057</td>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">LoRA</td>
<td valign="top" align="center">64</td>
<td valign="top" align="center">128</td>
<td valign="top" align="center">584</td>
<td valign="top" align="center">29,116</td>
</tr>
<tr>
<td valign="top" align="left">0.07089</td>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">LoRA</td>
<td valign="top" align="center">256</td>
<td valign="top" align="center">64</td>
<td valign="top" align="center">573</td>
<td valign="top" align="center">29,317</td>
</tr>
<tr>
<td valign="top" align="left">0.07101</td>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">LoRA</td>
<td valign="top" align="center">256</td>
<td valign="top" align="center">128</td>
<td valign="top" align="center">576</td>
<td valign="top" align="center">29,582</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Fine-tuning method, GPU Memory usage (GB), LoRA Rank and Alpha (if applicable), and Total Training Time (seconds) are shown for each model.</p>
</table-wrap-foot>
</table-wrap>
<p>Finally, the results from the ablation study compared the loss obtained when training with and without DSS rationales across model sizes and fine-tuning methods. <xref ref-type="table" rid="T4">Table 4</xref> summarizes the results. Across all eight configurations in the ablation study, training with DSS rationales (&#x003B1; = 0.5) yielded lower evaluation loss than label-only training (&#x003B1; = 1.0).</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Ablation study comparing DSS (&#x003B1; &#x0003D; 0.5) vs. label-only training (&#x003B1; &#x0003D; 1.0) across model architectures and fine-tuning methods.</p></caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th valign="top" align="left"><bold>Model</bold></th>
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>Loss (&#x003B1; &#x0003D; 0.5)</bold></th>
<th valign="top" align="center"><bold>Loss (&#x003B1; &#x0003D; 1.0)</bold></th>
<th valign="top" align="center"><bold>Loss difference</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">FLAN-T5 small</td>
<td valign="top" align="left">Full-precision</td>
<td valign="top" align="center">0.08826</td>
<td valign="top" align="center">0.08966</td>
<td valign="top" align="center">&#x0002B;1.4e-3</td>
</tr>
<tr>
<td valign="top" align="left">FLAN-T5 small</td>
<td valign="top" align="left">LoRA</td>
<td valign="top" align="center">0.08247</td>
<td valign="top" align="center">0.08356</td>
<td valign="top" align="center">&#x0002B;1.1e-3</td>
</tr>
<tr>
<td valign="top" align="left">FLAN-T5 small</td>
<td valign="top" align="left">QLoRA</td>
<td valign="top" align="center">0.08163</td>
<td valign="top" align="center">0.09758</td>
<td valign="top" align="center">&#x0002B;1.6e-2</td>
</tr>
<tr>
<td valign="top" align="left">FLAN-T5 base</td>
<td valign="top" align="left">Full-precision</td>
<td valign="top" align="center">0.07533</td>
<td valign="top" align="center">0.08021</td>
<td valign="top" align="center">&#x0002B;4.9e-3</td>
</tr>
<tr>
<td valign="top" align="left">FLAN-T5 base</td>
<td valign="top" align="left">LoRA</td>
<td valign="top" align="center">0.07578</td>
<td valign="top" align="center">0.07603</td>
<td valign="top" align="center">&#x0002B;2.5e-4</td>
</tr>
<tr>
<td valign="top" align="left">FLAN-T5 base</td>
<td valign="top" align="left">QLoRA</td>
<td valign="top" align="center">0.07739</td>
<td valign="top" align="center">0.08211</td>
<td valign="top" align="center">&#x0002B;4.7e-3</td>
</tr>
<tr>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">LoRA</td>
<td valign="top" align="center">0.06870</td>
<td valign="top" align="center">0.06977</td>
<td valign="top" align="center">&#x0002B;1.1e-3</td>
</tr>
<tr>
<td valign="top" align="left">FLAN-T5 large</td>
<td valign="top" align="left">QLoRA</td>
<td valign="top" align="center">0.06959</td>
<td valign="top" align="center">0.07326</td>
<td valign="top" align="center">&#x0002B;3.7e-3</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Metrics are averaged over two random seeds.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec sec-type="discussion" id="s5">
<label>5</label>
<title>Discussion</title>
<p>This work evaluated the performance of three fine-tuning methods&#x02014;full-precision, LoRA, and QLoRA&#x02014;across multiple FLAN-T5 model architectures, providing insights into the trade-offs between computational cost, training time, and evaluation performance between those methods. Additionally, the DSS method was utilized to efficiently create a unique dataset utilized for the natural language query to Query DSL translation task, and an ablation study was performed to test the hypothesis of DSS rationales improving model training. Together, these components demonstrate a scalable development pipeline for deploying fine-tuned language models in real-world applications that align with and expand on recent advances in efficient fine-tuning methodologies and dataset creation techniques (<xref ref-type="bibr" rid="B11">Hu et al., 2024</xref>; <xref ref-type="bibr" rid="B16">Oliver and Wang, 2024</xref>; <xref ref-type="bibr" rid="B20">Rajabzadeh et al., 2024</xref>).</p>
<p>While the experimental results and ablation findings provide empirical support for the proposed fine-tuning strategy, the broader application is that efficient datasets and efficient computation are directly linked. The following sections interpret the results toward applications in production environments via trade-offs between performance and resource cost, scalability across model sizes and compute budgets, and the specific contribution of DSS rationales to downstream task learning.</p>
<sec>
<label>5.1</label>
<title>Performance trade-offs</title>
<p>The results shown in <xref ref-type="table" rid="T3">Table 3</xref> demonstrate clear trade-offs between evaluation performance and GPU memory consumption in fine-tuning methods and model architectures. Although the full-precision fine-tuning method consistently achieved better evaluation loss across the model architectures, with the exception of the FLAN-T5 XL, where resource limitations prevented its employment, it did so at the cost of significantly higher average GPU memory usage for the smaller model architectures (FLAN-T5 Small and FLAN-T5 Base). This result highlights the inherent challenge of employing the full-precision fine-tuning method in resource-constrained environments, where GPU memory is often limited. These GPU memory limitations often prevent, and in this case, did prevent, the application of larger and potentially more capable model architectures to specific problems in environments where GPU memory is a finite resource (<xref ref-type="fig" rid="F6">Figure 6</xref>). This GPU memory cost, however, is offset by the smaller number of training runs required to find the hyperparameter combination that yielded better evaluation performance than the other methods. This offset suggests that while GPU memory-based costs may be a higher up-front cost associated with full-precision fine-tuning, the lower number of runs results in a cheaper total hyperparameter search in terms of total compute time. This demonstrates that for environments where GPU compute capacity is not a constraint and/or environments where time-based usage is a constraint, then full-precision is the most efficient and appropriate method. In practical terms, these findings support a decision framework for selecting a fine-tuning method based on deployment constraints. When peak task performance and shortest model-development cycles are the primary objective, full-precision is the most effective choice. 
When memory availability is the dominant constraint and model size must be maximized within fixed GPU memory, QLoRA is a suitable option. When both time and GPU memory are constraints, LoRA is the optimal choice. Rather than a single optimal method, the results indicate that environments with varied constraints benefit from different fine-tuning methodology selections.</p>
</sec>
<sec>
<label>5.2</label>
<title>Scalability</title>
<p>When GPU resources are a primary constraint, the LoRA and QLoRA methods provide memory-efficient alternatives to full-precision fine-tuning for the smaller model architectures like FLAN-T5 Small and FLAN-T5 Base based upon the comparison of evaluation loss performance and memory consumption. When combined with DSS, which reduces the effort required to generate a high-quality dataset for fine-tuning, these methods offer practical solutions for real-world fine-tuning applications where GPU resources are the primary constraint. By reducing dataset creation time, dataset complexity, and memory consumption simultaneously, this pipeline (DSS plus LoRA/QLoRA fine-tuning) provides an opportunity for rapidly deploying fine-tuned, task-specific, language models on memory-constrained devices or smaller cloud configurations.</p>
<p>The memory efficiency of these fine-tuning methods became less evident as model size increased. The LoRA and QLoRA methods required significantly more memory, on average, than the full-precision method when using the FLAN-T5 Large architecture, but the QLoRA method was the only one that enabled the fine-tuning of the FLAN-T5 XL model with the available resources. Given this, the QLoRA method provides an avenue to fine-tune large model architectures on smaller compute clusters and devices at a reasonable performance trade-off to the full-precision method. DSS complements this scalability by enabling the efficient creation of datasets optimized for model fine-tuning, further reducing the total time required to achieve strong performance on domain-specific tasks. In practice, employing the outlined strategy using QLoRA will enable the fine-tuning of large model architectures in memory-constrained environments.</p>
<p>The marginal improvement in QLoRA&#x00027;s training samples per second was observed for the FLAN-T5 XL architecture over the FLAN-T5 Large architecture (<xref ref-type="fig" rid="F4">Figure 4</xref>). This shows that QLoRA&#x00027;s theoretical optimizations over the LoRA method, such as its quantization and paged-memory strategies, provide real-world advantages in computational cost for fine-tuning larger models. This efficiency advantage in samples per second is contrasted by the increased mean training time per run for the QLoRA method. The advantages of requiring less compute are offset by utilizing those resources for longer periods of time.</p>
<p>With regards to the scalability of the hyperparameter search, full-precision fine-tuning included searching over 7 hyperparameters (<xref ref-type="table" rid="T2">Table 2</xref>) that were baselined according to previous research and only required 3 runs to achieve strong performance. The LoRA and QLoRA methods required searching over an additional 4 hyperparameters (Rank, Alpha, Target Modules, Dropout Rate) that required a large number of runs to converge upon values that enabled comparable performance to the full-precision method.</p>
<p>The analysis of LoRA and QLoRA fine-tuned models revealed a strong correlation between evaluation performance and the values of the LoRA Rank and Alpha hyperparameters. Higher Rank and Alpha values led to improved average evaluation loss, with peak performance observed at ratios of 4:1 for Alpha to Rank. This suggests that increasing both parameters enables the model to learn richer representations without significantly impacting computational efficiency. It is important to note that while this work did not examine Alpha and Rank values that would exceed that 4:1 ratio, there was no indication in the data that increasing Rank and Alpha further would decrease model performance or significantly decrease the computational efficiency of fine-tuning. This suggests that further increases in Rank and Alpha values, and larger ratios of Alpha to Rank, may provide additional boosts to fine-tuning performance. For this reason, this work recommends using a LoRA Alpha to Rank ratio of at least 4:1, with the highest values for Alpha and Rank feasible, as a starting point for hyperparameter searches using these methods.</p>
</sec>
<sec>
<label>5.3</label>
<title>Effect of DSS rationales on model training</title>
<p>The ablation results in <xref ref-type="table" rid="T4">Table 4</xref> were designed to isolate the effect of DSS rationales by comparing training with rationales included (&#x003B1; = 0.5) with training with only labels (&#x003B1; = 1.0) across multiple model sizes and fine-tuning methods. Across the eight configurations examined, training with included DSS rationales yielded lower evaluation loss than label-only training, demonstrating that DSS training consistently improves model fine-tuning on the Query DSL translation task.</p>
<p>The magnitude of the improvement varied as a function of model size and fine-tuning strategy. The largest reduction in evaluation loss occurred in settings where model size and update precision were most limited: FLAN-T5 Small and, in particular, FLAN-T5 Small using QLoRA. In contrast, the smallest performance differences were observed for larger model sizes with higher precision, suggesting that the larger models were able to learn much of the task structure from the prompt and output labels alone. Taken together, these results indicate that rationale supervision acts as a structured input signal that improves learning efficiency for all models and is more impactful when computational constraints impose limits on the size of the student model. Even for larger models, where improvements were smaller, rationales continued to provide measurable improvement, indicating a comprehensive added benefit rather than an effect limited to small model architectures.</p>
<p>These observations align with prior work demonstrating that rationales improve learning efficiency by imposing intermediate reasoning that structures model learning (<xref ref-type="bibr" rid="B9">Hsieh et al., 2023</xref>). The present study extends those findings by showing that rationale supervision remains beneficial even when paired with parameter-efficient fine-tuning and quantized model weights on a more complex task, variables that were not present in earlier DSS research. This suggests that DSS rationales are not only a mechanism for improving few-shot generalization but also a tool for accelerating model learning convergence in resource-constrained fine-tuning pipelines.</p>
</sec>
<sec>
<label>5.4</label>
<title>Comparative analysis of fine-tuning methods</title>
<p>Among the fine-tuning methods, full-precision is shown to be the highest-performing and most computationally efficient method in terms of total time to converge on optimal hyperparameters. These attributes make the combination of DSS and full-precision fine-tuning the most attractive choice for a wide range of applications, particularly in scenarios requiring rapid development cycles or constrained computational budgets. The combination of DSS with the LoRA and QLoRA methods, while less effective in terms of evaluation loss than full-precision, exhibited more efficient GPU memory usage for most model architectures and enabled the training of larger models at the cost of longer training times and a longer total hyperparameter search. These observations make LoRA and QLoRA fine-tuning methods practical options for tasks where memory constraints are a primary consideration and/or where lengthy hyperparameter searches may not be a concern, such as edge deployments, GPU-constrained systems, or limited cloud environments.</p>
<p>Taken together, the performance benchmarking, scalability assessment, and ablation findings indicate that efficiency in dataset construction and efficiency in model fine-tuning accelerate the application of fine-tuned models to specific tasks. DSS reduces the data requirements across all fine-tuning methods, while the constraints of a given resource environment inform the choice between full-precision, LoRA, and QLoRA without altering the underlying training pipeline. From this, the approach introduced in this study can be applied as a modular framework where DSS serves as a consistent foundation for efficient dataset creation, and the fine-tuning method can be selected based on available compute resources, model size, and operational constraints. As a result, the choice of fine-tuning method becomes a deployment decision rather than a performance constraint.</p>
</sec>
<sec>
<label>5.5</label>
<title>Limitations</title>
<p>Several limitations should be considered when interpreting this work and can be considered directions for future study. First, the empirical evaluation is confined to a single downstream task: translation from natural language questions to Query DSL for a specific downstream data environment. Although this task is representative of a broader family of natural language to formal language translation problems, the conclusions may not directly transfer to other domains, such as open-ended text generation, conversational dialogue, or classification tasks without further validation. Second, the evaluation metrics used in this study are limited. The primary metric is token-level evaluation loss on a held-out validation set, supplemented by computational metrics such as training samples per second, total training time, and GPU memory usage. While useful for characterizing training dynamics and resource consumption, and while all training examples were screened for correctness and validity, the metrics do not directly capture task-level correctness, such as exact match rates on the DSL JSON. Although the ablation shows consistent gains in loss when rationales are included, future work should incorporate additional metrics, such as Bilingual Evaluation Understudy (BLEU), Metric for Evaluation of Translation with Explicit Ordering (METEOR), and Translation Edit Rate (TER) scores, to provide a more complete picture of model performance.</p>
<p>Due to the hyperparameter search and ablation experiments being constrained by finite computational resources, not all combinations of model size and fine-tuning methods could be explored, and therefore, conclusions about scalability are extrapolated from a subset of possible configurations. Similarly, the ablation results are averaged over a limited number of random seeds, which may not fully characterize the variance in training outcomes. Additionally, the methodology focuses on the FLAN-T5 encoder-decoder family and does not include decoder-only architectures that are prevalent in many production LLM deployments. Although the methods presented here are compatible with decoder-only models with only minor code changes, empirical validation on such architectures remains an open direction.</p></sec></sec>
<sec id="s6">
<label>6</label>
<title>Conclusions</title>
<p>Large language models are providing large steps forward in the fields of deep learning and natural language processing. In this work, we emphasize the importance of efficiently fine-tuning these models for domain-specific tasks that fall outside of their original training data. The efficiency of the methods used in creating the datasets used for fine-tuning models toward these tasks, and the computational efficiency of the methods used in fine-tuning the models themselves, have significant ramifications on the widespread adoption and use of LLMs in production settings where these models need to be tuned for specific uses. This study provided an efficient strategy for fine-tuning large language models for specific tasks. The combination of the Distilling Step-by-Step method, which enables the rapid creation of fine-tuning datasets that include rationales in addition to labels, and full-precision fine-tuning provides an end-to-end fine-tuning method for rapid development and deployment of task-specific models that match or exceed the performance of larger models on those tasks. Additionally, the comprehensive hyperparameter search and comparative analysis of the fine-tuning methods using the FLAN T5 architecture as an exemplar demonstrate that for resource-constrained environments, DSS with LoRA for fine-tuning, using a LoRA Alpha to Rank ratio of 4:1, balances performance and computation consumption. For environments where GPU compute capacity is a primary concern, utilizing DSS with QLoRA fine-tuning with an Alpha to Rank ratio of 4:1 is a performant strategy. The integration of these methods into production pipelines using a variety of LLM architectures has the potential to significantly decrease the time and cost to produce task-specific models, enabling further integration and research into the utility of these systems.</p></sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The datasets presented in this article are not readily available because the dataset utilized in the submission is Controlled Unclassified Information (CUI) from US Department of Defense computer information systems. The methods detailed in the submission are Distribution A. Requests to access the datasets should be directed to <email>benjamin.marsh&#x00040;usmc.mil</email>.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>BM: Conceptualization, Data curation, Formal analysis, Investigation, Methodology, Project administration, Software, Supervision, Validation, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing, Visualization. AM: Conceptualization, Funding acquisition, Project administration, Resources, Software, Supervision, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. DR: Conceptualization, Formal analysis, Investigation, Methodology, Project administration, Supervision, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. SM: Data curation, Methodology, Project administration, Resources, Software, Visualization, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. SZ: Data curation, Formal analysis, Methodology, Validation, Visualization, Writing &#x02013; review &#x00026; editing, Writing &#x02013; original draft.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s10">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s11">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s12">
<title>Author disclaimer</title>
<p>Any opinions, findings, conclusions or recommendations expressed in this material are those of the author(s) and do not necessarily reflect the views of the Department of the Air Force.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bengio</surname> <given-names>Y.</given-names></name> <name><surname>Ducharme</surname> <given-names>R.</given-names></name> <name><surname>Vincent</surname> <given-names>P.</given-names></name></person-group> (<year>2000</year>). <article-title>A neural probabilistic language model</article-title>. <source>J. Mach. Learn. Res</source>. <volume>3</volume>, <fpage>932</fpage>&#x02013;<lpage>938</lpage>.</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname> <given-names>L.</given-names></name> <name><surname>Zaharia</surname> <given-names>M.</given-names></name> <name><surname>Zou</surname> <given-names>J.</given-names></name></person-group> (<year>2023</year>). <article-title>FrugalGPT: how to use large language models while reducing cost and improving performance</article-title>. <source>arXiv</source> [preprint] arXiv:2305.05176. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2305.05176</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="web"><collab>OpenSearch Contributors</collab> (<year>2024</year>). <source>Query DSL</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://opensearch.org/docs/latest/query-dsl/">https://opensearch.org/docs/latest/query-dsl/</ext-link> (Accessed October 10, 2025).</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Dettmers</surname> <given-names>T.</given-names></name> <name><surname>Pagnoni</surname> <given-names>A.</given-names></name> <name><surname>Holtzman</surname> <given-names>A.</given-names></name> <name><surname>Zettlemoyer</surname> <given-names>L.</given-names></name></person-group> (<year>2023</year>). <article-title>QLoRA: Efficient finetuning of quantized LLMs</article-title>. <source>arXiv</source> [preprint] arXiv:2305.14314. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2305.14314</pub-id></mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gururangan</surname> <given-names>S.</given-names></name> <name><surname>Marasovi&#x00107;</surname> <given-names>A.</given-names></name> <name><surname>Swayamdipta</surname> <given-names>S.</given-names></name> <name><surname>Lo</surname> <given-names>K.</given-names></name> <name><surname>Beltagy</surname> <given-names>I.</given-names></name> <name><surname>Downey</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;Don&#x00027;t stop pretraining: Adapt language models to domains and tasks,&#x0201D;</article-title> in <source>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</source>, eds. <person-group person-group-type="editor"><name><surname>Jurafsky</surname> <given-names>D.</given-names></name> <name><surname>Chai</surname> <given-names>J.</given-names></name> <name><surname>Schluter</surname> <given-names>N.</given-names></name> and <name><surname>Tetreault</surname> <given-names>J.</given-names></name></person-group> (<publisher-loc>Kerrville, TX</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>8342</fpage>&#x02013;<lpage>8360</lpage>.</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Han</surname> <given-names>Z.</given-names></name> <name><surname>Gao</surname> <given-names>C.</given-names></name> <name><surname>Liu</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>J.</given-names></name> <name><surname>Zhang</surname> <given-names>S. Q.</given-names></name></person-group> (<year>2024</year>). <article-title>Parameter-efficient fine-tuning for large models: a comprehensive survey</article-title>. <source>arXiv</source> [preprint] arXiv:2403.14608. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2403.14608</pub-id></mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>He</surname> <given-names>J.</given-names></name> <name><surname>Zhou</surname> <given-names>C.</given-names></name> <name><surname>Ma</surname> <given-names>X.</given-names></name> <name><surname>Berg-Kirkpatrick</surname> <given-names>T.</given-names></name> <name><surname>Neubig</surname> <given-names>G.</given-names></name></person-group> (<year>2022</year>). <article-title>&#x0201C;Towards a unified view of parameter-efficient transfer learning,&#x0201D;</article-title> in <source>Proceedings of the 10th International Conference on Learning Representations (ICLR-2022)</source> (<publisher-loc>Red Hook, NY</publisher-loc>).</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hinton</surname> <given-names>G.</given-names></name> <name><surname>Vinyals</surname> <given-names>O.</given-names></name> <name><surname>Dean</surname> <given-names>J.</given-names></name></person-group> (<year>2015</year>). <article-title>Distilling the knowledge in a neural network</article-title>. <source>arXiv</source> [preprint] arXiv:1503.02531.</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hsieh</surname> <given-names>C.-Y.</given-names></name> <name><surname>Li</surname> <given-names>C.-L.</given-names></name> <name><surname>Yeh</surname> <given-names>C.-K.</given-names></name> <name><surname>Nakhost</surname> <given-names>H.</given-names></name> <name><surname>Fujii</surname> <given-names>Y.</given-names></name> <name><surname>Ratner</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Distilling step-by-step! outperforming larger language models with less training data and smaller model sizes</article-title>. <source>arXiv</source> [preprint] arXiv:2305.02301. doi: <pub-id pub-id-type="doi">10.18653/v1/2023.findings-acl.507</pub-id></mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>E. J.</given-names></name> <name><surname>Shen</surname> <given-names>Y.</given-names></name> <name><surname>Wallis</surname> <given-names>P.</given-names></name> <name><surname>Allen-Zhu</surname> <given-names>Z.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Wang</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>LORA: low-rank adaptation of large language models</article-title>. <source>arXiv</source> [preprint] arXiv:2106.09685.</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>J.</given-names></name> <name><surname>Liao</surname> <given-names>X.</given-names></name> <name><surname>Gao</surname> <given-names>J.</given-names></name> <name><surname>Qi</surname> <given-names>Z.</given-names></name> <name><surname>Zheng</surname> <given-names>H.</given-names></name> <name><surname>Wang</surname> <given-names>C.</given-names></name></person-group> (<year>2024</year>). <article-title>Optimizing large language models with an enhanced lora fine-tuning algorithm for efficiency and robustness in nlp tasks</article-title>. <source>arXiv</source> [preprint] arXiv:2412.18729. doi: <pub-id pub-id-type="doi">10.1109/ICCTIT64404.2024.10928552</pub-id></mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="web"><collab>HuggingFace</collab> (<year>2025a</year>). <source>bitsandbytes</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://huggingface.co/docs/bitsandbytes/main/en/index">https://huggingface.co/docs/bitsandbytes/main/en/index</ext-link> (Accessed October 10, 2025).</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="web"><collab>HuggingFace</collab> (<year>2025b</year>). <source>PEFT (Parameter-Efficient Fine-Tuning)</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://huggingface.co/docs/peft/en/index">https://huggingface.co/docs/peft/en/index</ext-link> (Accessed October 10, 2025).</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lin</surname> <given-names>Y.</given-names></name> <name><surname>Ma</surname> <given-names>X.</given-names></name> <name><surname>Chu</surname> <given-names>X.</given-names></name> <name><surname>Jin</surname> <given-names>Y.</given-names></name> <name><surname>Yang</surname> <given-names>Z.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <etal/></person-group>. (<year>2024</year>). <article-title>Lora dropout as a sparsity regularizer for overfitting control</article-title>. <source>arXiv</source> [preprint] arXiv:2404.09610. doi: <pub-id pub-id-type="doi">10.1016/j.knosys.2025.114241</pub-id></mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Nori</surname> <given-names>H.</given-names></name> <name><surname>King</surname> <given-names>N.</given-names></name> <name><surname>McKinney</surname> <given-names>S. M.</given-names></name> <name><surname>Carignan</surname> <given-names>D.</given-names></name> <name><surname>Horvitz</surname> <given-names>E.</given-names></name></person-group> (<year>2023</year>). <article-title>Capabilities of GPT-4 on medical challenge problems</article-title>. <source>arXiv</source> [preprint] arXiv:2303.13375. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id></mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Oliver</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>G.</given-names></name></person-group> (<year>2024</year>). <article-title>Crafting efficient fine-tuning strategies for large language models</article-title>. <source>arXiv</source> [preprint] arXiv:2407.13906. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2407.13906</pub-id></mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="web"><collab>PyTorchFoundation</collab> (<year>2025</year>). <source>Pytorch</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://pytorch.org/">https://pytorch.org/</ext-link> (Accessed October 10, 2025).</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="web"><person-group person-group-type="author"><name><surname>Anthony</surname> <given-names>Q.</given-names></name> <name><surname>Biderman</surname> <given-names>S.</given-names></name> <name><surname>Schoelkopf</surname> <given-names>H.</given-names></name></person-group> (<year>2023</year>). <source>Transformer Math 101</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://blog.eleuther.ai/transformer-math/">https://blog.eleuther.ai/transformer-math/</ext-link> (Accessed October 10, 2025).</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Raffel</surname> <given-names>C.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Roberts</surname> <given-names>A.</given-names></name> <name><surname>Lee</surname> <given-names>K.</given-names></name> <name><surname>Narang</surname> <given-names>S.</given-names></name> <name><surname>Matena</surname> <given-names>M.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Exploring the limits of transfer learning with a unified text-to-text transformer</article-title>. <source>arXiv</source> [preprint] arXiv:1910.10683. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1910.10683</pub-id></mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rajabzadeh</surname> <given-names>A.</given-names></name> <name><surname>Zhao</surname> <given-names>M.</given-names></name> <name><surname>Patel</surname> <given-names>K.</given-names></name></person-group> (<year>2024</year>). <article-title>&#x0201C;Repeatability of fine-tuning large language models illustrated using QLoRA,&#x0201D;</article-title> in <source>Proceedings of the IEEE International Conference on Machine Learning and Applications</source> (<publisher-loc>Los Alamitos, CA</publisher-loc>: <publisher-name>IEEE Computer Society</publisher-name>), <fpage>455</fpage>&#x02013;<lpage>460</lpage>.</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Rajbhandari</surname> <given-names>S.</given-names></name> <name><surname>Rasley</surname> <given-names>J.</given-names></name> <name><surname>Ruwase</surname> <given-names>O.</given-names></name> <name><surname>He</surname> <given-names>Y.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;ZeRO: memory optimizations toward training trillion parameter models,&#x0201D;</article-title> in <source>Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis, SC &#x00027;20</source> (<publisher-loc>Atlanta, GA</publisher-loc>: <publisher-name>IEEE Press</publisher-name>).</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="web"><collab>Rancher</collab> (<year>2024</year>). <source>Rancher Community Edition Kubernetes Environment</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://github.com/rancher/rancher">https://github.com/rancher/rancher</ext-link> (Accessed October 10, 2025).</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Scholak</surname> <given-names>T.</given-names></name> <name><surname>Schucher</surname> <given-names>N.</given-names></name> <name><surname>Bahdanau</surname> <given-names>D.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;PICARD: Parsing incrementally for constrained auto-regressive decoding from language models,&#x0201D;</article-title> in <source>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</source>, eds. <person-group person-group-type="editor"><name><surname>Moens</surname> <given-names>M. F.</given-names></name> <name><surname>Huang</surname> <given-names>X.</given-names></name> <name><surname>Specia</surname> <given-names>L.</given-names></name> and <name><surname>Yih</surname> <given-names>S. W.</given-names></name></person-group> (<publisher-loc>Punta Cana</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>9895</fpage>&#x02013;<lpage>9901</lpage>.</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Srivastava</surname> <given-names>N.</given-names></name> <name><surname>Hinton</surname> <given-names>G.</given-names></name> <name><surname>Krizhevsky</surname> <given-names>A.</given-names></name> <name><surname>Sutskever</surname> <given-names>I.</given-names></name> <name><surname>Salakhutdinov</surname> <given-names>R.</given-names></name></person-group> (<year>2014</year>). <article-title>Dropout: A simple way to prevent neural networks from overfitting</article-title>. <source>J. Mach. Learn. Res</source>. <volume>15</volume>, <fpage>1929</fpage>&#x02013;<lpage>1958</lpage>. doi: <pub-id pub-id-type="doi">10.5555/2627435.2670313</pub-id></mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="web"><collab>Mistral AI Team</collab> (<year>2024</year>). <source>Cheaper, Better, Faster, Stronger</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://mistral.ai/news/mixtral-8x22b/">https://mistral.ai/news/mixtral-8x22b/</ext-link> (Accessed October 10, 2025).</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="web"><collab>The Ray Team</collab> (<year>2023</year>). <source>Get Started with Distributed Training Using Hugging Face Accelerate</source>. Available online at: <ext-link ext-link-type="uri" xlink:href="https://docs.ray.io/en/latest/train/huggingface-accelerate.html">https://docs.ray.io/en/latest/train/huggingface-accelerate.html</ext-link> (Accessed October 10, 2025).</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Tinn</surname> <given-names>R.</given-names></name> <name><surname>Cheng</surname> <given-names>H.</given-names></name> <name><surname>Gu</surname> <given-names>Y.</given-names></name> <name><surname>Usuyama</surname> <given-names>N.</given-names></name> <name><surname>Liu</surname> <given-names>X.</given-names></name> <name><surname>Naumann</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Fine-tuning large neural language models for biomedical natural language processing</article-title>. <source>Patterns</source> <volume>4</volume>:<fpage>100729</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.patter.2023.100729</pub-id><pub-id pub-id-type="pmid">37123444</pub-id></mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname> <given-names>A.</given-names></name> <name><surname>Shazeer</surname> <given-names>N.</given-names></name> <name><surname>Parmar</surname> <given-names>N.</given-names></name> <name><surname>Uszkoreit</surname> <given-names>J.</given-names></name> <name><surname>Jones</surname> <given-names>L.</given-names></name> <name><surname>Gomez</surname> <given-names>A. N.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Attention is all you need</article-title>. <source>arXiv</source> [preprint] arXiv:1706.03762. doi: <pub-id pub-id-type="doi">10.48550/arXiv.1706.03762</pub-id></mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>J.</given-names></name> <name><surname>Bosma</surname> <given-names>M.</given-names></name> <name><surname>Zhao</surname> <given-names>V.</given-names></name> <name><surname>Guu</surname> <given-names>K.</given-names></name> <name><surname>Yu</surname> <given-names>A. W.</given-names></name> <name><surname>Lester</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2022</year>). <article-title>&#x0201C;Finetuned language models are zero-shot learners,&#x0201D;</article-title> in <source>International Conference on Learning Representations</source> (<publisher-loc>Red Hook, NY</publisher-loc>).</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>X.</given-names></name> <name><surname>Schuurmans</surname> <given-names>D.</given-names></name> <name><surname>Bosma</surname> <given-names>M.</given-names></name> <name><surname>Ichter</surname> <given-names>B.</given-names></name> <name><surname>Xia</surname> <given-names>F.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title>. <source>arXiv</source> [preprint] arXiv:2201.11903. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2201.11903</pub-id></mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>T.</given-names></name> <name><surname>Zhang</surname> <given-names>R.</given-names></name> <name><surname>Yang</surname> <given-names>K.</given-names></name> <name><surname>Yasunaga</surname> <given-names>M.</given-names></name> <name><surname>Wang</surname> <given-names>D.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2018</year>). <article-title>&#x0201C;Spider: a large-scale human-labeled dataset for complex and cross-domain semantic parsing and text-to-SQL task,&#x0201D;</article-title> in <source>Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing</source>, eds. <person-group person-group-type="editor"><name><surname>Riloff</surname> <given-names>E.</given-names></name> <name><surname>Chiang</surname> <given-names>D.</given-names></name> <name><surname>Hockenmaier</surname> <given-names>J.</given-names></name> and <name><surname>Tsujii</surname> <given-names>J.</given-names></name></person-group> (<publisher-loc>Brussels</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>3911</fpage>&#x02013;<lpage>3921</lpage>.</mixed-citation>
</ref>
<ref id="B32">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Li</surname> <given-names>Y.</given-names></name> <name><surname>Cui</surname> <given-names>L.</given-names></name> <name><surname>Cai</surname> <given-names>D.</given-names></name> <name><surname>Liu</surname> <given-names>L.</given-names></name> <name><surname>Fu</surname> <given-names>T.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Siren&#x00027;s song in the ai ocean: a survey on hallucination in large language models</article-title>. <source>arXiv</source> [preprint] arXiv:2309.01219. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2309.01219</pub-id></mixed-citation>
</ref>
<ref id="B33">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhong</surname> <given-names>V.</given-names></name> <name><surname>Xiong</surname> <given-names>C.</given-names></name> <name><surname>Socher</surname> <given-names>R.</given-names></name></person-group> (<year>2017</year>). <article-title>Seq2SQL: generating structured queries from natural language using reinforcement learning</article-title>. <source>arXiv</source> [preprint] arXiv:1709.00103.</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/354571/overview">Federico Cruciani</ext-link>, Ulster University, United Kingdom</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1135117/overview">&#x00160;ar&#x0016B;nas Grigali&#x0016B;nas</ext-link>, Kaunas University of Technology, Lithuania</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2705123/overview">Prasan Yapa</ext-link>, Kyoto University of Advanced Science, Japan</p>
</fn>
</fn-group>
</back>
</article> 
