<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" article-type="data-paper">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. High Perform. Comput.</journal-id>
<journal-title-group>
<journal-title>Frontiers in High Performance Computing</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. High Perform. Comput.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2813-7337</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fhpcp.2026.1771927</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Data Report</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>OpenMP-annotated code dataset for large language model fine-tuning on parallel programming tasks</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name><surname>Etienne</surname> <given-names>Nichole</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3323532"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Garcia de Gonzalo</surname> <given-names>Simon</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3391206"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Arnold</surname> <given-names>Dorian</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x00026; editing</role>
<uri xlink:href="https://loop.frontiersin.org/people/3386733"/>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>Department of Computer Science, Emory University</institution>, <city>Atlanta, GA</city>, <country country="US">United States</country></aff>
<aff id="aff2"><label>2</label><institution>Sandia National Laboratories</institution>, <city>Albuquerque, NM</city>, <country country="US">United States</country></aff>
<author-notes>
<corresp id="c001"><label>&#x0002A;</label>Correspondence: Nichole Etienne, <email xlink:href="mailto:nichole.etienne@emory.edu">nichole.etienne@emory.edu</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-23">
<day>23</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>4</volume>
<elocation-id>1771927</elocation-id>
<history>
<date date-type="received">
<day>19</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>13</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2026 Etienne, Garcia de Gonzalo and Arnold.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Etienne, Garcia de Gonzalo and Arnold</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-23">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<kwd-group>
<kwd>code generation</kwd>
<kwd>dataset</kwd>
<kwd>fine-tuning</kwd>
<kwd>high-performance computing</kwd>
<kwd>large language models</kwd>
<kwd>OpenMP</kwd>
<kwd>parallel programming</kwd>
<kwd>pragma completion</kwd>
</kwd-group>
<funding-group>
  <funding-statement>The author(s) declared that financial support was received for this work and/or its publication. Sandia National Laboratories is a multi-mission laboratory managed and operated by National Technology &#x00026; Engineering Solutions of Sandia, LLC (NTESS), a wholly owned subsidiary of Honeywell International Inc., for the U.S. Department of Energy&#x00027;s National Nuclear Security Administration (DOE/NNSA) under contract DE-NA0003525.</funding-statement>
</funding-group>
<counts>
<fig-count count="0"/>
<table-count count="2"/>
<equation-count count="0"/>
<ref-count count="10"/>
<page-count count="6"/>
<word-count count="2993"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Architecture and Systems</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>High-performance computing (HPC) plays a critical role in scientific discovery, engineering simulation, and data-intensive applications. Open Multi-Processing (OpenMP) is one of the most widely adopted shared-memory parallel programming interfaces, enabling developers to write multi-threaded applications in C, C&#x0002B;&#x0002B;, and Fortran. However, correctly implementing OpenMP directives requires significant expertise, as developers must understand parallel programming concepts, data dependencies, and performance optimization strategies.</p>
<p>Recent advances in large language models (LLMs) have demonstrated remarkable capabilities in code generation and completion tasks (<xref ref-type="bibr" rid="B6">Jiang et al., 2024</xref>; <xref ref-type="bibr" rid="B5">Huynh and Lin, 2025</xref>; <xref ref-type="bibr" rid="B10">Zan et al., 2023</xref>). These models, trained on vast corpora of source code, can assist developers by generating code snippets, completing partial implementations, and even translating natural language descriptions into executable programs. However, general-purpose code LLMs often struggle with domain-specific parallel programming constructs such as OpenMP pragmas. The scarcity of high-quality, task-specific training data for HPC code represents a significant barrier to developing effective AI assistants for parallel programming (<xref ref-type="bibr" rid="B9">Nichols et al., 2023</xref>). While existing code generation benchmarks focus primarily on sequential programming tasks in languages like Python and Java, there is limited availability of curated datasets specifically targeting parallel programming paradigms.</p>
<p>This data report presents a curated dataset specifically designed for fine-tuning LLMs on OpenMP pragma completion tasks. The dataset contains 77,890 source files comprising over 15 million lines of code extracted from 387 GitHub repositories. Each training sample is structured to teach models the relationship between code context, loop structures, and appropriate OpenMP directives. This dataset addresses a critical gap in available resources for training AI models on parallel programming tasks and provides a foundation for developing intelligent code completion tools for HPC developers.</p>
<p>The primary contributions of this dataset are: (1) a systematically collected corpus of real-world OpenMP code from active HPC projects, (2) structured annotations that isolate OpenMP pragmas and their associated loop contexts, and (3) comprehensive preprocessing and quality filtering to ensure dataset integrity. This resource enables researchers to develop and evaluate LLMs specifically tailored for parallel programming assistance.</p></sec>
<sec id="s2">
<label>2</label>
<title>Methods</title>
<sec>
<label>2.1</label>
<title>Data collection period and source selection</title>
<p>Data collection was conducted between June and July 2024 using the GitHub API. Source repositories were identified through a systematic query process targeting HPC-relevant codebases. The selection criteria were designed to ensure code quality and relevance to OpenMP development practices:</p>
<list list-type="bullet">
<list-item><p><bold>Primary languages:</bold> C and C&#x0002B;&#x0002B; (specified as repository primary language).</p></list-item>
<list-item><p><bold>Repository topics:</bold> HPC, OpenMP, parallel-computing, scientific-computing, high-performance-computing, computational-science, proxy-application, and mini-app.</p></list-item>
<list-item><p><bold>Minimum stars:</bold> &#x02265;3 [indicating community validation and active use (<xref ref-type="bibr" rid="B2">Borges and Valente, 2018</xref>)].</p></list-item>
<list-item><p><bold>Repository scope:</bold> publicly accessible repositories with permissive licenses.</p></list-item>
</list>
<p>The GitHub API query utilized a custom Python script that performed systematic searches combining repository topics and language filters (<xref ref-type="bibr" rid="B4">Gousios and Spinellis, 2012</xref>). Authentication was handled through GitHub personal access tokens to enable comprehensive repository access. The search process iterated through pagination (up to 34 pages per query) to exhaustively collect matching repositories, with rate-limiting safeguards to comply with API restrictions.</p>
<p>This filtering strategy yielded 387 unique repositories, balancing dataset size with code quality (<xref ref-type="bibr" rid="B3">Cosentino et al., 2016</xref>; <xref ref-type="bibr" rid="B8">Munaiah et al., 2017</xref>). The minimum star threshold ensured that repositories had achieved some level of community review (<xref ref-type="bibr" rid="B2">Borges and Valente, 2018</xref>), while the topic filters specifically targeted HPC applications where OpenMP usage reflects realistic parallel programming patterns.</p>
</sec>
<sec>
<label>2.2</label>
<title>Repository cloning and file extraction</title>
<p>Identified repositories were cloned locally using GitPython library to create local copies organized by repository full name (owner/repository structure). This organization preserves provenance information for each source file.</p>
<p>Source file extraction focused exclusively on C/C&#x0002B;&#x0002B; implementation and header files, identified by extensions: .c, .cc, .cpp, .cxx, .C, .h, .hh, .hpp, .H, .hxx, .Hxx, and .HXX (<xref ref-type="bibr" rid="B1">Allamanis and Sutton, 2013</xref>). Recursive directory traversal proved approximately 2&#x02013;3 &#x000D7; faster than glob-based approaches for large directory structures. Files containing invalid path characters (brackets) were excluded to prevent filesystem conflicts.</p>
<p>Initial extraction yielded 105,861 source files totaling 22,653,593 lines of code (0.71 GB).</p>
</sec>
<sec>
<label>2.3</label>
<title>Preprocessing and quality filtering</title>
<p>To ensure dataset quality and prevent training bias, a multi-stage preprocessing pipeline was implemented:</p>
<sec>
<label>2.3.1</label>
<title>Encoding validation</title>
<p>Files containing non-UTF-8 characters were removed by attempting to read each file with UTF-8 encoding. Files that could not be decoded were excluded to prevent tokenization issues during model training.</p></sec>
<sec>
<label>2.3.2</label>
<title>Size-based filtering</title>
<p>Two size constraints were applied:</p>
<list list-type="bullet">
<list-item><p><bold>Minimum token count:</bold> files with fewer than 15 tokens (whitespace-delimited) were excluded as they typically contained only boilerplate or comments.</p></list-item>
<list-item><p><bold>File size limits:</bold> files exceeding 1 MB were removed, as these typically represented embedded libraries, generated code, or raw data rather than human-authored source code.</p></list-item>
</list></sec>
<sec>
<label>2.3.3</label>
<title>Deduplication</title>
<p>Duplicate files are prevalent across GitHub repositories due to forking, vendored dependencies, and copied implementations (<xref ref-type="bibr" rid="B7">Markovtsev and Long, 2018</xref>). SHA-256 hashes of file contents were computed using a memory-efficient streaming approach. Files with identical content hashes were deduplicated, retaining only the first occurrence.</p>
<p>After preprocessing, the dataset comprised 77,890 unique source files with 15,367,210 lines of code (0.49 GB), representing a 26% reduction in file count from the initial collection, primarily due to deduplication.</p>
</sec>
</sec>
<sec>
<label>2.4</label>
<title>OpenMP pragma extraction and annotation</title>
<p>The core dataset transformation extracts individual OpenMP <monospace>parallel for</monospace> constructs and formats them for pragma completion tasks. An automated extraction pipeline implements this transformation:</p>
<sec>
<label>2.4.1</label>
<title>Pattern matching</title>
<p>A compiled regular expression (<monospace>&#x00023;pragma omp parallel for.*</monospace>) identified all OpenMP parallel for directives in each source file. The pattern uses multiline matching to handle pragmas spanning multiple lines.</p></sec>
<sec>
<label>2.4.2</label>
<title>Loop boundary detection</title>
<p>For each identified pragma, a bracket-matching algorithm extracted the complete associated loop structure:</p>
<list list-type="order">
<list-item><p>Locate the opening brace <monospace>{</monospace> following the pragma.</p></list-item>
<list-item><p>Traverse the code incrementing a bracket stack counter for each <monospace>{</monospace> and decrementing for each <monospace>}</monospace>.</p></list-item>
<list-item><p>Extract the complete loop body when the stack returns to zero.</p></list-item>
<list-item><p>Exclude pragmas where bracket matching failed (typically single-statement loops without braces).</p></list-item>
</list></sec>
<sec>
<label>2.4.3</label>
<title>Comment stripping</title>
<p>C/C&#x0002B;&#x0002B; style comments (<monospace>//</monospace> and <monospace>/* */</monospace>) were removed from pragma lines using pattern matching to isolate the actual directive syntax. This prevents the model from learning spurious comment patterns.</p></sec>
<sec>
<label>2.4.4</label>
<title>Context window extraction</title>
<p>Each sample includes a configurable amount of preceding code context, enabling the model to learn contextual patterns such as variable declarations, array definitions, and computational patterns that inform appropriate pragma selection.</p></sec>
<sec>
<label>2.4.5</label>
<title>Structural annotation</title>
<p>Extracted samples are annotated with special tokens to clearly delineate components:</p>
<list list-type="bullet">
<list-item><p><monospace>&#x0003C;LOOP-START&#x0003E;</monospace> and <monospace>&#x0003C;LOOP-END&#x0003E;</monospace>: mark loop boundaries.</p></list-item>
<list-item><p><monospace>&#x0003C;OMP-START&#x0003E;</monospace> and <monospace>&#x0003C;OMP-END&#x0003E;</monospace>: mark pragma boundaries.</p></list-item>
</list>
<p>This tokenization strategy enables the model to distinguish between code to be generated (pragma) and conditioning context (loop body).</p></sec>
<sec>
<label>2.4.6</label>
<title>Directive scope</title>
<p>This dataset focuses on extracting <monospace>&#x00023;pragma omp parallel for</monospace> directives and their variants [e.g., <monospace>parallel for simd</monospace>, <monospace>parallel for reduction(</monospace><monospace>&#x02026;)</monospace>] as prediction targets. The extraction captures various clauses including scheduling policies [<monospace>schedule(static)</monospace>, <monospace>schedule(dynamic)</monospace>, and <monospace>schedule(guided)</monospace>], data-sharing (<monospace>private</monospace>, <monospace>shared</monospace>, <monospace>reduction</monospace>, <monospace>firstprivate</monospace>, and <monospace>lastprivate</monospace>), synchronization (<monospace>nowait</monospace>), loop transformations (<monospace>collapse</monospace>), thread control (<monospace>num_threads</monospace>), and ordering (<monospace>ordered</monospace>). Other OpenMP directive types such as <monospace>target</monospace> (for accelerator offloading), <monospace>task</monospace>/<monospace>taskloop</monospace> (for task-based parallelism), and the OpenMP 5.0&#x0002B; <monospace>loop</monospace> directive are not systematically extracted as prediction targets, though they may appear in surrounding code context.</p>
</sec>
</sec>
<sec>
<label>2.5</label>
<title>Dataset structure and format</title>
<p>The final dataset is stored in JSON Lines format (.jsonl), with each line representing a single training sample. Each sample contains:</p>
<list list-type="bullet">
<list-item><p><bold>Source file path:</bold> full path to the original source file (enables provenance tracking).</p></list-item>
<list-item><p><bold>Pragma directive:</bold> the extracted OpenMP pragma directive.</p></list-item>
<list-item><p><bold>Context length:</bold> number of context characters included before the loop.</p></list-item>
<list-item><p><bold>Annotated sample:</bold> the complete annotated sample (context &#x0002B; annotated loop &#x0002B; annotated pragma).</p></list-item>
</list>
<p>This format enables efficient streaming during training and preserves metadata for subsequent analysis or dataset versioning.</p>
</sec>
<sec>
<label>2.6</label>
<title>Intended use and model training protocol</title>
<p>This dataset is specifically designed for fine-tuning causal language models on OpenMP pragma completion tasks. The recommended training protocol involves:</p>
<list list-type="order">
<list-item><p><bold>Tokenization:</bold> process the annotated sample field using the target model&#x00027;s tokenizer with padding and truncation (recommended maximum length: 512 tokens).</p></list-item>
<list-item><p><bold>Label preparation:</bold> use standard causal language modeling where input sequences serve as both inputs and labels.</p></list-item>
<list-item><p><bold>Training objective:</bold> cross-entropy loss on next-token prediction.</p></list-item>
<list-item><p><bold>Evaluation task:</bold> given context and loop structure, generate the appropriate OpenMP pragma.</p></list-item>
</list>
<p>The dataset enables models to learn:</p>
<list list-type="bullet">
<list-item><p>Relationships between computational patterns and parallelization strategies.</p></list-item>
<list-item><p>Appropriate scheduling policies (static, dynamic, and guided).</p></list-item>
<list-item><p>Data sharing clauses (private, shared, and reduction).</p></list-item>
<list-item><p>Loop dependency analysis informing pragma selection.</p></list-item>
</list></sec></sec>
<sec id="s3">
<label>3</label>
<title>Dataset analysis and characteristics</title>
<sec>
<label>3.1</label>
<title>Corpus statistics</title>
<p>The final OpenMP dataset exhibits the following characteristics (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>OpenMP dataset statistics.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Metric</bold></th>
<th valign="top" align="center"><bold>Value</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Total repositories</td>
<td valign="top" align="center">387</td>
</tr>
<tr>
<td valign="top" align="left">Unique source files</td>
<td valign="top" align="center">77,890</td>
</tr>
<tr>
<td valign="top" align="left">Lines of code</td>
<td valign="top" align="center">15,367,210</td>
</tr>
<tr>
<td valign="top" align="left">Total size</td>
<td valign="top" align="center">0.49 GB</td>
</tr></tbody>
</table>
</table-wrap>
<p>The preprocessing pipeline reduced the initial collection by approximately 26%, primarily through deduplication (17.0% reduction) and size filtering (additional 11.3% reduction of the deduplicated set) as shown in <xref ref-type="table" rid="T2">Table 2</xref>.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Dataset properties after each preprocessing step.</p></caption>
<table frame="box" rules="all">
<thead>
<tr>
<th valign="top" align="left"><bold>Step</bold></th>
<th valign="top" align="center"><bold>Files</bold></th>
<th valign="top" align="center"><bold>LOC</bold></th>
<th valign="top" align="center"><bold>Size</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Initial collection</td>
<td valign="top" align="center">105,861</td>
<td valign="top" align="center">22,653,593</td>
<td valign="top" align="center">0.71 GB</td>
</tr>
<tr>
<td valign="top" align="left">After deduplication</td>
<td valign="top" align="center">87,805</td>
<td valign="top" align="center">18,553,891</td>
<td valign="top" align="center">0.58 GB</td>
</tr>
<tr>
<td valign="top" align="left">After filtering</td>
<td valign="top" align="center">77,890</td>
<td valign="top" align="center">15,367,210</td>
<td valign="top" align="center">0.49 GB</td>
</tr></tbody>
</table>
</table-wrap>
</sec>
<sec>
<label>3.2</label>
<title>File type distribution</title>
<p>The dataset spans multiple C/C&#x0002B;&#x0002B; file types, with the distribution of lines of code revealing the predominance of implementation files:</p>
<list list-type="bullet">
<list-item><p><bold>Implementation files (.c, .cpp, .cc, .cxx, .C):</bold> majority of LOC, containing actual OpenMP-annotated compute kernels.</p></list-item>
<list-item><p><bold>Header files (.h, .hpp, .hh, .H):</bold> lower LOC count, typically containing template implementations and inline functions.</p></list-item>
</list>
<p>This distribution reflects typical C/C&#x0002B;&#x0002B; project structure, where OpenMP pragmas appear predominantly in implementation files containing computational loops.</p>
</sec>
<sec>
<label>3.3</label>
<title>OpenMP pragma patterns</title>
<p>The dataset captures diverse OpenMP usage patterns from real-world HPC applications:</p>
<list list-type="bullet">
<list-item><p><bold>Scheduling policies:</bold> examples include static, dynamic, and guided scheduling with various chunk sizes.</p></list-item>
<list-item><p><bold>Data sharing clauses:</bold> private variables, shared arrays, and reduction operations [e.g., <monospace>reduction(&#x0002B;:s)</monospace>].</p></list-item>
<list-item><p><bold>Nested parallelism:</bold> some samples include nested parallel regions (though single-level pragmas dominate).</p></list-item>
<list-item><p><bold>Loop variants:</bold> both increment and decrement loops, varying iteration patterns.</p></list-item>
</list>
<p>The pragma <monospace>&#x00023;pragma omp parallel for schedule(static)</monospace> appears frequently, representing a common parallelization pattern for regular, independent loop iterations. More complex pragmas with reduction clauses reflect numerical computation patterns (e.g., dot products, norm calculations).</p>
</sec>
<sec>
<label>3.4</label>
<title>Application domains</title>
<p>The source repositories span diverse HPC application domains:</p>
<list list-type="bullet">
<list-item><p><bold>Scientific computing:</bold> numerical methods, linear algebra operations (BLAS-level implementations).</p></list-item>
<list-item><p><bold>Geospatial analysis:</bold> GIS applications with parallel raster processing.</p></list-item>
<list-item><p><bold>Proxy applications:</bold> miniapps designed to represent computational kernels from large-scale simulations.</p></list-item>
<list-item><p><bold>Computational libraries:</bold> reusable parallel algorithm implementations.</p></list-item>
</list>
<p>This domain diversity ensures the dataset captures varied computational patterns and parallelization strategies, rather than overfitting to specific application characteristics.</p>
</sec>
<sec>
<label>3.5</label>
<title>Data quality considerations</title>
<p>Several quality features distinguish this dataset:</p>
<list list-type="order">
<list-item><p><bold>Real-world code:</bold> extracted from actively maintained repositories rather than synthetic examples.</p></list-item>
<list-item><p><bold>Community validation:</bold> minimum star threshold indicates code review and usage.</p></list-item>
<list-item><p><bold>Deduplication:</bold> SHA-256 hashing prevents training on redundant patterns.</p></list-item>
<list-item><p><bold>Size filtering:</bold> eliminates both trivial files and unwieldy library dumps.</p></list-item>
<list-item><p><bold>Encoding validation:</bold> UTF-8 compliance ensures tokenization compatibility.</p></list-item>
</list>
</sec>
<sec>
<label>3.6</label>
<title>Limitations and biases</title>
<p>Users should consider the following characteristics when interpreting results:</p>
<list list-type="bullet">
<list-item><p><bold>Language scope:</bold> limited to C/C&#x0002B;&#x0002B;; Fortran OpenMP code is not represented.</p></list-item>
<list-item><p><bold>Directive scope:</bold> this dataset is designed for loop-level shared-memory parallelization with <monospace>parallel for</monospace> pragmas. The following are not included as prediction targets:</p></list-item>
</list>
<list list-type="simple">
<list-item><p>- <monospace>target</monospace> directives for GPU/accelerator offloading.</p></list-item>
<list-item><p>- <monospace>task</monospace>/<monospace>taskloop</monospace> constructs for task-based parallelism.</p></list-item>
<list-item><p>- Standalone <monospace>simd</monospace> directives for explicit vectorization.</p></list-item>
<list-item><p>- <monospace>sections</monospace> for functional parallelism.</p></list-item>
<list-item><p>- OpenMP 5.0&#x0002B; features such as the <monospace>loop</monospace> directive.</p></list-item>
</list>
<list list-type="simple">
<list-item><p>While these constructs may appear in surrounding code context, they are not prediction targets. Models trained on this dataset may not generalize to other parallelization patterns without additional training data.</p></list-item>
</list>
<list list-type="bullet">
<list-item><p><bold>GitHub bias:</bold> repository availability and topic tagging may not represent all HPC development practices.</p></list-item>
<list-item><p><bold>Temporal snapshot:</bold> collected in June&#x02013;July 2024; newer OpenMP features may be underrepresented.</p></list-item>
<list-item><p><bold>Single-statement loops excluded:</bold> bracket-matching algorithm currently skips loops without explicit braces.</p></list-item>
</list>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Data reuse and interpretation guidelines</title>
<sec>
<label>4.1</label>
<title>Recommended use cases</title>
<p>This dataset is optimized for:</p>
<list list-type="order">
<list-item><p>Fine-tuning code generation models on OpenMP pragma completion and translation.</p></list-item>
<list-item><p>Developing intelligent code assistants for parallel programming.</p></list-item>
<list-item><p>Analyzing OpenMP usage patterns across HPC codebases.</p></list-item>
<list-item><p>Benchmarking model performance on domain-specific code tasks.</p></list-item>
<list-item><p>Training retrieval systems for parallel programming documentation.</p></list-item>
</list>
</sec>
<sec>
<label>4.2</label>
<title>Data access and format</title>
<p>The dataset is distributed in JSON Lines format, enabling:</p>
<list list-type="bullet">
<list-item><p><bold>Streaming processing</bold> for memory-constrained environments.</p></list-item>
<list-item><p><bold>Parallel loading</bold> via line-based sharding.</p></list-item>
<list-item><p><bold>Easy filtering</bold> based on metadata fields (source file path, pragma type, and context length).</p></list-item>
</list>
</sec>
<sec>
<label>4.3</label>
<title>Preprocessing recommendations</title>
<p>Researchers reusing this data should consider:</p>
<list list-type="bullet">
<list-item><p><bold>Tokenizer compatibility:</bold> ensure the tokenizer handles special annotation tokens (<monospace>&#x0003C;LOOP-START&#x0003E;</monospace>, etc.) appropriately.</p></list-item>
<list-item><p><bold>Context window sizing:</bold> the context length metadata enables experimentation with varying context window sizes.</p></list-item>
<list-item><p><bold>Train/validation splitting:</bold> implement file-based or repository-based splitting to prevent data leakage.</p></list-item>
<list-item><p><bold>Pragma diversity:</bold> consider stratified sampling if balancing representation of different pragma types.</p></list-item>
</list>
</sec>
<sec>
<label>4.4</label>
<title>Dataset updates and versioning</title>
<p>As OpenMP standards evolve and new HPC repositories emerge, this dataset may be extended to address current scope limitations. Planned extensions include expanding directive coverage to systematically extract <monospace>target</monospace> directives for GPU/accelerator offloading, <monospace>task</monospace>/<monospace>taskloop</monospace> constructs for task-based parallelism, standalone <monospace>simd</monospace> directives for explicit vectorization, and OpenMP 5.0&#x0002B; features such as the <monospace>loop</monospace> directive as prediction targets. Future work may also explore multi-task learning approaches that enable directive-type selection in addition to clause generation.</p>
<p>Any dataset updates should be:</p>
<list list-type="bullet">
<list-item><p>Deposited as independent versions in the repository.</p></list-item>
<list-item><p>Documented with collection date ranges and repository counts.</p></list-item>
<list-item><p>Published as Addendum articles linking to this initial Data Report.</p></list-item>
</list></sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>The OpenMP-Annotated Code Dataset described in this article is publicly available at: <ext-link ext-link-type="uri" xlink:href="https://huggingface.co/datasets/LLMforParallelCode1/OMP-FT-Source">https://huggingface.co/datasets/LLMforParallelCode1/OMP-FT-Source</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s6">
<title>Author contributions</title>
<p>NE: Conceptualization, Data curation, Formal analysis, Methodology, Writing &#x02013; original draft, Writing &#x02013; review &#x00026; editing. SG: Conceptualization, Investigation, Supervision, Writing &#x02013; review &#x00026; editing. DA: Supervision, Writing &#x02013; review &#x00026; editing.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s8">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Allamanis</surname> <given-names>M.</given-names></name> <name><surname>Sutton</surname> <given-names>C.</given-names></name></person-group> (<year>2013</year>). <article-title>&#x0201C;Mining source code repositories at massive scale using language modeling,&#x0201D;</article-title> in <source>Proceedings of the 10th IEEE Working Conference on Mining Software Repositories (MSR)</source> (<publisher-loc>New York City, NY</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>207</fpage>&#x02013;<lpage>216</lpage>.</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Borges</surname> <given-names>H.</given-names></name> <name><surname>Valente</surname> <given-names>M. T.</given-names></name></person-group> (<year>2018</year>). <article-title>What&#x00027;s in a github star? Understanding repository starring practices in a social coding platform</article-title>. <source>J. Syst. Softw.</source> <volume>146</volume>, <fpage>112</fpage>&#x02013;<lpage>129</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.jss.2018.09.016</pub-id></mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Cosentino</surname> <given-names>V.</given-names></name> <name><surname>Luis</surname> <given-names>J.</given-names></name> <name><surname>Cabot</surname> <given-names>J.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Findings from github: methods, datasets and limitations,&#x0201D;</article-title> in <source>Proceedings of the 13th International Conference on Mining Software Repositories (MSR)</source> (<publisher-loc>Austin, TX</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>137</fpage>&#x02013;<lpage>141</lpage>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Gousios</surname> <given-names>G.</given-names></name> <name><surname>Spinellis</surname> <given-names>D.</given-names></name></person-group> (<year>2012</year>). <article-title>&#x0201C;Ghtorrent: Github&#x00027;s data from a firehose,&#x0201D;</article-title> in <source>Proceedings of the 9th IEEE Working Conference on Mining Software Repositories (MSR)</source> (<publisher-loc>New York City, NY</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>12</fpage>&#x02013;<lpage>21</lpage>.</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Huynh</surname> <given-names>N.</given-names></name> <name><surname>Lin</surname> <given-names>B.</given-names></name></person-group> (<year>2025</year>). <article-title>Large language models for code generation: a comprehensive survey of challenges, techniques, evaluation, and applications</article-title>. <source>arXiv preprint arXiv:2503.01245</source>.</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Jiang</surname> <given-names>J.</given-names></name> <name><surname>Wang</surname> <given-names>F.</given-names></name> <name><surname>Shen</surname> <given-names>J.</given-names></name> <name><surname>Kim</surname> <given-names>S.</given-names></name> <name><surname>Kim</surname> <given-names>S.</given-names></name></person-group> (<year>2024</year>). <article-title>A survey on large language models for code generation. ACM Transactions on Software Engineering and Methodology</article-title>. <source>arXiv:2406.00515</source>.</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Markovtsev</surname> <given-names>V.</given-names></name> <name><surname>Long</surname> <given-names>W.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Public git archive: a big code dataset for all,&#x0201D;</article-title> in <source>Proceedings of the 15th International Conference on Mining Software Repositories (MSR)</source> (<publisher-loc>Gothenburg</publisher-loc>: <publisher-name>ACM</publisher-name>), <fpage>34</fpage>&#x02013;<lpage>37</lpage>.</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Munaiah</surname> <given-names>N.</given-names></name> <name><surname>Kroh</surname> <given-names>S.</given-names></name> <name><surname>Cabrey</surname> <given-names>C.</given-names></name> <name><surname>Nagappan</surname> <given-names>M.</given-names></name></person-group> (<year>2017</year>). <article-title>Curating github for engineered software projects</article-title>. <source>Empir. Softw. Eng.</source> <volume>22</volume>, <fpage>3219</fpage>&#x02013;<lpage>3253</lpage>.</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Nichols</surname> <given-names>D.</given-names></name> <name><surname>Marathe</surname> <given-names>A.</given-names></name> <name><surname>Menon</surname> <given-names>H.</given-names></name> <name><surname>Gamblin</surname> <given-names>T.</given-names></name> <name><surname>Bhatele</surname> <given-names>A.</given-names></name></person-group> (<year>2023</year>). <article-title>&#x0201C;Hpc-coder: modeling parallel programs using large language models,&#x0201D;</article-title> in <source>Proceedings of the 39th International Conference on ISC High Performance 2024 Research Paper</source> (<publisher-loc>Hamburg</publisher-loc>), <fpage>1</fpage>&#x02013;<lpage>12</lpage>.</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book"><person-group person-group-type="author"><name><surname>Zan</surname> <given-names>D.</given-names></name> <name><surname>Chen</surname> <given-names>B.</given-names></name> <name><surname>Zhang</surname> <given-names>F.</given-names></name> <name><surname>Lu</surname> <given-names>D.</given-names></name> <name><surname>Wu</surname> <given-names>B.</given-names></name> <name><surname>Guan</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>&#x0201C;Large language models meet nl2code: a survey,&#x0201D;</article-title> in <source>Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics, Vol. 1: Long Papers</source> (<publisher-loc>Toronto</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>7443</fpage>&#x02013;<lpage>7464</lpage>.</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by" id="fn0001">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2853890/overview">Richard Gerber</ext-link>, Berkeley Lab (DOE), United States</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by" id="fn0002">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3235542/overview">Akash Dutta</ext-link>, Advanced Micro Devices, United States</p>
</fn>
</fn-group>
</back>
</article>