<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Earth Sci.</journal-id>
<journal-title>Frontiers in Earth Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Earth Sci.</abbrev-journal-title>
<issn pub-type="epub">2296-6463</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1530004</article-id>
<article-id pub-id-type="doi">10.3389/feart.2025.1530004</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Earth Science</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Assessing named entity recognition by using geoscience domain schemas: the case of mineral systems</article-title>
<alt-title alt-title-type="left-running-head">Villacorta Chambi et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/feart.2025.1530004">10.3389/feart.2025.1530004</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Villacorta Chambi</surname>
<given-names>Sandra Paula</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/555657/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/investigation/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Lindsay</surname>
<given-names>Mark</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2763614/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/resources/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Klump</surname>
<given-names>Jens</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2694886/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gessner</surname>
<given-names>Klaus</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Gray</surname>
<given-names>Erin</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>McFarlane</surname>
<given-names>Helen</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>IAEG Peruvian Group</institution>, <addr-line>Lima</addr-line>, <country>Peru</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>CSIRO, Mineral Resources</institution>, <addr-line>Kensington</addr-line>, <addr-line>WA</addr-line>, <country>Australia</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>The School of Earth Sciences</institution>, <institution>The University of Western Australia</institution>, <addr-line>Crawley</addr-line>, <addr-line>WA</addr-line>, <country>Australia</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>ARC centre for Data Analytics for Resources and Environments (DARE)</institution>, <addr-line>Perth</addr-line>, <addr-line>WA</addr-line>, <country>Australia</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Department of Energy, Mines, Industry Regulation and Safety</institution>, <institution>Geological Survey of Western Australia</institution>, <addr-line>East Perth</addr-line>, <addr-line>WA</addr-line>, <country>Australia</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1983588/overview">Fan Xiao</ext-link>, Sun Yat-sen University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2055378/overview">Antony Mamuse</ext-link>, Midlands State University, Zimbabwe</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2980951/overview">Feng Han</ext-link>, Guangxi Minzu University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Sandra Paula Villacorta Chambi, <email>villacortasp@gmail.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>06</day>
<month>05</month>
<year>2025</year>
</pub-date>
<pub-date pub-type="collection">
<year>2025</year>
</pub-date>
<volume>13</volume>
<elocation-id>1530004</elocation-id>
<history>
<date date-type="received">
<day>18</day>
<month>11</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>31</day>
<month>03</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2025 Villacorta Chambi, Lindsay, Klump, Gessner, Gray and McFarlane.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Villacorta Chambi, Lindsay, Klump, Gessner, Gray and McFarlane</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Named Entity Recognition (NER) is crucial for accurately extracting and classifying specialized domain terms from textual data. This study introduces the Schema for Mineral Systems (SMS), designed through domain characterization, word disambiguation, taxonomy development, and expert input to refine NER approaches in geosciences. SMS, featuring nine geological and five general entity classes, enhances the precision of term identification in mineral system texts. Utilizing domain-specific dictionaries and schema-linked annotations, the schema facilitates the distinct recognition of unique terms, underscored by iterative expert validation to refine NER accuracy. Applied to iron and lithium deposit corpora in Western Australia, SMS highlights the challenges and effectiveness of context-specific schemas in specialized knowledge extraction and accurate entity recognition within complex domains.</p>
</abstract>
<kwd-group>
<kwd>knowledge management</kwd>
<kwd>NLP</kwd>
<kwd>NER</kwd>
<kwd>geological terminology</kwd>
<kwd>ontologies</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Geoinformatics</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Named Entity Recognition (NER) has evolved significantly since <xref ref-type="bibr" rid="B48">Grishman and Sundheim&#x2019;s (1996)</xref> foundational work, where NER was defined as &#x201c;the task of identifying and classifying proper names in text into predefined categories,&#x201d; such as Persons, Organizations, Locations, Dates, and Times. NER&#x2019;s importance has been recognized in various fields, including geosciences, where it helps in the automatic extraction and classification of geological terminology from unstructured scholarly literature and reports. A critical component of developing NER systems is the use of text corpora&#x2014;large, structured sets of texts used for linguistic analysis and model training. Corpora provide empirical data essential for refining lexical and grammatical theories, as well as supporting the development of Natural Language Processing (NLP) models (<xref ref-type="bibr" rid="B49">Biber et al., 1998</xref>).</p>
<p>In the context of geosciences, the adoption of Information Extraction (IE) technologies has played a vital role in automating knowledge discovery and reducing the need for manual intervention. <xref ref-type="bibr" rid="B2">Angeli et al. (2015)</xref> emphasize that IE technologies have been instrumental in extracting valuable information from large datasets, which is particularly relevant for managing the vast amount of unstructured data in the geoscience domain. With the advent of advanced Machine Learning (ML) and NLP techniques, especially with models like BERT (<xref ref-type="bibr" rid="B9">Devlin et al., 2018</xref>) and GPT-3 (<xref ref-type="bibr" rid="B7">Brown, 2020</xref>), NER tasks have enhanced capabilities in capturing language context and semantics. Building on these technological advancements, it is recognized that specific challenges exist in geological text processing, such as ambiguity and variability in the analysed text (<xref ref-type="bibr" rid="B17">Huber and Klump, 2015</xref>; <xref ref-type="bibr" rid="B37">Qiu et al., 2019</xref>). Ambiguity can arise from poorly written text or a lack of sufficient context, making it difficult for NER models to capture meaning successfully. Variability relates to subtle but important differences in the meaning of terms between domains (for example, &#x2018;formation&#x2019; or &#x2018;basin&#x2019;). These authors recognise the need to develop flexible and scalable models tailored to the unique characteristics of the geoscience domain. To address these challenges, the use of structured frameworks such as ontologies has emerged as a crucial strategy for establishing and defining relationships among concepts within a specific domain. 
Ontologies like OntoGeonous (<xref ref-type="bibr" rid="B24">Lombardo et al., 2018</xref>), GeoCore (<xref ref-type="bibr" rid="B11">Garcia, 2020</xref>) and the GeoScience Ontology (GSO) (<xref ref-type="bibr" rid="B50">Brodaric and Richard, 2020</xref>) illustrate how these frameworks enable systematic NER and contextual understanding in the field of geosciences. OntoGeonous integrated semantic technologies for geologic mapping across various geological concepts (<xref ref-type="bibr" rid="B27">Mantovani et al., 2020</xref>). GeoCore&#x2019;s structured approach enabled categorization and retrieval of geoscientific information, demonstrating how well-defined ontological frameworks can facilitate semantic consistency across diverse datasets. GSO, developed in Canada, exemplifies a structured representation of key geoscience knowledge through a three-layer framework, enabling comprehensive representation and customization for specific requirements. Despite these advancements, there are shortcomings in fully capturing the dynamic and complex nature of geosciences. For example, <xref ref-type="bibr" rid="B3">Babiae et al. (2023)</xref> illustrate, in the case of mineral systems, that even well-designed ontologies are not suitable for direct application in NER tasks without substantial transformation and adaptation. These ontologies are also not readily accessible for public research. Similarly, GeoCore and GSO faced challenges in integrating knowledge with data usage and adapting to emerging terminologies across various geological sub-schemas (cf. sub-disciplines and sub-categories used elsewhere). Problems arise when updating ontologies, as geoscience is constantly evolving. Incorporating new sub-schemas requires expert validation and periodic revisions to include new interdisciplinary terms that often do not fit into existing categories, complicating integration efforts. 
Additionally, reliance on foundational frameworks creates difficulties in adapting to new terminology and connecting with other domain-specific ontologies, resulting in a labor-intensive process (<xref ref-type="bibr" rid="B11">Garcia, 2020</xref>; <xref ref-type="bibr" rid="B50">Brodaric and Richard, 2020</xref>). Addressing these challenges, <xref ref-type="bibr" rid="B36">Qiu et al. (2023)</xref> introduced a geological domain ontology with over 50,000 terms across twenty-three sub-categories, representing a significant step forward in enhancing NER in geosciences. Their research classifies geological entities into six main types, including geological time and structures, facilitating systematic labeling of academic literature in the Chinese language.</p>
<p>Schemas form the core structure of these ontologies, enabling the organization of data, the definition of relationships between words, and the recognition of domain-specific terminology. The development of effective schemas for NER in geosciences has been highlighted by <xref ref-type="bibr" rid="B26">Ma (2022)</xref> and further explored by <xref ref-type="bibr" rid="B51">Wang et al. (2022)</xref>, who discussed the use of schemas in structuring geoscientific data, stressing the importance of having clear objectives and focused classification types relevant to the field. Despite these advancements, a significant challenge in achieving optimal NER precision in geosciences is the need for large annotated corpora verified and validated by experts (<xref ref-type="bibr" rid="B44">Villacorta et al., 2024</xref>).</p>
<p>Geoscience-specific controlled vocabularies, hosted by commissions and national or state surveys, exist for many concepts, such as stratigraphic rank (<xref ref-type="bibr" rid="B52">Cox and Richards, 2015</xref>) and lithology, further supporting the organization and standardization of geoscientific data. These controlled vocabularies provide an ontological framework emphasizing the importance of clarity and consistency in communication within specialized topics like mineral exploration (<xref ref-type="bibr" rid="B22">Lindsay et al., 2024</xref>), further illustrating the need for domain-specific tools and frameworks in NER tasks.</p>
<p>While significant progress has been made in NER for geosciences, comprehensive, ontology-driven approaches remain a critical challenge. Such approaches are essential for enhancing the capabilities of automated annotation systems and for effectively exploring the intricate relationships between geological entities. To contribute to addressing this challenge, building geological knowledge graphs offers a new approach to structuring complex geoscience texts and provides a practical visual analysis of the insights from geoscience papers (<xref ref-type="bibr" rid="B47">Zhou et al., 2021</xref>). Addressing these gaps offers an opportunity to broaden the scope of NER research to encompass a wider variety of entities and relationships, thereby increasing its relevance and applicability to geoscience research and exploration.</p>
<p>This study aims to contribute to these ongoing efforts to improve NER by analyzing the use of specialized geological schemas tailored for this field. Building on prior research in corpora creation, this research explores avenues for achieving a semantic understanding of geoscientific language. Specifically, our research focuses on the development and application of three distinct geological schemas: OzRock, GeoIElite_rev, and SMS, applied to corpora concerning iron and lithium mineral deposits in Western Australia. This approach exemplifies ML applications in geological contexts and contributes to the understanding and processing of geoscientific language, targeting a notably underexplored area in geosciences. The primary objectives are to enhance geoscientific data processing and knowledge representation, thereby optimizing the extraction of information from geoscientific texts. Such improvements are crucial for facilitating more efficient knowledge discovery and data management within the field.</p>
<p>In the following sections of this article, we will show that assessing NER in the geoscience domain enables more reliable results consistent with geological reasoning. Accordingly, adhering to the methodologies outlined in this paper provides a practical approach to assess the effectiveness of NER in this intricate field.</p>
</sec>
<sec sec-type="methods" id="s2">
<title>2 Methodology</title>
<p>This section outlines the creation of a domain-specific schema, emphasizing its key role in enhancing NER and classification. Developing such a schema is connected to addressing specific research questions for understanding complex domain issues, and schemas are more effective when they are aligned with the domain&#x2019;s processes and when they can capture lexical entities and their semantic relationships. In response to the identified gaps within existing ontological frameworks as discussed in the introduction, we include the process of adapting these frameworks into a functional schema tailored specifically for the NER system for this paper. This adaptation involved customization to align vocabularies with the unique lexical and semantic challenges presented by texts on mineral systems, ensuring the final schema could effectively support entity recognition and classification. The exploration of the OntoLex-Lemon model&#x2014;the primary mechanism for representing lexical data on the Semantic Web&#x2014;demonstrates how detailed semantic relationships, context-specific usage, and multilingual representation can be effectively captured (<xref ref-type="bibr" rid="B29">McCrae et al., 2017</xref>). This model defines lexical concepts with metadata about usage contexts, capturing nuanced differences in word usage across domains and languages, including specialized terms. Through two use cases, representing multilingual dictionaries and the WordNet Collaborative Interlingual Index, <xref ref-type="bibr" rid="B29">McCrae et al. (2017)</xref> illustrate how the model addresses complex linguistic structures and domain-specific terminology. Such a semantic approach when designing the ontological framework is reinforced by <xref ref-type="bibr" rid="B5">Bikaun et al. (2024)</xref>, who developed a schema for the maintenance domain. It focuses on maintenance work order texts generated by technicians during engineering tasks, which primarily describe equipment conditions. 
Their schema is structured around critical questions, such as &#x2018;who is performing what action on what component, and why?&#x2019; For instance, &#x2018;who?&#x2019; refers to a technician, &#x2018;what action?&#x2019; could be &#x2018;replacing,&#x2019; &#x2018;what component?&#x2019; could be &#x2018;a broken alternator bolt,&#x2019; and &#x2018;why?&#x2019; would be &#x2018;due to failure.&#x2019; By organizing the schema around these questions, <xref ref-type="bibr" rid="B5">Bikaun et al. (2024)</xref> emphasize the importance of a question-driven design in improving information extraction and knowledge representation. This approach ensures technical robustness and relevance to research objectives, leading to more precise and meaningful entity recognition outcomes while avoiding the ambiguity that arises from non-specific constraints, such as excessive class options that confuse AI systems when selecting the correct class in a given context.</p>
<sec id="s2-1">
<title>2.1 Steps to create a schema in a specialized domain</title>
<p>Adhering to the methodology proposed by <xref ref-type="bibr" rid="B20">Lamparter et al. (2004)</xref> and incorporating the insights provided by <xref ref-type="bibr" rid="B36">Qiu et al. (2023)</xref>, the procedure entails the following stages:<list list-type="simple">
<list-item>
<p>&#x2022; Domain Characterization: This involves identifying and defining the scope and relevant concepts within the specific domain to ensure comprehensive coverage and precision in entity recognition. Collaboration with domain experts to capture the intricacies of the selected field, and the creation of annotation guidelines and a domain ontology, are considered.</p>
</list-item>
<list-item>
<p>&#x2022; Word Disambiguation: This step is crucial for distinguishing between multiple meanings of terms, which is common in technical fields like geoscience. It involves deciding the most appropriate meanings and improving clarity in entity recognition to ensure that the schema accurately reflects the intended connotation of terms. This potentially reduces ambiguity and enhances precision in domain-specific entity classification.</p>
</list-item>
<list-item>
<p>&#x2022; Taxonomy Creation: Develop a hierarchical organization by identifying and structuring the domain&#x2019;s classes, entities, and relations. This taxonomy forms the backbone of the schema, facilitating systematic classification and information retrieval (<xref ref-type="fig" rid="F1">Figure 1</xref>). It requires defining parent-child relationships, attribute hierarchies, and cross-references among entities. The taxonomy should be flexible enough to accommodate new findings and scalable to manage large datasets. Tools like ontology editors can assist in visualizing and managing this complex structure.</p>
</list-item>
<list-item>
<p>&#x2022; Identification of Other Relations: Beyond hierarchical classifications, capturing complex interactions within the analysed data is essential for accurately representing the nuances of geological information. This step involves identifying and defining relational attributes that illustrate how different entities interact or influence one another, including temporal relationships, spatial dependencies, and causal links. These relations enrich the schema, allowing for more dynamic querying and analysis of geoscientific data. The approach of <xref ref-type="bibr" rid="B36">Qiu et al. (2023)</xref> in leveraging a knowledge graph to capture these intricate relationships can serve as a model for this process.</p>
</list-item>
</list>
</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Typical academic papers database schema (from <xref ref-type="bibr" rid="B14">Guo et al., 2024</xref>).</p>
</caption>
<graphic xlink:href="feart-13-1530004-g001.tif"/>
</fig>
</sec>
<sec id="s2-2">
<title>2.2 Tools and libraries</title>
<p>The following tools are common in these kinds of applications:<list list-type="simple">
<list-item>
<p>&#x2022; NLTK (Natural Language Toolkit): This library is used for text tokenization, particularly for breaking down the extracted text from PDF documents into individual sentences (<xref ref-type="bibr" rid="B25">Loper and Bird, 2002</xref>). It is widely recognized for its extensive collection of text-processing libraries suitable for tokenization, parsing, and classification tasks.</p>
</list-item>
<list-item>
<p>&#x2022; Pdfplumber and Pdfminer: These libraries are employed to extract text from PDF files. <italic>pdfplumber</italic> (<xref ref-type="bibr" rid="B40">Singer-Vine, 2020</xref>) offers robust capabilities for extracting text, tables, and other data from PDFs, while <italic>pdfminer</italic> (<xref ref-type="bibr" rid="B39">Shinyama and Guglielmetti, 2014</xref>) handles exceptions related to PDF parsing errors.</p>
</list-item>
<list-item>
<p>&#x2022; Flair: This library is popular for its good performance when training and applying the NER model for specialized domains (<xref ref-type="bibr" rid="B1">Akbik et al., 2019</xref>). Flair provides a simple interface for training and applying state-of-the-art sequence taggers, such as NER models.</p>
</list-item>
<list-item>
<p>&#x2022; Pandas: A data manipulation and analysis library used here to load and handle annotated datasets from CSV files (<xref ref-type="bibr" rid="B30">McKinney, 2010</xref>). <italic>Pandas</italic> is essential for managing and preprocessing structured data efficiently.</p>
</list-item>
<list-item>
<p>&#x2022; Scikit-learn: This library provides ML and statistical modelling tools, including the <italic>confusion_matrix</italic> and <italic>classification_report</italic> functions used to evaluate the NER model&#x2019;s performance (<xref ref-type="bibr" rid="B34">Pedregosa et al., 2011</xref>). These functions are fundamental for generating performance metrics that offer insights into the model&#x2019;s accuracy and error rates.</p>
</list-item>
<list-item>
<p>&#x2022; Matplotlib and Seaborn: These libraries are utilized for data visualization, specifically for plotting the confusion matrix. <italic>Matplotlib</italic> is a versatile plotting library, while <italic>Seaborn</italic> builds on <italic>Matplotlib</italic> by providing an interface for creating informative statistical graphics.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2-3">
<title>2.3 Validation and performance evaluation</title>
<p>The validation process evaluates the suitability of schemas when combined with the NER model for recognizing and classifying specific domain entities. This aims to assess NER model performance and identify areas for improvement. It also helps to identify common misclassifications of the NER algorithm and understand the underlying reasons for these errors. Key steps of this process are:<list list-type="simple">
<list-item>
<p>&#x2022; Annotation and Benchmarking: These involve manually annotated benchmark datasets. These datasets contain specific-domain entities that experts annotate (verify) for correct classification using the selected schema. They serve as a reference for evaluating the performance of the NER models.</p>
</list-item>
<list-item>
<p>&#x2022; Evaluation Using Confusion Matrices: The model&#x2019;s predictions must be compared to the benchmark dataset to visualize the correspondence between the NER model&#x2019;s predictions and the verified dataset categories. The visual representation helps identify areas where the model performs well and where it struggles when classifying entities. Darker diagonal cells in the matrices indicate correct predictions, while lighter, non-diagonal cells highlight misclassifications.</p>
</list-item>
<list-item>
<p>&#x2022; Weighted F1 score: This is an evaluation metric that combines precision and recall to assess the performance of NER systems, especially in scenarios where class distribution may be uneven or complex (<xref ref-type="bibr" rid="B41">Tjong Kim Sang and De Meulder, 2003</xref>). Unlike the standard F1 score, the weighted F1 score calculates the F1 score for each class and then takes a weighted average based on the number of instances of each class. This ensures that the score reflects the model&#x2019;s performance across all entity classes, not just the most frequent ones.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2-4">
<title>2.4 Script pipeline</title>
<p>The application starts by converting PDF documents into corpora, then applies a pre-trained NER model, and finally evaluates its performance using a confusion matrix and classification report. It includes the following parts (<xref ref-type="fig" rid="F2">Figure 2</xref>):<list list-type="simple">
<list-item>
<p>&#x2022; Pre-processing: The script begins by extracting text from PDF files located in a specified directory on the virtual environment (workspace). The <italic>pdfplumber</italic> library is used to open and read the text from each PDF. The text is then tokenized into sentences using <italic>nltk.sent_tokenize</italic>, which facilitates subsequent processing by the NER model. This step converts the raw textual data into a format the model can process (<xref ref-type="bibr" rid="B45">Villacorta and Lindsay, 2023</xref>).</p>
</list-item>
<list-item>
<p>&#x2022; NER: A pre-trained NER model (<italic>best-model.pt</italic>), previously obtained using an annotated dataset on a domain-specific corpus, is loaded using the Flair library to recognize entities relevant to the geosciences.</p>
</list-item>
<list-item>
<p>&#x2022; Sentence Extraction: Sentences are extracted from the PDF files stored in the workspace for further processing. Using pandas, an annotated dataset (validated by experts) containing manually labelled sentences is loaded from a CSV file. These annotations serve as a benchmark for evaluating the model&#x2019;s predictions.</p>
</list-item>
<list-item>
<p>&#x2022; Entity Classification: The NER model is applied to each sentence from the annotated dataset. The model identifies terms and assigns entity labels for each sentence.</p>
</list-item>
<list-item>
<p>&#x2022; Evaluation: The evaluate function compares the previously identified entity labels to the true labels in the annotated dataset. It calculates and prints a classification report, which includes precision, recall, and F1-scores for each entity class. A confusion matrix is also generated to visually represent the model&#x2019;s performance. This step is essential for assessing the model&#x2019;s accuracy.</p>
</list-item>
</list>
</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Pipeline of the NER process.</p>
</caption>
<graphic xlink:href="feart-13-1530004-g002.tif"/>
</fig>
</sec>
</sec>
<sec id="s3">
<title>3 Case study: comparing the efficacy of the NER Flair model using geological schemas and geoscience papers on iron and lithium deposits in Australia</title>
<p>This section outlines the creation of the Schema for Mineral Systems (SMS), designed from a controlled vocabulary for mineral exploration. This schema incorporates the critical components of mineral systems, unifying cross-discipline geoscientific concepts to capture various physical processes and spatial-temporal elements associated with the formation of economically viable mineral resources (<xref ref-type="bibr" rid="B22">Lindsay et al., 2024</xref>). By using SMS, we illustrate the benefits of domain-specific ontological approaches and ML tools for entity recognition and classification of complex terminology. In this study, we use the SMS to structure corpora derived from academic literature related to iron and lithium deposits and to assess the NER of mineral systems terminology. Following the method explained in <xref ref-type="sec" rid="s2">Section 2</xref>, while any kind of question can guide schema development, the questions used here need to focus specifically on the &#x2018;what&#x2019;, &#x2018;how&#x2019;, &#x2018;when&#x2019;, or &#x2018;why&#x2019; of geological processes and phenomena, rather than being broad, non-process-oriented questions. For example, general &#x2018;who&#x2019; questions are not relevant here, as our schema is centered on natural processes rather than human actions. Considering that, three specific questions were selected to benchmark our schema design and evaluate its effectiveness in capturing relevant terminology (geological entities) in the context of the mineral deposits we have chosen:<list list-type="simple">
<list-item>
<p>&#x2022; &#x201c;What are the important tectonic processes for lithium-bearing deposits?&#x201d;;</p>
</list-item>
<list-item>
<p>&#x2022; &#x201c;What are the important structures for iron deposits in Western Australia?&#x201d;;</p>
</list-item>
<list-item>
<p>&#x2022; &#x201c;What are the important mineralogical associations for iron and lithium deposits in Australia?&#x201d;</p>
</list-item>
</list>
</p>
<p>These questions, serving as benchmarks rather than research inquiries, set specific objectives for the schema. By assessing how accurately the schema identifies and classifies terms related to these questions, we can approximate its ability to cover essential terminology (critical geological concepts relevant to mineral systems), as well as the contextual appropriateness of the selected entity classes.</p>
<p>The tools summarized in <xref ref-type="sec" rid="s2-2">Section 2.2</xref> were integrated into the EASI Hub high-performance cluster (<xref ref-type="bibr" rid="B53">Woodcock et al., 2018</xref>), which has a Tesla V100 GPU, a high-performance processor designed specifically for deep learning and parallel computing tasks. The complete script with usage instructions is included in <xref ref-type="sec" rid="s13">Supplementary Appendix 1</xref>.</p>
<sec id="s3-1">
<title>3.1 Training of the Flair NER model</title>
<p>In this study, Flair was employed to read geological corpora using three geological schemas: OzRock (<xref ref-type="bibr" rid="B10">Enkhsaikhan, 2021</xref>), GeoIElite_rev (<xref ref-type="bibr" rid="B44">Villacorta et al., 2024</xref>), and SMS. Flair&#x2019;s effectiveness in domain-specific applications is well-documented across diverse fields such as the biomedical sector for extracting entities like diseases and genes (<xref ref-type="bibr" rid="B33">Patel, 2020</xref>), the legal field for identifying statutes and case law (<xref ref-type="bibr" rid="B28">Mathis, 2022</xref>), and the business sector for recognizing financial entities like organizations and currencies (<xref ref-type="bibr" rid="B4">Bhattacharya, 2023</xref>). The training process involved fine-tuning the Flair NER model on annotated datasets associated with OzRock, GeoIElite_rev and SMS. Each of these schemas was selected for its characteristics, which make it suitable for geoscientific text processing. OzRock offers a comprehensive overview of general geological entity classes and serves as the baseline for understanding common geological terms and categories relevant to mineral exploration (<xref ref-type="bibr" rid="B10">Enkhsaikhan, 2021</xref>). GeoIElite_rev was developed to delve deeper into specialized geological entity classes and focuses on processing academic papers concerning iron deposits in Western Australia. It enhances the granularity of geological classifications beyond the foundational OzRock. Complementing these, the SMS schema was developed by the research group involved in writing this paper to address the most complex and nuanced aspects of geoscientific terminology, particularly those associated with mineral systems. The training process involved fine-tuning the Flair NER model on annotated datasets corresponding to these schemas.</p>
<sec id="s3-1-1">
<title>3.1.1 Datasets</title>
<p>
<list list-type="simple">
<list-item>
<p>- Iron and lithium deposits in Western Australia hold considerable economic and environmental importance, influencing global markets, particularly in steel production and battery manufacturing domestically and internationally (<xref ref-type="bibr" rid="B54">Angerer et al., 2015</xref>; <xref ref-type="bibr" rid="B35">Perring et al., 2020</xref>; <xref ref-type="bibr" rid="B13">Greim et al., 2020</xref>). Western Australia&#x2019;s geological setting and tectonic development are prospective for these deposits and thus worthwhile for scientific investigation. This study explores academic literature about these deposits within the framework described here using three geoscience schemas: OzRock, GeoIElite_rev, and SMS.</p>
</list-item>
<list-item>
<p>- <italic>OzRock</italic> (<xref ref-type="bibr" rid="B10">Enkhsaikhan, 2021</xref>) was generated from a corpus of hundreds of documents and is focused on mineral exploration. In this dataset, the geological entities are categorised into six types (<xref ref-type="table" rid="T1">Table 1</xref>). For our research, the OzRock Evaluation set, whose annotations comprise 83,838 sentences and 3,238 entities, was utilized as a schema to train Flair NER to produce a customized NER model for the geoscience domain. This model enabled recognition of geological classes from the explored geoscience papers related to iron and lithium deposits. This dataset, publicly available on GitHub (<ext-link ext-link-type="uri" xlink:href="https://github.com/majiga/OzROCK">https://github.com/majiga/OzROCK</ext-link>), was already annotated by domain experts, allowing us to utilize it directly for model training without additional annotation. The wide coverage and extensive representation of general geological entities provided a robust base for the model to learn from well-defined classes.</p>
</list-item>
<list-item>
<p>- GeoIElite_rev (<xref ref-type="bibr" rid="B44">Villacorta et al., 2024</xref>) was developed to compare other geological classes with those included in OzRock. It was constructed from concepts detailed in 20 PDF papers focused on iron deposits (list of papers in <xref ref-type="sec" rid="s13">Supplementary Appendix 2</xref>). This dataset encompasses eighteen distinct entity classes (<xref ref-type="table" rid="T2">Table 2</xref>) and required manual annotation to ensure its entity classes were appropriately applied to this project. The annotation count for GeoIElite_rev includes 5,400 sentences and 5,028 entities.</p>
</list-item>
<list-item>
<p>- SMS: The Schema for Mineral System (SMS) was developed to evaluate NER within the domain of mineral systems literature. Following the steps indicated in <xref ref-type="sec" rid="s2">Section 2</xref>, the SMS schema was defined collaboratively with geoscientists, and defines complex mineral systems terminology through literature review. Critical entities and relationships were identified, and a hierarchical taxonomy was created to organize the schema&#x2019;s 14 classes (<xref ref-type="table" rid="T3">Table 3</xref>), providing flexibility to accommodate updates for large datasets. After several discussions, subject matter experts selected nine of the twenty-four defined geological classes to be part of this schema. These include the specific terminology associated with the selected questions considered as relevant. In addition, following the methodology outlined by <xref ref-type="bibr" rid="B55">Ding et al. (2021)</xref>, specific categories such as Country, Province/State, and City were consolidated into a single class, GPE (Geopolitical Entity), to address context-based ambiguities. Additionally, general domain classes, such as Person-Scholar (PS), were included to capture mentions of researchers (for example, geologists, biologists, and palaeontologists) and ensure the semantics are understandable to the machine. Similarly to the case of GeoIElite_rev, this dataset required manual annotation to ensure it was appropriately applied to this project. The annotation count is 910 sentences and 832 entities.</p>
</list-item>
</list>
</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Description of OzRock entity types (<xref ref-type="bibr" rid="B10">Enkhsaikhan, 2021</xref>).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Label (class)</th>
<th align="left">Description</th>
<th align="left">Example</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">MINERAL</td>
<td align="left">Mineral</td>
<td align="left">Copper, fire opal, goethite, gold, iceland spar, magnesite, iron, natural salt, silica</td>
</tr>
<tr>
<td align="left">ROCK</td>
<td align="left">Lithology</td>
<td align="left">Conglomerate, sandstone, felsic volcanic rock, migmatite, volcaniclastic sedimentary rock</td>
</tr>
<tr>
<td align="left">ORE_DEPOSIT</td>
<td align="left">Ore types</td>
<td align="left">Channel iron deposit, iron ore, nickel ore, silver ore</td>
</tr>
<tr>
<td align="left">TIMESCALE</td>
<td align="left">Geological time</td>
<td align="left">Archean, Lower Proterozoic, Paleoproterozoic, Triassic, Upper Cretaceous</td>
</tr>
<tr>
<td align="left">STRAT</td>
<td align="left">Stratigraphy</td>
<td align="left">Angas Hills Formation, Bingy Bingy Basalt Member, Marra Mamba Iron Formation</td>
</tr>
<tr>
<td align="left">LOCATION</td>
<td align="left">Geographical location</td>
<td align="left">Kalgoorlie Terrane, Kimberley Craton, Perth, Pilbara, Pilbara Craton, Western Australia</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Description of GeoIElite_rev entity types (<xref ref-type="bibr" rid="B44">Villacorta et al., 2024</xref>).</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Label (class)</th>
<th align="left">Description</th>
<th align="left">Example</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">FORMATION</td>
<td align="left">Geological formation</td>
<td align="left">Angas Hills Formation, Bingy Bingy Basalt Member, Marra Mamba Iron Formation</td>
</tr>
<tr>
<td align="left">AGE</td>
<td align="left">Age of the rocks</td>
<td align="left">4,000 to 2,500 million years ago, 2,500&#x2013;541 million years ago</td>
</tr>
<tr>
<td align="left">TIMESCALE</td>
<td align="left">Geological time</td>
<td align="left">Archean, Lower Proterozoic, Paleoproterozoic, Triassic, Upper Cretaceous</td>
</tr>
<tr>
<td align="left">MINERAL</td>
<td align="left">Mineral</td>
<td align="left">Fire opal, goethite, martite, Iceland spar, natural salt</td>
</tr>
<tr>
<td align="left">ROCK</td>
<td align="left">Lithology</td>
<td align="left">Conglomerate, sandstone, felsic volcanic rock, migmatite, volcaniclastic sedimentary rock, metamorphic gneisses</td>
</tr>
<tr>
<td align="left">PROCESS</td>
<td align="left">Geological process</td>
<td align="left">Deposition, erosion, basin development, mountain building, volcanism, weathering, hydrothermal alteration and mineralization, karst formation</td>
</tr>
<tr>
<td align="left">ELEMENT</td>
<td align="left">Metal/elements</td>
<td align="left">Iron, gold, nickel, lithium, bauxite, copper, zinc, lead, cobalt, rare earth elements, tantalum and niobium, vanadium, platinum group elements, uranium, manganese</td>
</tr>
<tr>
<td align="left">CHARACTERISTIC</td>
<td align="left">Geological feature</td>
<td align="left">Fractured, metamorphosed, pelitic, altered, folded, weathered, intruded, granitic, foliated, sheared, veined</td>
</tr>
<tr>
<td align="left">LOCATION</td>
<td align="left">Geographical location</td>
<td align="left">Countries, cities, states, places like: Kalgoorlie, Kimberley, Pilbara, Western Australia</td>
</tr>
<tr>
<td align="left">ORE_DEP_REG</td>
<td align="left">Locations where mineral resources have been discovered or explored</td>
<td align="left">Mines, exploration sites like: Kalgoorlie Terrane, Kimberley Craton, Perth, Pilbara, Pilbara Craton, Western Australia</td>
</tr>
<tr>
<td align="left">LANDFORM</td>
<td align="left">Geomorphological forms</td>
<td align="left">Channel, cratons, mountain, basin, hill, ophiolite (represents ancient oceanic crust and upper mantle rocks), karst systems, river, lava flows, lakes, dunes, regolith, pluton</td>
</tr>
<tr>
<td align="left">TYPE</td>
<td align="left">Type of ore deposits</td>
<td align="left">Banded iron, nickel sulfide, volcanogenic massive sulfide, copper&#x2013;gold porphyry</td>
</tr>
<tr>
<td align="left">METHOD</td>
<td align="left">Methods of exploration activities</td>
<td align="left">Drilling, sampling, or testing</td>
</tr>
<tr>
<td align="left">YEAR</td>
<td align="left">Year of exploration activities</td>
<td align="left">1970, 1980, 1990, 2000, 2003, 2005</td>
</tr>
<tr>
<td align="left">COMPANY</td>
<td align="left">Company responsible for exploration/production</td>
<td align="left">BHP Group, Rio Tinto, Fortescue Metals Group, Gold Fields, Western Areas, IGO Limited, Pilbara Minerals, Woodside Energy</td>
</tr>
<tr>
<td align="left">INSTITUTION</td>
<td align="left">Government entity involved</td>
<td align="left">Department of Energy, Mines, Industry Regulation and Safety, Geological Survey of Western Australia, Australian Government Department of Industry, Science and Resources, Minerals Research Institute of Western Australia<break/>Environmental Protection Authority (EPA) of Western Australia, Western Australian Planning Commission, Aboriginal Lands Trust, Office of the Environmental Protection Authority, Water and Environmental Regulation Department</td>
</tr>
<tr>
<td align="left">PERM_LIC</td>
<td align="left">Permissions/licenses for exploration and production</td>
<td align="left">Exploration License, Mining Lease, Prospecting License, Retention License, Miscellaneous License, General Purpose Lease, Program of Work Approval, Environmental Approvals, Native Title Agreements, Water License, Cultural Heritage Clearances</td>
</tr>
<tr>
<td align="left">IMPACT</td>
<td align="left">Environmental impact of exploration</td>
<td align="left">Impact on water, air, or land</td>
</tr>
</tbody>
</table>
</table-wrap>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Description of SMS entity types.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Label (class)</th>
<th align="left">Description</th>
<th align="left">Example</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">LITHOSPHERIC ARCHITECTURE</td>
<td align="left">The geometric structure of the solid Earth (Earth&#x2019;s crust and lithospheric mantle) as defined by domains of similar chemical composition and the discontinuities that separate them. Lithospheric architecture is the result of geodynamic processes</td>
<td align="left">Chemical compositional change, age, geometry, physical property change, mineralogy, domain</td>
</tr>
<tr>
<td align="left">TECTONIC HISTORY</td>
<td align="left">The temporal sequence of events that forms lithospheric architecture, such as magmatism, deformation, metamorphism, subsidence or exhumation. These events can be interpreted in paradigmatic frameworks and attributed to geodynamic processes such as subduction, seafloor spreading, mantle plumes, large igneous provinces, and the resulting geodynamic environments</td>
<td align="left">Magmatism, deformation, metamorphism, subsidence, uplift</td>
</tr>
<tr>
<td align="left">TECTONIC SETTING</td>
<td align="left">Lithospheric region deformed by contiguous geodynamic conditions resulting in characteristic geological processes</td>
<td align="left">Deformational regime, thermal regime</td>
</tr>
<tr>
<td align="left">TEMPORAL EXTENT</td>
<td align="left">Period during which processes responsible for forming or developing a particular mineral system occur</td>
<td align="left">Time units (chronostratigraphic units): Archean, Lower Proterozoic, Paleoproterozoic, Triassic, Upper Cretaceous</td>
</tr>
<tr>
<td align="left">MINERAL</td>
<td align="left">Inorganic elements or compounds (apart from liquid mercury and a few organic minerals) and defined by their chemical composition and crystal structure</td>
<td align="left">Fire opal, goethite, martite, Iceland spar, natural salt, quartz, magnetite, columbite, monazite</td>
</tr>
<tr>
<td align="left">ROCK</td>
<td align="left">Solid mass of aggregate of minerals (lithology)</td>
<td align="left">Conglomerate, sandstone, felsic volcanic rock, migmatite, volcaniclastic sedimentary rock, metamorphic gneisses</td>
</tr>
<tr>
<td align="left">GEODYNAMIC ENVIRONMENT</td>
<td align="left">Dynamic setting characterized by planetary-scale events and physical processes in the Earth&#x2019;s crustal and mantle envelopes. Differs from tectonic settings in time and space</td>
<td align="left">Boundary condition, subduction, seafloor spreading, mantle plume, large igneous province, volcanism, mountain building, basin formation, hotspot</td>
</tr>
<tr>
<td align="left">SOURCE</td>
<td align="left">A volume of rock, fluid or magma that, by its chemical composition, acts as an origin for a particular chemical compound (ion, ligand, crystal or lithic fragment) that is subsequently transported from its primary site to a secondary site (of mineralization)</td>
<td align="left">Sedimentary pile, felsic magmas, mafic magmas, source rock, crystalline basement, metal, fluid, chemical species: iron, gold, nickel, lithium, bauxite, copper, zinc, lead, cobalt, rare earth elements, tantalum and niobium, vanadium, platinum group elements, uranium, manganese</td>
</tr>
<tr>
<td align="left">ROCK DEFORMATION</td>
<td align="left">The change of shape or the displacement of a mineral aggregate through crystal-plastic (ductile) processes or by fracturing (brittle) due to mechanical failure</td>
<td align="left">Crystal-plastic deformation, brittle deformation</td>
</tr>
<tr>
<td align="left">GPE (GEOGRAPHIC LOCATION)</td>
<td align="left">Geographical location of the place entities as represented by latitude and longitude values</td>
<td align="left">Countries, cities, states, geographical coordinates</td>
</tr>
<tr>
<td align="left">DATE</td>
<td align="left">Absolute or relative dates or periods (general domain)</td>
<td align="left">5000 BC, 1750 AD, 20th Century, 18th Century, 1990s, 1500s, 2010, 2011, 2012, 2013, etc.</td>
</tr>
<tr>
<td align="left">LOC</td>
<td align="left">Non-GPE locations, mountain ranges, bodies of water</td>
<td align="left">Places like: Kalgoorlie, Kimberley, Pilbara, Western Australia</td>
</tr>
<tr>
<td align="left">PS (PERSON-SCHOLAR)</td>
<td align="left">Researchers&#x2019; names (for example, geologists, biologists, and palaeontologists)</td>
<td align="left">During, Perring, Ramanaidou, Thorne, Angerer, Rodger, etc.</td>
</tr>
<tr>
<td align="left">QUANTITY</td>
<td align="left">Measurements, as of weight or distance</td>
<td align="left">km, metric, tons, degrees, mm, litres, percent, etc.</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s3-1-2">
<title>3.1.2 Training process</title>
<p>The training process involved fine-tuning the Flair NER model individually for each geological schema, resulting in three distinct best-model.pt files, each tailored to its specific entity classes. For each schema, the model was loaded and fine-tuned on its respective annotated dataset, integrating each schema&#x2019;s structured vocabulary to specialize in recognizing terms relevant to each schema. For instance, the SMS schema emphasized classes like TECTONIC_SETTING, critical for analyzing mineral systems, while the OzRock schema covered broader geological categories like MINERAL and ROCK. This structured approach enabled the model to differentiate and contextualize geological terms according to each schema&#x2019;s focus, ensuring tailored recognition capabilities across the three different geological datasets.</p>
</sec>
</sec>
<sec id="s3-2">
<title>3.2 Validation and performance evaluation</title>
<p>The validation process for the geological schemas used in this project involved testing the Flair NER model on our annotated datasets. These annotations were performed by subject matter experts consisting of the co-authors of this paper along with additional colleagues from CSIRO Mineral Resources. Their expertise delivered a rich depth of knowledge necessary for accurately tagging geological entities in the corpora.</p>
<p>The methodology for annotation was jointly developed by the authors of this paper. Given the logistical challenges of working in different locations and the limitations of open-access annotation tools, which were not suitable for handling large corpora, we, as annotators, opted for a more flexible approach. Annotations were made directly within online Google Sheets documents, which facilitated easy access and editing. This approach allowed for real-time collaboration, ensuring all annotators could participate effectively despite geographical disparities.</p>
<p>The expert-validated annotations and the Flair NER model&#x2019;s predicted classifications were continuously compared using the collaborative Google Sheets documents as an online platform. This setup ensured a dynamic and responsive validation process, allowing for immediate expert inputs and adjustments.</p>
<p>To visually represent the accuracy of classifications, confusion matrices were generated. These matrices showcased the alignment between the model&#x2019;s predictions and the expert-validated categories, highlighting any discrepancies and common misclassifications. The evaluation also included calculating weighted F1 scores, providing a detailed measure of the model&#x2019;s precision and recall, particularly for handling the diverse and occasionally rare classes within the SMS schema. This metric was crucial for assessing the nuanced performance of the NER system across different geological terminologies.</p>
</sec>
</sec>
<sec sec-type="results" id="s4">
<title>4 Results</title>
<p>The comparison of F1 scores across the OzRock, GeoIElite_rev, and SMS schemas reveals significant variations in performance (<xref ref-type="fig" rid="F3">Figures 3</xref>&#x2013;<xref ref-type="fig" rid="F8">8</xref>). <xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F5">5</xref>, <xref ref-type="fig" rid="F7">7</xref> present Flair NER confusion matrices and F1 scores for the three geoscience schemas applied to iron deposit literature. These matrices utilize a blue palette to indicate the count of predictions made by the model, with darker shades of blue representing higher frequencies of classifications within each category. Similarly, <xref ref-type="fig" rid="F4">Figures 4</xref>, <xref ref-type="fig" rid="F6">6</xref>, <xref ref-type="fig" rid="F8">8</xref> use the same visual representation for the analysis of lithium deposit research papers. The following results provide insights into the strengths and limitations of each schema and highlight areas for further enhancement. Previous research (<xref ref-type="bibr" rid="B44">Villacorta et al., 2024</xref>) indicated that increasing the number of papers does not improve the F1 score. Hence, we compared schemas based on different class types and numbers. Note in the figures that an &#x2018;O&#x2019; was used to indicate tokens that do not belong to any entity, such as &#x201c;by&#x201d; or &#x201c;and&#x201d;.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>
<bold>(A)</bold> OzRock Confusion matrix (Wgt F1: 0.72). <bold>(B)</bold> F1 scores for NER on Iron Deposits Datasets by class.</p>
</caption>
<graphic xlink:href="feart-13-1530004-g003.tif"/>
</fig>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>
<bold>(A)</bold> OzRock Confusion matrix (Wgt F1: 0.71). <bold>(B)</bold> F1 scores for NER on Lithium Deposits Datasets by class.</p>
</caption>
<graphic xlink:href="feart-13-1530004-g004.tif"/>
</fig>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>
<bold>(A)</bold> GeoIElite_rev Confusion matrix (Wgt F1: 0.69). <bold>(B)</bold> F1 scores for NER on Iron Deposits Datasets by class.</p>
</caption>
<graphic xlink:href="feart-13-1530004-g005.tif"/>
</fig>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>
<bold>(A)</bold> GeoIElite_rev Confusion matrix (Wgt F1: 0.70). <bold>(B)</bold> F1 scores for NER on Lithium Deposits Datasets by class.</p>
</caption>
<graphic xlink:href="feart-13-1530004-g006.tif"/>
</fig>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>SMS <bold>(A)</bold> Confusion matrix (Wgt F1: 0.27). <bold>(B)</bold> F1 scores for NER on Iron Deposits Datasets by class.</p>
</caption>
<graphic xlink:href="feart-13-1530004-g007.tif"/>
</fig>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>SMS <bold>(A)</bold> Confusion matrix (Wgt F1: 0.34). <bold>(B)</bold> F1 scores for NER on Lithium Deposits Datasets by class.</p>
</caption>
<graphic xlink:href="feart-13-1530004-g008.tif"/>
</fig>
<sec id="s4-1">
<title>4.1 Dataset comparisons</title>
<p>
<list list-type="simple">
<list-item>
<p>&#x2022; OzRock (<xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref>) confusion matrices and F1 score bars show that it performs robustly in identifying entity classes like &#x201c;MINERAL&#x201d;, &#x201c;ROCK&#x201d;, and &#x201c;ORE_DEPOSIT&#x201d;. The high weighted F1 scores (0.72 and 0.71) indicate better precision and recall balance than the other schemas in categorizing geological terms.</p>
</list-item>
<list-item>
<p>&#x2022; GeoIElite_rev (<xref ref-type="fig" rid="F5">Figures 5</xref>, <xref ref-type="fig" rid="F6">6</xref>) presents slightly lower weighted F1 scores of 0.69 and 0.70. This result suggests moderate effectiveness, due to the schema&#x2019;s expansive inclusion of diverse entity types, which might introduce complexity in accurately tagging less distinct classes such as &#x201c;PROCESS&#x201d; and &#x201c;METHOD&#x201d;.</p>
</list-item>
<list-item>
<p>&#x2022; SMS (<xref ref-type="fig" rid="F7">Figures 7</xref>, <xref ref-type="fig" rid="F8">8</xref>) depicts a considerable drop in weighted F1 scores to 0.27 and 0.34, indicating challenges in entity recognition.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s4-2">
<title>4.2 Entity class performance</title>
<p>
<list list-type="simple">
<list-item>
<p>&#x2022; The F1 scores across different entity classes reveal that core geological categories (&#x201c;MINERAL&#x201d;, &#x201c;ROCK&#x201d;, &#x201c;TIMESCALE&#x201d;) consistently achieve higher accuracy.</p>
</list-item>
<list-item>
<p>&#x2022; Lesser-defined classes, such as &#x201c;PROCESS&#x201d;, &#x201c;METHOD&#x201d; and &#x201c;IMPACT&#x201d;, demonstrate lower F1 scores.</p>
</list-item>
</list>
</p>
</sec>
</sec>
<sec sec-type="discussion" id="s5">
<title>5 Discussion</title>
<sec id="s5-1">
<title>5.1 Challenges and limitations</title>
<p>The limitations of NER models in recognizing annotated geological classes and their misclassification patterns are closely tied to the complexity of geoscientific terminology, the challenges in designing and annotating geological schemas and the difficulties of ensuring high-quality annotations. These factors impact model performance and highlight the need for continuous refinement of schemas and training datasets.</p>
<sec id="s5-1-1">
<title>5.1.1 Schema classes selection</title>
<p>While the detailed processes involved in the development and application of the SMS schema are outlined in the case study section, it is crucial to emphasize the broader implications of our findings here. The use of the SMS schema shows the critical need for ontological resources in geosciences that are not only scientifically rigorous but also adaptable to the evolving landscape of geological research. The encountered challenges highlight the importance of developing frameworks that can be easily updated and refined to accommodate new scientific insights and terminologies.</p>
<p>The selection of geological classes in the SMS schema, which was guided by their relevance to characterizing iron and lithium deposits, illustrates a targeted approach to ontology design. For instance, the inclusion of LITHOSPHERIC_ARCHITECTURE and TECTONIC_SETTING helped to understand the formation of iron deposits, such as banded iron formations (BIFs), which are influenced by regional tectonic activity and large-scale lithospheric processes. This specificity in class selection is crucial for enhancing the precision of NER tasks in complex domains like geosciences, where the accuracy of terminology recognition directly impacts the quality of data extracted from scholarly texts.</p>
<p>Similarly, the MINERAL and ROCK classes are key terms for both deposits, as they determine the feasibility of extraction by directly influencing the concentration and accessibility of valuable minerals. For lithium pegmatites, minerals like spodumene and lepidolite are pivotal for extraction viability, while the mineralogy and composition of banded iron formations play a critical role in determining the grade and recoverability of iron. Additionally, TEMPORAL_EXTENT aids in understanding the timeframes of geological processes critical to the formation of these deposits.</p>
<p>The analysis of confusion matrices and F1 scores for the SMS schema (<xref ref-type="fig" rid="F7">Figures 7</xref>, <xref ref-type="fig" rid="F8">8</xref>) reveals that while classes such as LITHOSPHERIC_ARCHITECTURE and GEODYNAMIC_ENVIRONMENT were recognized with reasonable accuracy, others like TEMPORAL_EXTENT and TECTONIC_SETTING experienced significant misclassification. These findings highlight the challenges in distinguishing closely related or complex classes, particularly in geosciences, where terms often have nuanced and overlapping meanings. Overly detailed categories could possibly have overwhelmed the Flair model, underscoring the importance of schema simplicity and specificity in achieving accurate NER performance.</p>
<p>The model demonstrated higher performance for schemas with general categories, as reflected by the higher F1 scores for MINERAL and ROCK in the three geoschemas. However, the low F1 scores for SMS suggest that a more nuanced definition of certain classes is necessary to capture the complexity of mineral system vocabularies. As noted by <xref ref-type="bibr" rid="B37">Qiu et al. (2019)</xref>, an effective schema starts with a focused set of terms that are representative of the domain-specific entities. Although the SMS schema was tailored to address geological questions related to lithium pegmatites and iron deposits in Western Australia, the results indicate that further refinement is required.</p>
<p>The findings suggest that the challenge lies not only in the number of classes but also in selecting foundational and contextually relevant ones. Expert input and iterative validation are critical to ensure the schema maintains classification consistency and accurately reflects geoscientific terminology, ultimately improving NER performance for specialized domains.</p>
</sec>
<sec id="s5-1-2">
<title>5.1.2 Annotation</title>
<p>Annotating large corpora for geoscience NER presents considerable challenges due to the need for substantial manual efforts. Automated tools, such as Python-based packages and specialized NER models like Flair, offer potential solutions; however, the complexity of automatic annotation and expert validation remains significant (<xref ref-type="bibr" rid="B44">Villacorta et al., 2024</xref>; <xref ref-type="bibr" rid="B6">Bikaun et al., 2022</xref>). This study corroborated the major challenge of validating annotations across extensive geological datasets. Despite efforts to comprehensively verify the annotations in a corpus of twenty papers, only one was fully annotated and validated due to time constraints and the lack of specialized annotators. This partial validation served as a foundation for automating the annotation of the rest of the corpus. The automated process faced limitations; the Flair NER model recognized only a subset of the annotated classes. The weighted F1 scores for the SMS dataset were 0.27 and 0.34 for the iron and lithium deposit datasets, respectively, indicating variability in the model&#x2019;s performance across different entity types. Specifically, the confusion matrices revealed that classes such as &#x2018;DATE&#x2019; and &#x2018;PS&#x2019; achieved high accuracy with minimal misclassifications (accuracy of 0.98 and 0.96, respectively). The geological classes like &#x2018;TECTONIC SETTING&#x2019; and &#x2018;LITHOSPHERIC ARCHITECTURE&#x2019; are often misclassified (<xref ref-type="fig" rid="F7">Figures 7</xref>, <xref ref-type="fig" rid="F8">8</xref>).</p>
<p>In addressing these challenges, <xref ref-type="bibr" rid="B36">Qiu et al. (2023)</xref> implemented a systematic approach to annotation and validation in the geological domain. Their annotation platform allowed input from domain experts, categorizing entities into six main types. A specialized Python-based annotation tool facilitated manual annotation and iterative consistency checks, achieving a high level of annotation consistency and expert involvement and enabling the authors to construct large-scale, high-quality corpora in the Chinese language. To mirror Qiu et al.&#x2019;s efforts, we can adopt a similar approach in the future by developing domain-specific annotation guidelines in collaboration with experts. Additionally, we can utilize specialized tools such as INCEpTION, an open-source platform that supports collaborative and interactive annotation in general domains (<xref ref-type="bibr" rid="B19">Klie et al., 2018</xref>). Implementing this approach, with automated checks and expert validation, can potentially produce high-quality data for training and validating NER models in geoscience, significantly improving their accuracy and reliability.</p>
</sec>
</sec>
<sec id="s5-2">
<title>5.2 Misclassification cases</title>
<p>The analysis of the SMS schema&#x2019;s misclassification patterns reveals three primary types of misclassification: overlap due to complex terminology, context dependency, and the underrepresentation of rare classes. <xref ref-type="fig" rid="F7">Figures 7</xref>, <xref ref-type="fig" rid="F8">8</xref> provide a detailed visual representation of these misclassification patterns. Several factors contribute to this misclassification, including the complexity and variability of language in the corpus, nuanced distinctions between similar classes, and potential inconsistencies in initial manual annotations. Addressing these challenges requires iteratively refining annotation schemas, improving ML algorithms, and potentially expanding the manually validated sample size to improve the model&#x2019;s accuracy and coverage.</p>
<sec id="s5-2-1">
<title>5.2.1 Overlap due to complex terminology</title>
<p>Certain classes, such as LITHOSPHERIC_ARCHITECTURE and TECTONIC_SETTING, have nuanced meanings that the model struggled to capture without expert guidance. For example, as seen in <xref ref-type="fig" rid="F9">Figure 9</xref>, terms like &#x201c;Yilgarn&#x201d; were misclassified as LITHOSPHERIC_ARCHITECTURE instead of LOC (Location), likely because they refer to geological regions. This demonstrates the model&#x2019;s difficulty in distinguishing between geological structures and geographic regions, where context plays a significant role. Another notable example is TECTONIC_SETTING, which was occasionally misclassified as ROCK (confusion matrices 7A, 7B) due to overlapping terminology with geological formations, as shown in <xref ref-type="fig" rid="F9">Figure 9</xref>. This frequent overlap, exemplified by terms such as &#x2018;continental collision,&#x2019; which may reference both tectonic settings and rock-associated processes, highlights a significant challenge in AI: the generation of spurious concept relationships or hallucinations. <xref ref-type="bibr" rid="B18">Jiang et al. (2024)</xref> emphasize that such errors in entity recognition can propagate through subsequent stages of analysis, compounding inaccuracies in data interpretation. To address this issue, further refinement of our schema is essential. By enhancing its capacity to distinguish between closely related terms, and incorporating advanced AI techniques that apply deep contextual analysis, we can improve the accuracy of entity recognition.</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Automatic annotation using as base the SMS schema, highlighting misclassification of geological entities.</p>
</caption>
<graphic xlink:href="feart-13-1530004-g009.tif"/>
</fig>
</sec>
<sec id="s5-2-2">
<title>5.2.2 Context dependency</title>
<p>The Flair NER model&#x2019;s limited capacity to incorporate contextual cues significantly complicates its handling of context-dependent classes within the SMS dataset. For instance, as depicted in <xref ref-type="fig" rid="F9">Figure 9</xref>, the terms &#x201c;Iron&#x201d; and &#x201c;Ore&#x201d; frequently receive the label &#x2018;SOURCE&#x2019; instead of the more appropriate &#x2018;MINERAL&#x2019; or &#x2018;ROCK&#x2019;, contingent upon the surrounding textual context. This inadequacy in context processing not only underscores the model&#x2019;s struggle with polysemous terms but also impacts its ability to deliver precise geological classifications. The inclusion of an additional class category like &#x2018;ELEMENT&#x2019; has potential to enhance the model&#x2019;s discernment of such nuances. However, the fundamental resolution involves not just technological enhancements but also rigorous expert validation to ensure the accuracy and consistency of labeling, critical in the domain of geosciences where the exactitude of each term holds substantial implications. Further, the misclassification of terms like &#x2018;TECTONIC_SETTING&#x2019;, which might be incorrectly annotated due to overlapping or ambiguous context, can severely distort the geological interpretations essential for addressing specific research inquiries, such as understanding relevant tectonic processes. This limitation is crucial because accurate classification directly influences the integrity and utility of data used in determining geological dynamics, which are foundational to mineral exploration and geological mapping strategies. <xref ref-type="bibr" rid="B32">Orellana et al. (2020)</xref> and <xref ref-type="bibr" rid="B16">Hu et al. (2024)</xref> emphasize the importance of enhancing NER systems&#x2019; contextual comprehension to mitigate AI-induced misinformation and improve the reliability of information extraction processes. 
They advocate for the adoption of advanced NLP strategies to deepen the contextual understanding of NER models, which would enhance their precision and recall. This is crucial as these metrics are essential for validating the effectiveness of entity recognition and classification within complex, domain-specific datasets.</p>
</sec>
<sec id="s5-2-3">
<title>5.2.3 Rare classes</title>
<p>Rare classes in the SMS dataset, such as PROCESS, METHOD, and IMPACT, demonstrated lower F1 scores, highlighting the challenges of limited representation in training data. The SMS dataset, with only 91 sentences and 832 entities, provided insufficient data for these classes, resulting in reduced generalization capability. For comparison, the OzRock dataset, which contained 83,838 sentences and 3,278 entities, offered broader coverage and higher F1 scores for general classes like ROCK and MINERAL (<xref ref-type="fig" rid="F3">Figures 3</xref>, <xref ref-type="fig" rid="F4">4</xref>). However, even within OzRock, nuanced or less frequent classes were more prone to misclassification. The disparity in performance between these datasets highlights how insufficiently diverse or narrowly scoped training data can lead to suboptimal model performance, particularly for complex or infrequent classes. Integrating broader datasets and continuous expert feedback into the training process can help address these shortcomings by enhancing the diversity and representativeness of the training data, thus reducing the incidence of AI-induced errors and improving the overall reliability of the model.</p>
</sec>
</sec>
<sec id="s5-3">
<title>5.3 Future research</title>
<p>The limitations of automated recognition with the SMS schema highlight the need for diverse, representative training datasets, refined schemas to capture domain-specific nuances, and model adaptations to address the complexities of geological terminology and context, enhancing entity extraction accuracy. Increasing the number of annotated examples for rare classes, expanding the diversity of training data, and enhancing model adaptability to context would improve classification accuracy and the utility of NER models in geoscientific research.</p>
<p>Using advanced techniques such as few-shot learning (<xref ref-type="bibr" rid="B15">Hofer et al., 2018</xref>) can improve an NER model&#x2019;s ability to recognize less frequent or underrepresented classes. Few-shot learning is an ML technique that allows NER models to generalize from a limited number of labelled examples, a situation that is common in specialized domains like geoscience. <xref ref-type="bibr" rid="B23">Liu et al. (2022)</xref> have been pioneers in exploring few-shot learning in geosciences. They used GeoBERT and a few-shot learning approach to recognize long geological terms using a minimal amount of annotated data. They fine-tuned a pre-trained model using a geological domain thesaurus, achieving an F1 score of 0.80.</p>
<p>Additionally, a continuous feedback loop, where domain experts validate and refine the model&#x2019;s outputs, can help improve its accuracy and reliability over time. This aligns with our previous findings (<xref ref-type="bibr" rid="B44">Villacorta et al., 2024</xref>), which highlighted the importance of schema and training data diversity in enhancing NER model performance in geosciences. Specifically, it was noticed that GeoIElite achieved a modest F1 score compared to OzRock, which performed better due to its broader linguistic diversity and the inclusion of hundreds of documents. The narrower scope and fewer entity classes in GeoIElite contributed to its lower scores. The analysis suggested that limited annotated data scope, as seen with GeoIElite, hinders robustness. Expanding the diversity and context of annotated data can improve contextual recognition. Additionally, Flair&#x2019;s performance tends to decline with an increasing number of entity classes, while F1 scores indicate that corpus size (7 PDFs vs. 20 PDFs) has a limited impact on overall NER accuracy. Misclassifications, particularly between geological entities such as &#x2018;ORE_DEPOSIT&#x2019; and &#x2018;MINERAL,&#x2019; emphasize the need for schema refinement. This can be addressed by ensuring that classes are well-defined and distinct, or in some cases, merging similar classes to reduce ambiguity and improve classification accuracy. Models with a generalized LOCATION class (GeoIElite_rev, OzRock) show different F1 scores, suggesting that class generalization may impact model accuracy. Expert validation, which is reflected in the annotated dataset, is crucial for creating schemas due to the challenges of integrating automated tools. Future research will focus on expanding and diversifying the annotated datasets to cover additional geological subdomains and terminology.</p>
<p>Large language models (LLMs) like GPT-4, BERT, and others have shown significant potential in processing and analyzing geoscientific texts (<xref ref-type="bibr" rid="B42">Touvron et al., 2023</xref>). These models, trained on vast amounts of diverse data, can capture complex language patterns and contextual nuances, making them well-suited for handling the specialized terminology and varied contexts found in geoscientific literature. LLMs offer opportunities for processing geoscientific texts; some examples are GeoBERT (<xref ref-type="bibr" rid="B23">Liu et al., 2022</xref>), which promises improved performance on NER tasks in geology; GeoGalactica (<xref ref-type="bibr" rid="B21">Lin et al., 2023</xref>), which was fine-tuned using geoscience-specific data to improve knowledge extraction, document classification, and question answering; and the use of LLMs in analyzing climate-related questions (<xref ref-type="bibr" rid="B8">Bulian et al., 2023</xref>), enhancing understanding of environmental changes. However, their effectiveness varies significantly based on the specificity of the training data and the domain-specific challenges they are tailored to address. To fully harness their potential in geosciences, ongoing efforts are needed to increase the diversity and representativeness of training datasets, refine domain-specific schemas and ontologies, and develop ML techniques to enhance model performance with minimal data.</p>
<p>Moreover, collaboration with domain experts is essential to validate and improve model outputs, ensuring that LLMs can provide accurate, reliable insights in geoscientific research. A key limitation of LLMs is that they may struggle with rare or underrepresented geological terms or concepts if the training data lacks diversity. This limitation can lead to incomplete or inaccurate entity recognition. While LLMs are powerful, they may still face challenges in generalizing across different geological contexts, particularly when encountering less common terms, unique geological formations, or acronyms in geoscience contexts. As discussed above, this can result in a model that recognizes only a subset of the relevant geological classes, as observed in this research. Furthermore, applying LLMs in geosciences requires significant computational resources, particularly when fine-tuning models on domain-specific data, which can be considered a limiting factor for smaller research teams or projects with constrained budgets.</p>
<p>While the initial phase of the project focused on the steps for implementing schemas, future developments of the SMS will integrate relational attributes, such as temporal relationships and spatial dependencies. Capturing more complex relationships among geological entities will enhance contextual recognition in this specialized domain. We also plan to investigate how the aforementioned LLMs can be fine-tuned and integrated with our developed schemas to improve entity recognition accuracy and reduce errors related to context misinterpretation and ambiguous terminologies.</p>
</sec>
</sec>
<sec sec-type="conclusion" id="s6">
<title>6 Conclusion</title>
<p>This research highlights the potential and current limitations of automated annotation tools using open-access NER models, tailored for geoscience literature. The introduction of the Schema for Mineral Systems (SMS) has provided insights into the classification and recognition of geological entities, particularly emphasizing the schema&#x2019;s capability to detail the nuanced aspects of complex mineral systems.</p>
<p>Our findings demonstrate that while schemas such as OzRock and GeoIElite_rev establish essential frameworks for geological entity recognition, they occasionally fall short in capturing the more detailed and subtle geological features that SMS excels in identifying. However, our results also highlight a critical challenge: the detailed and comprehensive nature of SMS, while beneficial, can sometimes introduce complexities that hinder the effectiveness of NER systems. This intricacy necessitates significant fine-tuning and expert validation to achieve reliable performance.</p>
<p>The analysis of confusion matrices and performance evaluations from the datasets reveals a stark contrast in the effectiveness of different schemas. OzRock and GeoIElite_rev showed robust performance in general geological categorization, whereas SMS, despite its detailed approach, showed variability in its effectiveness, particularly struggling with classes that require deep contextual understanding or are less represented in the training data.</p>
<p>From this study, it is evident that achieving optimal NER performance requires a balance between schema detail and simplicity. Future research should thus focus on refining schema definitions to ensure they capture essential geological nuances without overwhelming the NER systems. Incorporating diverse and high-quality training data, along with leveraging advanced machine learning strategies such as few-shot learning and domain-specific language models, will be crucial in enhancing the precision and utility of NER systems for geoscientific applications.</p>
<p>Continued collaboration with domain experts is imperative to ensure the relevance and accuracy of schema classifications. Such partnerships are vital for aligning the schemas with evolving geological concepts and maintaining the high standards necessary for automated knowledge extraction from geoscientific literature.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s7">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/<xref ref-type="sec" rid="s13">Supplementary Material</xref>, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>SV: Conceptualization, Data curation, Formal Analysis, Investigation, Methodology, Project administration, Visualization, Writing &#x2013; original draft, Writing &#x2013; review and editing, Validation. ML: Methodology, Resources, Supervision, Validation, Writing &#x2013; review and editing, Conceptualization, Project administration. JK: Conceptualization, Supervision, Writing &#x2013; review and editing. KG: Writing &#x2013; review and editing, Methodology, Validation. EG: Writing &#x2013; review and editing, Validation. HM: Writing &#x2013; review and editing, Validation.</p>
</sec>
<sec sec-type="funding-information" id="s9">
<title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This research was funded by the CSIRO ResearchPlus Science Leader program.</p>
</sec>
<ack>
<p>The authors express their gratitude to Mario Iglesias and Marta So&#x15b;nicka for their assistance in annotating the SMS dataset. We appreciate Ryan Noble&#x2019;s insightful feedback on entity class classification and Behnam Sadeghi&#x2019;s contributions to the manuscript&#x2019;s approach. Thanks are also due to Andy Wilkins and Tadro Abbot for their thorough review within the CSIRO peer review system. We are grateful to the Executive Director of the Geological Survey of Western Australia for granting K Gessner and E Gray the permission to participate in this study, which was pivotal for our research. Marta So&#x15b;nicka deserves additional acknowledgment for her comprehensive review and input on the practical applications of our findings in mineral exploration research. A special thanks to the journal reviewers, Dr. Antony Mamuse and Dr. Feng Han, for their constructive feedback and thoughtful suggestions, which greatly contributed to improving the quality of this article.</p>
</ack>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declare that Generative AI was used in the creation of this manuscript, to assist in editing the manuscript and enhancing its readability. AI is not credited as an author of the manuscript; it was solely utilized for summarizing text and reducing redundancy. All content edited with the help of Generative AI has been verified for factual accuracy and checked for plagiarism.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s13">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/feart.2025.1530004/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/feart.2025.1530004/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet2.pdf" id="SM1" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
<supplementary-material xlink:href="DataSheet1.pdf" id="SM2" mimetype="application/pdf" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Akbik</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Bergmann</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Blythe</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Rasul</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Schweter</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Vollgraf</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>FLAIR: an easy-to-use framework for state-of-the-art NLP</article-title>,&#x201d; in <conf-name>Proceedings of the 2019 conference of the North American chapter of the association for computational linguistics (demonstrations)</conf-name>, <fpage>54</fpage>&#x2013;<lpage>59</lpage>.</citation>
</ref>
<ref id="B2">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Angeli</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Premkumar</surname>
<given-names>M. J. J.</given-names>
</name>
<name>
<surname>Manning</surname>
<given-names>C. D.</given-names>
</name>
</person-group> (<year>2015</year>). &#x201c;<article-title>Leveraging linguistic structure for open domain information extraction</article-title>,&#x201d; in <conf-name>Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)</conf-name>, <fpage>344</fpage>&#x2013;<lpage>354</lpage>.</citation>
</ref>
<ref id="B54">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Angerer</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Duuring</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Hagemann</surname>
<given-names>S. G.</given-names>
</name>
<name>
<surname>Thorne</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>McCuaig</surname>
<given-names>T. C.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>A mineral system approach to iron ore in Archaean and Palaeoproterozoic BIF of Western Australia</article-title>. <source>Geological Society, London, Special Publications</source> <volume>393</volume> (<issue>1</issue>), <fpage>81</fpage>&#x2013;<lpage>115</lpage>.</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Babaie</surname>
<given-names>H. A.</given-names>
</name>
<name>
<surname>Davarpanah</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Elliott</surname>
<given-names>W. C.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Ontology of the complex rare-earth elements mineral system</article-title>. <source>Special Pap. Geol. Soc. Am.</source> <volume>558</volume>, <fpage>29</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1130/2022.2558(03)</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bhattacharya</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2023</year>). <source>Custom named construct recognition in the business and management literature</source>. <publisher-loc>Ottawa, ON, Canada</publisher-loc>: <publisher-name>Carleton University</publisher-name>. <comment>Doctoral dissertation</comment>.</citation>
</ref>
<ref id="B49">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Biber</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Conrad</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Reppen</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>1998</year>). <source>Corpus Linguistics: Investigating Language Structure and Use</source>. <publisher-name>Cambridge University Press</publisher-name>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://academic.oup.com/dsh/article-abstract/14/2/305/936240">https://academic.oup.com/dsh/article-abstract/14/2/305/936240</ext-link>
</comment> (<comment>Accessed February 2, 2025</comment>).</citation>
</ref>
<ref id="B5">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Bikaun</surname>
<given-names>T. K.</given-names>
</name>
<name>
<surname>French</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Stewart</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Hodkiewicz</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>MaintIE: a fine-grained annotation schema and benchmark for information extraction from maintenance short texts</article-title>,&#x201d; in <conf-name>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)</conf-name>, <fpage>10939</fpage>&#x2013;<lpage>10951</lpage>.</citation>
</ref>
<ref id="B6">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Bikaun</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Stewart</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Quickgraph: a rapid annotation tool for knowledge graph extraction from technical text</article-title>,&#x201d; in <source>Proceedings of the 60th annual meeting of the association for computational linguistics: system demonstrations</source>, <fpage>270</fpage>&#x2013;<lpage>278</lpage>.</citation>
</ref>
<ref id="B50">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brodaric</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Richard</surname>
<given-names>S. M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>The geoscience ontology. Abstract retrieved from AGU Fall Meeting Abstracts 2020 (IN030 07)</article-title>.</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Brown</surname>
<given-names>T. B.</given-names>
</name>
<name>
<surname>Mann</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ryder</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Subbiah</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kaplan</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Dhariwal</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Language models are few-shot learners</article-title>. <source>arXiv Prepr. arXiv:2005.14165</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Bulian</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sch&#xe4;fer</surname>
<given-names>M. S.</given-names>
</name>
<name>
<surname>Amini</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lam</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Ciaramita</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Gaiarin</surname>
<given-names>B.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Assessing large language models on climate information</article-title>. <source>arXiv Prepr. arXiv:2310.02932</source>.</citation>
</ref>
<ref id="B52">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cox</surname>
<given-names>S. J.</given-names>
</name>
<name>
<surname>Richard</surname>
<given-names>S. M.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>A geologic timescale ontology and service</article-title>. <source>Earth Science Informatics</source> <volume>8</volume>, <fpage>5</fpage>&#x2013;<lpage>19</lpage>.</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Devlin</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Chang</surname>
<given-names>M.-W.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Toutanova</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Bert: pre-training of deep bidirectional transformers for language understanding</article-title>. <source>arXiv Prepr. arXiv:1810.04805</source>.</citation>
</ref>
<ref id="B55">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ding</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Han</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>P.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Few-nerd: a few-shot named entity recognition dataset</article-title>. <source>arXiv</source> [Preprint]. <source>arXiv:2105.07464</source>.</citation>
</ref>
<ref id="B10">
<citation citation-type="thesis">
<person-group person-group-type="author">
<name>
<surname>Enkhsaikhan</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2021</year>). <source>Geological knowledge graph construction from mineral exploration text</source>. <comment>Doctoral thesis</comment> (<publisher-loc>UWA</publisher-loc>: <publisher-name>University of Western Australia</publisher-name>).</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Garcia</surname>
<given-names>L. F.</given-names>
</name>
<name>
<surname>Abel</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Perrin</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>dos Santos Alvarenga</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>The GeoCore ontology: a core ontology for general use in Geology</article-title>. <source>Comput. and Geosciences</source> <volume>135</volume>, <fpage>104387</fpage>. <pub-id pub-id-type="doi">10.1016/j.cageo.2019.104387</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Greim</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Solomon</surname>
<given-names>A. A.</given-names>
</name>
<name>
<surname>Breyer</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Assessment of lithium criticality in the global energy transition and addressing policy gaps in transportation</article-title>. <source>Nat. Commun.</source> <volume>11</volume> (<issue>1</issue>), <fpage>4570</fpage>. <pub-id pub-id-type="doi">10.1038/s41467-020-18402-y</pub-id>
</citation>
</ref>
<ref id="B48">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Grishman</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Sundheim</surname>
<given-names>B. M.</given-names>
</name>
</person-group> (<year>1996</year>). &#x201c;<article-title>Message understanding conference-6: A brief history</article-title>,&#x201d; in <source>COLING 1996 volume 1: The 16th international conference on computational linguistics</source>.</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Guo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>GeoKnowledgeFusion: a platform for multimodal data compilation from geoscience literature</article-title>. <source>Remote Sens.</source> <volume>16</volume> (<issue>9</issue>), <fpage>1484</fpage>. <pub-id pub-id-type="doi">10.3390/rs16091484</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hofer</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Kormilitzin</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Goldberg</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Nevado-Holgado</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Few-shot learning for named entity recognition in medical text</article-title>. <source>arXiv Prepr. arXiv:1811.05468</source>.</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Deep learning for named entity recognition: a survey</article-title>. <source>Neural Comput. Appl.</source> <volume>36</volume> (<issue>16</issue>), <fpage>8995</fpage>&#x2013;<lpage>9022</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-024-09646-6</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huber</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Klump</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2015</year>). <article-title>Agenames: a stratigraphic information harvester and text parser</article-title>. <source>Earth Sci. Inf.</source> <volume>8</volume>, <fpage>125</fpage>&#x2013;<lpage>134</lpage>. <pub-id pub-id-type="doi">10.1007/s12145-014-0171-5</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Mitigating out-of-entity errors in named entity recognition: a sentence-level strategy</article-title>. <source>arXiv Prepr. arXiv:2412.08434</source>.</citation>
</ref>
<ref id="B19">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Klie</surname>
<given-names>J. C.</given-names>
</name>
<name>
<surname>Bugert</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Boullosa</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>De Castilho</surname>
<given-names>R. E.</given-names>
</name>
<name>
<surname>Gurevych</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>The INCEpTION platform: machine-assisted and knowledge-oriented interactive annotation</article-title>,&#x201d; in <conf-name>Proceedings of the 27th international conference on computational linguistics: System demonstrations</conf-name> (<publisher-loc>Santa Fe, NM</publisher-loc>: <publisher-name>Association for Computational Linguistics</publisher-name>), <fpage>5</fpage>&#x2013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B20">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lamparter</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Ehrig</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Tempich</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2004</year>). &#x201c;<article-title>Knowledge extraction from classification schemas</article-title>,&#x201d; in <conf-name>On the Move to Meaningful Internet Systems 2004: CoopIS, DOA, and ODBASE: OTM Confederated International Conferences, CoopIS, DOA, and ODBASE 2004</conf-name>, <conf-loc>Agia Napa, Cyprus</conf-loc>, <conf-date>October 25&#x2013;29, 2004</conf-date> (<publisher-name>Springer Berlin Heidelberg</publisher-name>), <fpage>618</fpage>&#x2013;<lpage>636</lpage>.</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Deng</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>Y.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>GeoGalactica: a scientific large language model in geoscience</article-title>. <source>arXiv Prepr. arXiv:2401.00434</source>.</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lindsay</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Villacorta</surname>
<given-names>S. P.</given-names>
</name>
<name>
<surname>McFarlane</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Gessner</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Gray</surname>
<given-names>E.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Geo-semantics and ontologies: an approach to decode gold mineral systems using controlled vocabularies</article-title>. <source>Zenodo</source>. <pub-id pub-id-type="doi">10.5281/zenodo.15151900</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Qiu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Few-shot learning for name entity recognition in geological text based on GeoBERT</article-title>. <source>Earth Sci. Inf.</source> <volume>15</volume> (<issue>2</issue>), <fpage>979</fpage>&#x2013;<lpage>991</lpage>. <pub-id pub-id-type="doi">10.1007/s12145-022-00775-x</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lombardo</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Piana</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Mimmo</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>Semantics&#x2013;informed geological maps: conceptual modeling and knowledge encoding</article-title>. <source>Comput. Geosci.</source> <volume>116</volume>, <fpage>12</fpage>&#x2013;<lpage>22</lpage>. <pub-id pub-id-type="doi">10.1016/j.cageo.2018.04.001</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Loper</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Bird</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2002</year>). <article-title>NLTK: the natural language toolkit</article-title>. <source>arXiv Prepr. cs/0205028</source>.</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Knowledge graph construction and application in geosciences: a review</article-title>. <source>Comput. Geosci.</source> <volume>161</volume>, <fpage>105082</fpage>. <pub-id pub-id-type="doi">10.1016/j.cageo.2022.105082</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mantovani</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Piana</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Lombardo</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Ontology-driven representation of knowledge for geological maps</article-title>. <source>Comput. Geosci.</source> <volume>139</volume>, <fpage>104446</fpage>. <pub-id pub-id-type="doi">10.1016/j.cageo.2020.104446</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Mathis</surname>
<given-names>B.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Extracting proceedings data from court cases with machine learning</article-title>. <source>Stats</source> <volume>5</volume> (<issue>4</issue>), <fpage>1305</fpage>&#x2013;<lpage>1320</lpage>. <pub-id pub-id-type="doi">10.3390/stats5040079</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>McCrae</surname>
<given-names>J. P.</given-names>
</name>
<name>
<surname>Bosque-Gil</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Gracia</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Buitelaar</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Cimiano</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2017</year>). &#x201c;<article-title>The Ontolex-Lemon model: development and applications</article-title>,&#x201d; in <conf-name>Proceedings of eLex 2017 conference</conf-name>, <fpage>19</fpage>&#x2013;<lpage>21</lpage>.</citation>
</ref>
<ref id="B30">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>McKinney</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2010</year>). &#x201c;<article-title>Data structures for statistical computing in Python</article-title>,&#x201d; in <source>Proceedings of the 9th Python in science conference</source>. Editors <person-group person-group-type="editor">
<name>
<surname>van der Walt</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Millman</surname>
<given-names>J.</given-names>
</name>
</person-group> (<publisher-loc>Austin, TX</publisher-loc>: <publisher-name>SciPy</publisher-name>), <fpage>51</fpage>&#x2013;<lpage>56</lpage>. <pub-id pub-id-type="doi">10.25080/Majora-92bf1922-00a</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Orellana</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>F&#xe1;rez</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>C&#xe1;rdenas</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Evaluating Named Entities Recognition (NER) tools vs algorithms adapted to the extraction of locations</article-title>,&#x201d; in <conf-name>2020 International Conference of Digital Transformation and Innovation Technology (Incodtrin)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>123</fpage>&#x2013;<lpage>128</lpage>.</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Patel</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>BioNerFlair: biomedical named entity recognition using flair embedding and sequence tagger</article-title>. <source>arXiv Prepr. arXiv:2011.01504</source>.</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Pedregosa</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Varoquaux</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Gramfort</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Michel</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Thirion</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Grisel</surname>
<given-names>O.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Scikit-learn: machine learning in Python</article-title>. <source>J. Mach. Learn. Res.</source> <volume>12</volume>, <fpage>2825</fpage>&#x2013;<lpage>2830</lpage>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://www.jmlr.org/papers/v12/pedregosa11a.html">https://www.jmlr.org/papers/v12/pedregosa11a.html</ext-link>.</comment>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Perring</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Crowe</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Hronsky</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>A new fluid-flow model for the genesis of banded iron formation-hosted martite-goethite mineralization, with special reference to the north and south flank deposits of the Hamersley Province, Western Australia</article-title>. <source>Econ. Geol.</source> <volume>115</volume> (<issue>3</issue>), <fpage>627</fpage>&#x2013;<lpage>659</lpage>. <pub-id pub-id-type="doi">10.5382/econgeo.4734</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qiu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Tian</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Tan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Q.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Extracting named entity using entity labeling in geological text using deep learning approach</article-title>. <source>J. Earth Sci.</source> <volume>34</volume> (<issue>5</issue>), <fpage>1406</fpage>&#x2013;<lpage>1417</lpage>. <pub-id pub-id-type="doi">10.1007/s12583-022-1789-8</pub-id>
</citation>
</ref>
<ref id="B37">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Qiu</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Tao</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>GNER: a generative model for geological named entity recognition without labeled data using deep learning</article-title>. <source>Earth Space Sci.</source> <volume>6</volume> (<issue>6</issue>), <fpage>931</fpage>&#x2013;<lpage>946</lpage>. <pub-id pub-id-type="doi">10.1029/2019ea000610</pub-id>
</citation>
</ref>
<ref id="B39">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Shinyama</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Guglielmetti</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title><italic>pdfminer.six</italic> (Version 20240706)</article-title>. <source>GitHub</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://pypi.org/project/pdfminer.six/">https://pypi.org/project/pdfminer.six/</ext-link> (Accessed September 4, 2024)</comment>.</citation>
</ref>
<ref id="B40">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Singer-Vine</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Pdfplumber (Version 0.11.0)</article-title>. <source>GitHub</source>. <comment>Available online at: <ext-link ext-link-type="uri" xlink:href="https://pypi.org/project/pdfplumber/">https://pypi.org/project/pdfplumber/</ext-link> (Accessed September 4, 2024)</comment>.</citation>
</ref>
<ref id="B41">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tjong Kim Sang</surname>
<given-names>E. F.</given-names>
</name>
<name>
<surname>De Meulder</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2003</year>). &#x201c;<article-title>Introduction to the CoNLL-2003 shared task: language-independent named entity recognition</article-title>,&#x201d; in <conf-name>Proceedings of the Seventh Conference on Natural Language Learning at HLT-NAACL 2003</conf-name>, <fpage>142</fpage>&#x2013;<lpage>147</lpage>.</citation>
</ref>
<ref id="B42">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Touvron</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Lavril</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Izacard</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Martinet</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Lachaux</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Lacroix</surname>
<given-names>T.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>LLaMA: open and efficient foundation language models</article-title>. <source>arXiv Prepr. arXiv:2302.13971</source>.</citation>
</ref>
<ref id="B44">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Villacorta</surname>
<given-names>S. P.</given-names>
</name>
<name>
<surname>Lindsay</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Klump</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Francis</surname>
<given-names>N.</given-names>
</name>
</person-group> (<year>2024</year>). &#x201c;<article-title>Assessing named entity recognition efficacy using diverse geoscience datasets</article-title>,&#x201d; in <conf-name>2024 International Conference on Machine Intelligence for GeoAnalytics and Remote Sensing (MIGARS)</conf-name> (<publisher-name>IEEE</publisher-name>), <fpage>1</fpage>&#x2013;<lpage>3</lpage>.</citation>
</ref>
<ref id="B45">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Villacorta</surname>
<given-names>S. P.</given-names>
</name>
<name>
<surname>Lindsay</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Exploring the importance of preprocessing operations in geoscience knowledge graphs through the application of a machine learning approach</article-title>,&#x201d; in <conf-name>Proceedings of the 26th World Mining Congress</conf-name>, <conf-loc>Brisbane, Australia</conf-loc>, <fpage>177</fpage>&#x2013;<lpage>188</lpage>.</citation>
</ref>
<ref id="B51">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Named entity annotation schema for geological literature mining in the domain of porphyry copper deposits</article-title>. Abstract retrieved from <source>AGU Fall Meeting Abstracts</source> (<comment>IN12C-0276</comment>).</citation>
</ref>
<ref id="B53">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Woodcock</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Paget</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Held</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Accelerating industry innovation using the Open Data Cube in Australia</article-title>,&#x201d; in <source>IGARSS 2018-2018 IEEE International Geoscience and Remote Sensing Symposium</source>, <fpage>8636</fpage>&#x2013;<lpage>8638</lpage>.</citation>
</ref>
<ref id="B47">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Hou</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zheng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Shen</surname>
<given-names>S.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Geoscience knowledge graph in the big data era</article-title>. <source>Sci. China Earth Sci.</source> <volume>64</volume> (<issue>7</issue>), <fpage>1105</fpage>&#x2013;<lpage>1114</lpage>. <pub-id pub-id-type="doi">10.1007/s11430-020-9750-4</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>