<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" dtd-version="1.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Energy Res.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Energy Research</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Energy Res.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">2296-598X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1739244</article-id>
<article-id pub-id-type="doi">10.3389/fenrg.2026.1739244</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Risk identification method for power operation and inspection based on multi-element interaction relationship analysis</article-title>
<alt-title alt-title-type="left-running-head">Zou et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fenrg.2026.1739244">10.3389/fenrg.2026.1739244</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Zou</surname>
<given-names>Hongbo</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Xiaping</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/3265645"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing - original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Ma</surname>
<given-names>Fuqi</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1433333"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Qian</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal Analysis</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yuan</surname>
<given-names>Aotian</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Feng</surname>
<given-names>Ziyi</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &#x26; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/">Writing - review and editing</role>
</contrib>
</contrib-group>
<aff id="aff1">
<label>1</label>
<institution>College of Electrical Engineering and New Energy, China Three Gorges University</institution>, <city>Yichang</city>, <country country="CN">China</country>
</aff>
<aff id="aff2">
<label>2</label>
<institution>College of Electrical Engineering, Xi&#x2019;an University of Technology</institution>, <city>Xi&#x2019;an</city>, <state>Shaanxi</state>, <country country="CN">China</country>
</aff>
<aff id="aff3">
<label>3</label>
<institution>School of Electrical Engineering and Automation, Wuhan University</institution>, <city>Wuhan</city>, <state>Hubei</state>, <country country="CN">China</country>
</aff>
<aff id="aff4">
<label>4</label>
<institution>School of Energy and Electrical Engineering, Qinghai University</institution>, <city>Xining</city>, <state>Qinghai</state>, <country country="CN">China</country>
</aff>
<author-notes>
<corresp id="c001">
<label>&#x2a;</label>Correspondence: Fuqi Ma, <email xlink:href="mailto:whumfq@whu.edu.cn">whumfq@whu.edu.cn</email>
</corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-03-02">
<day>02</day>
<month>03</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>14</volume>
<elocation-id>1739244</elocation-id>
<history>
<date date-type="received">
<day>04</day>
<month>11</month>
<year>2025</year>
</date>
<date date-type="rev-recd">
<day>31</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="accepted">
<day>19</day>
<month>02</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Zou, Zhao, Ma, Liu, Yuan and Feng.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Zou, Zhao, Ma, Liu, Yuan and Feng</copyright-holder>
<license>
<ali:license_ref start_date="2026-03-02">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<sec>
<title>Introduction</title>
<p>The interaction between power operation personnel and tools is a critical factor in ensuring maintenance safety and operational standardization. However, owing to the diversity and complexity of human-object interactions in both spatial structure and action semantics, existing methods still face challenges in local feature extraction and in the fine-grained identification of semantically similar behaviors.</p>
</sec>
<sec>
<title>Methods</title>
<p>To address these challenges, we propose an interactive risk recognition method for power operations based on semantic prompts and a locally-aware enhanced Transformer, aiming to improve the analysis of key interaction behaviors, such as climbing ladders or electricity checking, in operational scenarios. The method first introduces a multimodal semantic prompt module, in which visual and linguistic prompts jointly guide the model and effectively enhance its semantic understanding of complex interaction behaviors. It then integrates a locally-aware enhanced Transformer module to strengthen the feature expression capabilities of the visual and linguistic networks, thereby improving the quality of the multimodal features.</p>
</sec>
<sec>
<title>Results</title>
<p>Experiments conducted on a self-constructed distribution network operation interaction dataset demonstrate that the proposed method achieves high recognition accuracy in key interaction detection tasks and can effectively identify potential safety risks, providing timely alerts. </p>
</sec>
<sec>
<title>Discussion</title>
<p>These results highlight its broad applicability in intelligent operation supervision and risk management.</p>
</sec>
</abstract>
<kwd-group>
<kwd>human-object interaction detection</kwd>
<kwd>interaction risk recognition</kwd>
<kwd>multimodal fusion</kwd>
<kwd>power operation and inspection</kwd>
<kwd>semantic prompts</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declared that financial support was received for this work and/or its publication. This research was funded by the National Natural Science Foundation of China, grant number 52407143.</funding-statement>
</funding-group>
<counts>
<fig-count count="6"/>
<table-count count="4"/>
<equation-count count="14"/>
<ref-count count="31"/>
<page-count count="00"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Smart Grids</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<label>1</label>
<title>Introduction</title>
<p>As the terminal link of the power system, the distribution network relies heavily on manual operation and maintenance, which frequently involves typical high-risk scenarios such as climbing at height and electricity checking. Hidden risks in the operational process are closely tied to the subjective behavior of personnel, and such proactive actions are often highly dynamic and transient. Once triggered, they can quickly escalate into irreversible electrical accidents (<xref ref-type="bibr" rid="B16">Liu et al., 2024</xref>; <xref ref-type="bibr" rid="B3">Energy Foundation China, 2022</xref>). It is therefore important to exploit the key elements of power production imagery, such as personnel, electrical equipment, and protective gear, together with their interaction relationships, to build an efficient and intelligent behavior recognition and quality management system. Such a system helps reduce the false alarm rate of safety risks and promotes their fine-grained prevention and control (<xref ref-type="bibr" rid="B29">Zhao et al., 2021</xref>).</p>
<p>Currently, risk management and control of distribution network operations still rely primarily on manual supervision and video monitoring. Manual supervision is labor-intensive and inefficient, while mobile-terminal monitoring can capture operational images but still depends on manual review, lacking real-time capability and proactivity (<xref ref-type="bibr" rid="B20">Peng et al., 2021</xref>). To address this, researchers have gradually introduced intelligent perception technologies such as computer vision, attempting to detect and recognize risky behaviors automatically through video monitoring and image analysis algorithms, thereby improving the efficiency and timeliness of on-site safety management (<xref ref-type="bibr" rid="B17">Ma et al., 2024</xref>; <xref ref-type="bibr" rid="B14">Li et al., 2025</xref>; <xref ref-type="bibr" rid="B1">Cao et al., 2024</xref>; <xref ref-type="bibr" rid="B18">Meng et al., 2025</xref>). For example, <xref ref-type="bibr" rid="B13">Li et al. (2023)</xref> proposed a decision algorithm that combines skeleton data with object detection results to identify interactive unsafe behaviors. <xref ref-type="bibr" rid="B24">Wang et al. (2021)</xref> introduced a skeleton-based action recognition method built on Graph Convolutional Networks (GCN), which extracts sequences of workers&#x2019; key points to construct spatiotemporal graph models, enabling automatic recognition of high-risk behaviors such as unsafe climbing and failure to wear safety equipment. <xref ref-type="bibr" rid="B2">Cheng et al. (2020)</xref> proposed the Shift Graph Convolutional Network (Shift-GCN), which constructs a spatiotemporal skeleton graph and introduces shift operations to improve both recognition accuracy and computational performance for complex human behaviors.</p>
<p>To further capture the complex interactions and dependencies between operational behaviors and tools in RGB video frames, <xref ref-type="bibr" rid="B11">Kim et al. (2021)</xref> first proposed the Transformer-based HOTR (Human Object Transformer Relation) model, which achieves end-to-end detection of human-object interactions and simultaneously predicts target locations and interaction categories. <xref ref-type="bibr" rid="B26">Yang et al. (2024)</xref> designed the SGI-YOLOv9 model for intelligent detection of critical components in distribution network scenarios; its Scene Graph Interaction (SGI) component extends the original YOLOv9 framework by detecting the interaction relationships among operators, equipment, and tools in distribution network operation scenarios, enabling efficient recognition of operation-related interaction behaviors. <xref ref-type="bibr" rid="B4">Feng et al. (2024)</xref> proposed a behavior recognition method combining skeleton sequences with spatiotemporal convolutional networks; by integrating personnel action features with the interaction relationships implied by tools, it achieves accurate recognition of personnel performing live-line work. <xref ref-type="bibr" rid="B22">Tang et al. (2020)</xref> introduced a human-object interaction recognition method that automatically identifies potential safety hazards by analyzing the interactions between workers and related tools or equipment in construction site images.</p>
<p>Despite this encouraging progress, complex and dynamic distribution network operation scenarios still pose numerous challenges, such as similar postures, harsh lighting, and occlusion. In particular, similar actions in high-altitude operations, such as climbing and holding ladders, are easily confused during recognition, making it difficult to extract key semantic information and to judge the interaction between people and tools accurately. To address these issues, this paper takes representative operation and inspection scenarios as the application background, constructs a dedicated dataset covering both single-person and multi-person human-object interactions, and proposes a multi-element interaction analysis method based on semantic prompts and a locally-aware enhanced Transformer. Specifically, the locally-aware enhanced Transformer introduces a local feature modeling branch parallel to the Transformer architecture, which strengthens the joint representation of key local regions of operators and interaction regions of tools while preserving the capability of global dependency modeling. On this basis, the proposed method integrates image and textual information and further employs adaptive multimodal semantic prompts to guide the model&#x2019;s attention toward critical interaction regions, thereby enabling effective interpretation of key interaction behaviors between operators and tools in power distribution operations. The major contributions are summarized as follows:<list list-type="order">
<list-item>
<p>We propose a multi-element interaction analysis method based on semantic prompts and locally-aware enhanced Transformer, which combines semantic prompt vectors with a locally-aware enhanced Transformer module. This method enables accurate recognition of human-tool interactions and their semantic relations in operation and inspection tasks, while providing effective risk warning, thereby supporting intelligent supervision and risk prevention in field operations.</p>
</list-item>
<list-item>
<p>To address the complex action semantics and high similarity in operational scenarios, we propose a multimodal semantic prompt module. Through an adaptive learning strategy, it dynamically represents the synergistic relationship between language and visual prompts, effectively enhancing the model&#x2019;s semantic analysis and recognition capabilities for key operational behaviors such as climbing ladders and electricity checking.</p>
</list-item>
<list-item>
<p>To tackle the issue of fine-grained target extraction in interaction areas, we introduce a locally-aware enhanced Transformer (LAET) module. By emphasizing the joint representation of key local features of personnel and tool regions, the LAET module significantly improves the model&#x2019;s fine-grained recognition performance in person&#x2013;tool interactions.</p>
</list-item>
<list-item>
<p>To achieve deep fusion and consistent alignment of multimodal information, this paper adopts a contrastive learning optimization strategy to map visual features, human&#x2013;object interaction (HOI) features, and textual semantics into a shared space, thereby ensuring cross-modal consistency between human action semantics and tool interaction semantics.</p>
</list-item>
</list>
</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<sec id="s2-1">
<label>2.1</label>
<title>Application of visual-language pretraining in HOI</title>
<p>Human-object interaction behavior recognition aims to identify dynamic interaction relationships between humans and objects in images or videos, such as &#x201c;climbing ladders,&#x201d; &#x201c;holding ladders,&#x201d; or &#x201c;grasping objects&#x201d; (<xref ref-type="bibr" rid="B23">Tu et al., 2023</xref>). The challenge of this task lies not only in accurately identifying the interacting subjects and objects but also in understanding the semantic relationships between the actions. Existing methods fall mainly into two-stage and single-stage approaches: the former relies on intermediate object detection results, making the inference process complex and susceptible to errors (<xref ref-type="bibr" rid="B5">Gkioxari et al., 2018</xref>; <xref ref-type="bibr" rid="B28">Zhang et al., 2021</xref>; <xref ref-type="bibr" rid="B6">Gu et al., 2022</xref>); the latter uses end-to-end network architectures to jointly predict the human, object, and interaction types, simplifying the detection process and improving efficiency (<xref ref-type="bibr" rid="B30">Zhu et al., 2024</xref>; <xref ref-type="bibr" rid="B10">Kim et al., 2020</xref>). In recent years, <xref ref-type="bibr" rid="B31">Zou et al. (2021)</xref> proposed the HOI Transformer model, which utilizes a Transformer encoder-decoder structure to achieve efficient human-object interaction detection within an end-to-end framework. <xref ref-type="bibr" rid="B7">He et al. (2023)</xref> further proposed the SG2HOI&#x2b; framework, which constructs a dual-layer interactive Transformer network to jointly model scene graph generation (SGG) and human-object interaction detection, significantly improving the contextual understanding of interaction relationships. Building upon this, the rapid development of visual-language pretraining models has opened a new research direction for HOI detection. Multimodal pretraining methods, represented by CLIP (Contrastive Language-Image Pretraining), construct a cross-modal unified semantic space through contrastive learning and have been widely applied in tasks such as open-vocabulary recognition and cross-modal retrieval (<xref ref-type="bibr" rid="B21">Radford et al., 2021</xref>). HOI detection methods based on such models have begun to explore the use of language priors to enhance interaction semantic modeling. For instance, HOICLIP introduced a query-based knowledge retrieval mechanism, enabling efficient knowledge transfer from the pre-trained CLIP model to HOI detection tasks (<xref ref-type="bibr" rid="B19">Ning et al., 2023</xref>). <xref ref-type="bibr" rid="B15">Liao et al. (2022)</xref> proposed Gen-VLKT, which introduces a visual-linguistic knowledge transfer mechanism to effectively fuse visual features with semantic priors, thereby enhancing the model&#x2019;s semantic understanding of interaction behaviors.</p>
<p>Based on the aforementioned research, this paper proposes a one-stage method that integrates visual and linguistic knowledge. It employs the CLIP Transformer to extract global interaction semantics in the linguistic modality and designs an HOI Vision Transformer to strengthen local interaction analysis in the visual path, thereby significantly improving the model&#x2019;s performance in detecting human-object interactions in complex power scenarios.</p>
</sec>
<sec id="s2-2">
<label>2.2</label>
<title>Prompt learning</title>
<p>Prompt learning was originally applied in the field of Natural Language Processing (NLP), where its core idea is to construct specific contextual prompts, leveraging a small amount of labeled or unsupervised data combined with manually designed prompt templates or guiding sentences to enhance model performance without the need for large-scale annotations (<xref ref-type="bibr" rid="B9">Khattak et al., 2023</xref>). In human-object interaction recognition tasks, introducing prompt learning not only helps improve the model&#x2019;s understanding of action semantics but also strengthens the collaborative guidance relationship between the language and visual modalities. Especially in scenarios with imbalanced class distributions, scarce samples, or ambiguous interaction behaviors, guided prompt modeling can provide more discriminative feature support (<xref ref-type="bibr" rid="B27">Zang et al., 2022</xref>).</p>
<p>Multimodal models such as CLIP align image and text feature embeddings through contrastive learning during training, constructing a shared semantic space across modalities that gives the model good generalization ability. Among them, the CoOp (Context Optimization) method prepends learnable vectors to the category words, enabling CLIP to be applied to a variety of downstream tasks. The MaPLe (Multi-modal Prompt Learning) method combines learnable text prompts and visual prompts, leveraging the tight coupling between the image and text encoders to enhance semantic expression and cross-modal information sharing between the two modalities (<xref ref-type="bibr" rid="B25">Xue et al., 2024</xref>).</p>
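<p>As an illustrative sketch of the mechanism just described (not a component of this paper&#x2019;s method), CoOp-style context optimization can be written in a few lines of PyTorch; the names and shapes below are assumptions for exposition.</p>
<preformat>import torch
import torch.nn as nn

class CoOpStylePrompt(nn.Module):
    """CoOp-style context: n_ctx learnable vectors are prepended to
    each class-name token embedding before a frozen text encoder."""
    def __init__(self, n_ctx=4, dim=512):
        super().__init__()
        # Learnable context shared across all classes
        self.ctx = nn.Parameter(0.02 * torch.randn(n_ctx, dim))

    def forward(self, class_embeddings):
        # class_embeddings: (n_classes, n_tokens, dim) from frozen CLIP
        n_classes = class_embeddings.size(0)
        ctx = self.ctx.unsqueeze(0).expand(n_classes, -1, -1)
        # The sequence becomes [ctx_1 .. ctx_n, class tokens]
        return torch.cat([ctx, class_embeddings], dim=1)</preformat>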
<p>In contrast to the aforementioned methods, the approach presented in this paper introduces a prompt learning mechanism in both the language and visual modalities. It constructs a unified semantic prompt vector, and through the language prompt network and visual prompt network, maps this vector into respective prompt representations adapted to each modality, ensuring the modality specificity, interactivity, and adaptability of the prompt information.</p>
</sec>
</sec>
<sec sec-type="methods" id="s3">
<label>3</label>
<title>Methods</title>
<sec id="s3-1">
<label>3.1</label>
<title>Method overview</title>
<p>This section introduces the proposed multi-element interaction analysis method based on semantic prompts and a locally-aware enhanced Transformer. The overall framework consists of a multimodal semantic prompt (MSP) module, a contrastive learning module, and two locally-aware enhanced Transformer (LAET) modules. <xref ref-type="fig" rid="F1">Figure 1</xref> illustrates the overall structure of the model.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Overall structure of the model.</p>
</caption>
<graphic xlink:href="fenrg-14-1739244-g001.tif">
<alt-text content-type="machine-generated">Diagram of a multimodal semantic prompt model with two primary sections: the upper section shows a unified prompt flow between language and visual prompt learning networks; the lower section details language and visual models, each extracting global and local features via CLIP, GRU, HOI Vision Transformer, and ResNet-50, with features combined by locally-aware enhanced transformer modules and used for contrastive learning, illustrated with example HOI (human-object interaction) tasks and an image of electrical workers on ladders.</alt-text>
</graphic>
</fig>
<p>First, in the semantic prompt generation phase, a unified high-dimensional prompt vector is constructed to express richer semantic information. This vector is then mapped into language and visual prompts via the prompt learning network. The language prompt expresses specific operational interactions in textual form, while the visual prompt is embedded based on the behavior objects extracted from the image, guiding the subsequent modality feature extraction process. In the language modality, the input semantic prompts, along with human-object interaction text pairs (e.g., &#x201c;climb, ladder,&#x201d; &#x201c;check, test rod&#x201d;), are fed into the CLIP Transformer to extract global language features. Simultaneously, a gated recurrent unit (GRU) network is employed to extract local features from the semantic sequence. After fusion, a complete language modality representation is formed. In the visual modality, the visual prompt is combined with the input image, extracting both local region features (e.g., &#x201c;person, ladder, test rod&#x201d;) and global structural features of the entire image. Local features are extracted using a pre-trained ResNet-50 network, while global features are encoded by the HOI Vision Transformer. Finally, all modality features are input into the contrastive learning module, where the consistency between semantic information and the visual modality is optimized. This enables the model to accurately associate each set of interaction features with its corresponding HOI category labels (e.g., &#x201c;person climb ladder,&#x201d; &#x201c;person check electricity&#x201d;), thereby achieving precise recognition, semantic analysis, and risk assessment of human-tool interactions in complex operational scenarios.</p>
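<p>As an illustration of the alignment objective in the contrastive learning module, the sketch below assumes a symmetric InfoNCE loss over paired visual and language features, a common choice for such cross-modal alignment; the paper&#x2019;s exact loss may differ.</p>
<preformat>import torch
import torch.nn.functional as F

def contrastive_alignment_loss(f_v, f_l, temperature=0.07):
    """Symmetric InfoNCE over a batch of paired features: matching
    visual/language pairs lie on the diagonal of the similarity
    matrix and are pulled together; mismatched pairs are pushed apart."""
    f_v = F.normalize(f_v, dim=-1)            # (B, d) visual/HOI features
    f_l = F.normalize(f_l, dim=-1)            # (B, d) language features
    logits = f_v @ f_l.t() / temperature      # (B, B) cosine similarities
    targets = torch.arange(f_v.size(0), device=f_v.device)
    return 0.5 * (F.cross_entropy(logits, targets) +
                  F.cross_entropy(logits.t(), targets))</preformat>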
</sec>
<sec id="s3-2">
<label>3.2</label>
<title>Multimodal semantic prompt module</title>
<p>To address recognition challenges arising from complex action semantics and high similarity in operation and inspection tasks, this paper proposes an MSP inspired by the Unified Prompt Tuning method (<xref ref-type="bibr" rid="B8">Jia et al., 2021</xref>) to enhance multimodal collaborative understanding in complex human&#x2013;object interaction scenarios. The core idea of this module is to introduce a unified prompt vector to collaboratively guide information interaction between the language modality and the visual modality in a shared semantic space, as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Structure of the multimodal semantic prompt.</p>
</caption>
<graphic xlink:href="fenrg-14-1739244-g002.tif">
<alt-text content-type="machine-generated">Diagram illustrating a transformer-based neural network architecture, showing flow from input U through self-attention, add and layer normalization, feed-forward network, and further normalization steps, then splitting the output into distinct visual and language representations.</alt-text>
</graphic>
</fig>
<p>Although unified prompts help establish a shared semantic bridge between different modalities, due to significant differences in the information structure and expression between the language and visual modalities, directly sharing the prompt vector can lead to high generalization but insufficient specificity. This makes it difficult to simultaneously meet the task requirements of both modalities. To address this issue, this paper further proposes a prompt learning network, which introduces a modality adaptation mechanism. This allows the unified prompt to adaptively adjust according to the feature distribution and representation space of different modalities, enhancing the prompt&#x2019;s adaptability and discriminative power in multimodal tasks and enabling effective mapping from the unified semantic space to modality-specific feature spaces.</p>
<p>Specifically, we define a set of learnable vectors <inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:mi>U</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi mathvariant="normal">d</mml:mi>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> as the unified prompt, and assign prompt vectors <inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi>U</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi mathvariant="normal">n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">d</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> to the visual and language modalities, respectively, where <italic>n</italic> represents the length of the prompt vector and <italic>d</italic> represents the dimension of each prompt vector. During the initialization stage, we first predefine the categories of operators, tools, and actions involved in typical inspection and maintenance tasks, and construct corresponding human&#x2013;tool interaction prototypes (e.g., &#x201c;an operator climbing a ladder&#x201d; and &#x201c;an operator grasping a test rod&#x201d;). These natural language descriptions are then encoded using a frozen CLIP text encoder to obtain semantic embeddings, which are subsequently aggregated via mean pooling to form the initial unified prompt representation. This process injects explicit operational semantic priors into the model, thereby facilitating subsequent learning.</p>
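<p>The following Python sketch illustrates this initialization, assuming the OpenAI CLIP package and an assumed prompt length of <italic>n</italic> = 8; the prototype strings echo the examples above, and the sketch is an illustration rather than the exact implementation.</p>
<preformat>import torch
import clip  # OpenAI CLIP; any frozen text encoder would serve

device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("ViT-B/32", device=device)

# Human-tool interaction prototypes for typical inspection tasks
prototypes = [
    "an operator climbing a ladder",
    "an operator grasping a test rod",
]
tokens = clip.tokenize(prototypes).to(device)

with torch.no_grad():                  # the text encoder stays frozen
    emb = model.encode_text(tokens)    # (num_prototypes, d)

# Mean-pool the prototype embeddings, then tile to prompt length n
n = 8                                  # assumed prompt length
U = emb.float().mean(dim=0, keepdim=True).repeat(n, 1)  # (n, d)
U = torch.nn.Parameter(U)              # learnable thereafter</preformat>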
<p>After semantic initialization, to further adapt to heterogeneous feature distributions across modalities and enhance cross-modal interaction capability, we introduce a lightweight Transformer layer to perform structured transformations on the unified prompts. In this module, the unified prompt <italic>U</italic> undergoes self-attention, a feed-forward neural network, and layer normalization, resulting in a structurally enhanced prompt representation <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:mover accent="true">
<mml:mi>U</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>. The self-attention mechanism in the lightweight Transformer facilitates effective interaction between the language and visual modalities, enhancing the expressive power and complementarity of the prompt information across modalities. Furthermore, we represent the prompt learning networks for the language modality and visual modality as <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x3a0;</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">&#x3a0;</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, respectively, which are used to learn the prompt vector representations within each modality. The prompt representations <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>U</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>U</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> are defined in <xref ref-type="disp-formula" rid="e1">Equations 1</xref>, <xref ref-type="disp-formula" rid="e2">2</xref>, respectively.<disp-formula id="e1">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>U</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>V</mml:mi>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mo>&#x220f;</mml:mo>
<mml:mi>V</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>U</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>U</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>&#x2190;</mml:mo>
<mml:msub>
<mml:mo>&#x220f;</mml:mo>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
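<p>One plausible realization of the structured transform and the two prompt learning networks is sketched below; the composition and all dimensions are assumptions for exposition, not the exact implementation.</p>
<preformat>import torch.nn as nn

class UnifiedPromptAdapter(nn.Module):
    """Sketch of the lightweight Transformer layer of Figure 2
    (self-attention, feed-forward network, layer normalization)
    followed by the modality mappings Pi_V and Pi_L of
    Equations 1, 2, here taken as linear projections."""
    def __init__(self, d=512, d_v=768, d_l=512, n_heads=8):
        super().__init__()
        self.refine = nn.TransformerEncoderLayer(
            d_model=d, nhead=n_heads, dim_feedforward=4 * d,
            batch_first=True)
        self.pi_v = nn.Linear(d, d_v)   # Pi_V, Equation 1
        self.pi_l = nn.Linear(d, d_l)   # Pi_L, Equation 2

    def forward(self, U):               # U: (n, d) unified prompt
        U_hat = self.refine(U.unsqueeze(0)).squeeze(0)
        # Modality-specific prompts U_hat_V and U_hat_L
        return self.pi_v(U_hat), self.pi_l(U_hat)</preformat>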
<p>Next, we concatenate the multimodal prompts with the inputs to construct the final multimodal input. In the language modality, we first define the input for the i-th sample as <inline-formula id="inf9">
<mml:math id="m11">
<mml:mrow>
<mml:msub>
<mml:mi>T</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <italic>m</italic> represents the length of the Transformer input sequence, and <inline-formula id="inf10">
<mml:math id="m12">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> equals the combined length of action and object tokens. Accordingly, <inline-formula id="inf11">
<mml:math id="m13">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> zero vectors are introduced as padding. By introducing the language modality prompt, the input sequence <inline-formula id="inf12">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>n</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> is reconstructed, and the final transformed language input sequence can be represented as shown in <xref ref-type="disp-formula" rid="e3">Equation 3</xref>.<disp-formula id="e3">
<mml:math id="m15">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>T</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>n</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>t</mml:mi>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mn>0</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>
</p>
<p>Here, <inline-formula id="inf13">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>a</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>t</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the length of the action word. The language modality prompt is divided into prefix prompt <inline-formula id="inf14">
<mml:math id="m17">
<mml:mrow>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mi>r</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>f</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>k</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> and concatenation prompt <inline-formula id="inf15">
<mml:math id="m18">
<mml:mrow>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>j</mml:mi>
<mml:mi>u</mml:mi>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>n</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <italic>k</italic> denotes the length of vectors in the prefix section of the prompt. Therefore, the number of padding zero vectors is <inline-formula id="inf16">
<mml:math id="m19">
<mml:mrow>
<mml:mi>m</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>L</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. For the visual modality, the i-th input sample is defined as an image <inline-formula id="inf17">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. The image is then divided into several image patches, resulting in an image patch sequence <inline-formula id="inf18">
<mml:math id="m21">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>0</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>N</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. A special token, <inline-formula id="inf19">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, serves as the global representation of the image and is prepended to the sequence of image patches. As a result, the image input sequence becomes <inline-formula id="inf20">
<mml:math id="m23">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>0</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>N</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Finally, the visual modality prompt is concatenated with the above image input sequence to form the complete visual modality input sequence, as defined in <xref ref-type="disp-formula" rid="e4">Equation 4</xref>.<disp-formula id="e4">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>I</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>V</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>U</mml:mi>
<mml:mi>V</mml:mi>
<mml:mi>n</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>0</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>x</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>N</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>n</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
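<p>The following sketch shows how the sequences of <xref ref-type="disp-formula" rid="e3">Equations 3</xref>, <xref ref-type="disp-formula" rid="e4">4</xref> can be assembled; tensor names and shapes are assumptions for exposition.</p>
<preformat>import torch

def build_language_input(U_L, t_act, t_obj, m, k):
    """Equation 3 sketch: the first k prompt vectors form the prefix,
    the remaining n - k the conjunction part; action tokens t_act and
    object tokens t_obj are interleaved, then zero-padded to length m."""
    seq = torch.cat([U_L[:k], t_act, U_L[k:], t_obj], dim=0)
    pad = torch.zeros(m - seq.size(0), U_L.size(1))
    return torch.cat([seq, pad], dim=0)        # (m, d_L)

def build_visual_input(z_i, U_V, patches):
    """Equation 4 sketch: global token z_i, visual prompt U_V, then
    the (N + 1) patch embeddings, giving shape (N + n + 2, d_V)."""
    return torch.cat([z_i.unsqueeze(0), U_V, patches], dim=0)</preformat>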
</sec>
<sec id="s3-3">
<label>3.3</label>
<title>Locally-aware enhanced transformer module</title>
<p>To address the challenges of extracting fine-grained interaction features and modeling complex multi-person collaboration in operation and inspection tasks, this paper introduces LAET modules into both the language and visual branches, thereby strengthening the model&#x2019;s capability to analyze local features and collaborative interactions. The module employs a dual-branch parallel architecture. One branch is based on the Transformer structure and extracts global features: it computes the relationships between positions in the input sequence via self-attention, enabling the model to capture long-range dependencies and contextual semantics during both encoding and decoding. The other branch is based on a convolutional neural network (CNN) or a gated recurrent unit (GRU) and performs local feature extraction, focusing on key regions and fine-grained local information within a fixed receptive field. Although this branch has advantages in capturing local details, its limited receptive field and weight sharing make it difficult to model global contextual dependencies independently (<xref ref-type="bibr" rid="B12">Li et al., 2021</xref>). To overcome this limitation, we fuse the Transformer and CNN/GRU structures, achieving efficient integration of global and local features through joint modeling and significantly improving the model&#x2019;s understanding and expression of interaction relationships under multimodal inputs.</p>
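<p>A minimal sketch of such a dual-branch block is given below, assuming a GRU local branch and mean pooling of the global branch; these design details and all dimensions are assumptions for exposition, not the exact implementation.</p>
<preformat>import torch
import torch.nn as nn

class LAET(nn.Module):
    """Dual-branch sketch: a self-attention branch models global
    context while a GRU branch (a CNN would play the same role for
    images) captures local detail; a weight lam blends the two, in
    the spirit of Equations 7 and 10."""
    def __init__(self, d=512, n_heads=8, lam=0.5):
        super().__init__()
        self.global_branch = nn.TransformerEncoderLayer(
            d_model=d, nhead=n_heads, batch_first=True)
        self.local_branch = nn.GRU(d, d, batch_first=True)
        self.lam = lam

    def forward(self, x):                      # x: (B, T, d)
        g = self.global_branch(x).mean(dim=1)  # pooled global feature
        _, h = self.local_branch(x)            # final GRU hidden state
        l = h.squeeze(0)                       # (B, d) local feature
        return self.lam * g + (1.0 - self.lam) * l</preformat>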
<p>To extract more discriminative multimodal features, we use the outputs calculated by <xref ref-type="disp-formula" rid="e3">Equations 3</xref>, <xref ref-type="disp-formula" rid="e4">4</xref> as inputs and pass them to the LAET module to extract the fused multimodal enhanced features. The global feature extraction branch for the language modality (LGF) is used to extract the global semantic representation of the text, as defined in <xref ref-type="disp-formula" rid="e5">Equation 5</xref>:<disp-formula id="e5">
<mml:math id="m25">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mtext>LGF</mml:mtext>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>CLIP</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mtext>TextEncoder</mml:mtext>
<mml:mtext>frozen</mml:mtext>
</mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>T</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>Here, the superscript <italic>frozen</italic> indicates that the feature extraction module is frozen during training: its parameters do not participate in backpropagation and are not updated.</p>
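<p>In code, freezing amounts to disabling gradients on the encoder&#x2019;s parameters; a generic PyTorch snippet is shown below, with <monospace>clip_text_encoder</monospace> as a placeholder name.</p>
<preformat># Freeze the text encoder: its parameters are excluded from gradient
# updates while the module still produces features in forward passes.
# clip_text_encoder is a placeholder for any pre-trained encoder module.
for p in clip_text_encoder.parameters():
    p.requires_grad = False
clip_text_encoder.eval()  # also fixes dropout/normalization statistics</preformat>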
<p>The language modality&#x2019;s local extractor captures the local semantic details from the text, as defined in <xref ref-type="disp-formula" rid="e6">Equation 6</xref>.<disp-formula id="e6">
<mml:math id="m26">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>GRU</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>T</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>
</p>
<p>Furthermore, we use the hyperparameter <inline-formula id="inf21">
<mml:math id="m27">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to fuse the global and local features, resulting in the final feature representation <inline-formula id="inf22">
<mml:math id="m28">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> for the language modality, as shown in <xref ref-type="disp-formula" rid="e7">Equation 7</xref>.<disp-formula id="e7">
<mml:math id="m29">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>G</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>l</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xb7;</mml:mo>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>In the visual modality, we first represent the set of HOI instances in image <inline-formula id="inf23">
<mml:math id="m30">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>I</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> as a set of feature vectors <inline-formula id="inf24">
<mml:math id="m31">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>0</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:mo>&#x22ef;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>h</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>, where <italic>N</italic>
<sub>
<italic>h</italic>
</sub> denotes the number of HOI instances detected in the image, and <inline-formula id="inf25">
<mml:math id="m32">
<mml:mrow>
<mml:msubsup>
<mml:mi>h</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represents the visual feature representation corresponding to the j-th instance. The global feature extraction branch of the visual modality, based on the pre-trained HOI Vision Transformer (VGF), is defined as shown in <xref ref-type="disp-formula" rid="e8">Equation 8</xref>:<disp-formula id="e8">
<mml:math id="m33">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>HOI</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mtext>ViT</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>I</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>
</p>
<p>The visual modality&#x2019;s local extractor is applied to retrieve local visual details from the image, as defined in <xref ref-type="disp-formula" rid="e9">Equation 9</xref>.<disp-formula id="e9">
<mml:math id="m34">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:mtext>ResNet</mml:mtext>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>50</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>I</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:msub>
<mml:mi>d</mml:mi>
<mml:mi>V</mml:mi>
</mml:msub>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
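<p>As a minimal illustration of the local branch in <xref ref-type="disp-formula" rid="e9">Equation 9</xref>, the following PyTorch sketch extracts a d_V-dimensional local visual feature with torchvision&#x2019;s ImageNet-pretrained ResNet-50; the feature dimension (512) and the linear projection are illustrative assumptions, and the pre-trained HOI-ViT global branch of <xref ref-type="disp-formula" rid="e8">Equation 8</xref> is assumed to be available separately.</p>
<preformat># Sketch of the local visual feature branch in Equation 9.
# Assumptions: torchvision's ImageNet-pretrained ResNet-50 stands in for the
# local extractor, and d_V = 512 plus the linear projection are illustrative.
import torch
import torch.nn as nn
from torchvision import models

class LocalVisualExtractor(nn.Module):
    def __init__(self, d_v=512):
        super().__init__()
        backbone = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
        # Keep everything up to the global average pool; drop the classifier.
        self.backbone = nn.Sequential(*list(backbone.children())[:-1])
        self.proj = nn.Linear(2048, d_v)  # map pooled 2048-d features to d_V

    def forward(self, images):              # images: (B, 3, H, W)
        feats = self.backbone(images)       # (B, 2048, 1, 1)
        return self.proj(feats.flatten(1))  # f_VLF in R^{d_V}

f_vlf = LocalVisualExtractor()(torch.randn(2, 3, 224, 224))
print(f_vlf.shape)  # torch.Size([2, 512])
</preformat>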
<p>Furthermore, we use the hyperparameter <inline-formula id="inf26">
<mml:math id="m35">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to combine global and local features for constructing the final visual embedding <inline-formula id="inf27">
<mml:math id="m36">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> for the visual modality.<disp-formula id="e10">
<mml:math id="m37">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mi>V</mml:mi>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>G</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xb7;</mml:mo>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
</p>
<p>Subsequently, we use the hyperparameter <inline-formula id="inf28">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2208;</mml:mo>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="&#x7c;">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to further fuse the HOI features with the local visual features, resulting in the final HOI instance feature representation, as shown in <xref ref-type="disp-formula" rid="e11">Equation 11</xref>.<disp-formula id="e11">
<mml:math id="m39">
<mml:mrow>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>&#xb7;</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mi>&#x3bb;</mml:mi>
<mml:mrow>
<mml:mi>v</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#xb7;</mml:mo>
<mml:msubsup>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>V</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msubsup>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
</p>
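<p>The weighted fusions in <xref ref-type="disp-formula" rid="e7">Equations 7</xref>, <xref ref-type="disp-formula" rid="e10">10</xref>, <xref ref-type="disp-formula" rid="e11">11</xref> share a single functional form, which the sketch below makes explicit; the feature tensors are random placeholders, and the shared 0.7 value for the fusion hyperparameters is an assumption following Section 4.2.</p>
<preformat># Weighted global/local fusion from Equations 7, 10 and 11 (sketch).
# Feature tensors are random placeholders; the common 0.7 fusion ratio
# follows the hyperparameter setting reported in Section 4.2.
import torch

def fuse(global_feat, local_feat, lam):
    """f = lam * global + (1 - lam) * local, applied element-wise."""
    return lam * global_feat + (1.0 - lam) * local_feat

d_L, d_V, N_h = 512, 512, 5
f_lgf, f_llf = torch.randn(d_L), torch.randn(d_L)  # language global / local
f_vgf, f_vlf = torch.randn(d_V), torch.randn(d_V)  # visual global / local
H_i = torch.randn(N_h + 1, d_V)                    # HOI instance features

f_L = fuse(f_lgf, f_llf, lam=0.7)    # Equation 7
f_V = fuse(f_vgf, f_vlf, lam=0.7)    # Equation 10
f_HOI = fuse(H_i, f_vlf, lam=0.7)    # Equation 11 (f_VLF broadcasts over rows)
print(f_L.shape, f_V.shape, f_HOI.shape)
</preformat>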
</sec>
<sec id="s3-4">
<label>3.4</label>
<title>Contrastive learning and loss function</title>
<p>To achieve deeper fusion and consistent alignment of multimodal information, this study introduces a contrastive loss function during the training phase to guide the learning process. Specifically, the model jointly maps visual features, HOI features, and textual semantics into a shared semantic space, where alignment is facilitated through the construction of positive and negative sample pairs. In this process, matched image&#x2013;text pairs (positive samples) are pulled closer, while unmatched image&#x2013;text pairs (negative samples) are pushed apart. This design effectively enhances the cross-modal consistency between human action semantics and tool interaction semantics. Based on this principle, the following contrastive learning objective and corresponding loss function are defined.</p>
<p>First, the similarity logits between image&#x2013;text pairs are computed by combining the feature representations obtained from <xref ref-type="disp-formula" rid="e7">Equations 7</xref>&#x2013;<xref ref-type="disp-formula" rid="e11">11</xref>:<disp-formula id="e12">
<mml:math id="m40">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>&#x2a;</mml:mo>
<mml:mi mathvariant="script">T</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>R</mml:mi>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#xd7;</mml:mo>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
</p>
<p>In this equation, <inline-formula id="inf29">
<mml:math id="m41">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the total number of human-object interaction queries, <inline-formula id="inf30">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> denotes the number of text label inputs, and <inline-formula id="inf31">
<mml:math id="m43">
<mml:mrow>
<mml:mi mathvariant="script">T</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> serves as the temperature parameter to control distribution smoothness.</p>
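<p>A minimal sketch of <xref ref-type="disp-formula" rid="e12">Equation 12</xref> follows. The L2 normalization of the features and the division by a CLIP-style temperature of 0.07 are assumptions, since the text only states that the temperature parameter controls the smoothness of the distribution.</p>
<preformat># Image-text similarity logits of Equation 12 (sketch).
# Assumptions: features are L2-normalised and the temperature enters as a
# CLIP-style divisor with value 0.07.
import torch
import torch.nn.functional as F

N_HOI, N_L, d = 64, 5, 512
f_hoi = F.normalize(torch.randn(N_HOI, d), dim=-1)  # HOI query features
f_l = F.normalize(torch.randn(N_L, d), dim=-1)      # text label features

temperature = 0.07
logits = (f_hoi @ f_l.t()) / temperature            # shape (N_HOI, N_L)
print(logits.shape)  # torch.Size([64, 5])
</preformat>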
<p>Specifically, during the prediction stage, the HOI detection head <inline-formula id="inf32">
<mml:math id="m44">
<mml:mrow>
<mml:msub>
<mml:mi>f</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>I</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> produces a fixed set of HOI queries, where each query predicts a human&#x2013;object pair associated with a human bounding box, an object bounding box, and the corresponding interaction classification scores. These query-based predictions are decoded to form the HOI instance candidate set <inline-formula id="inf33">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>. The generated HOI instances are then jointly evaluated with the similarity logits from <xref ref-type="disp-formula" rid="e12">Equation 12</xref> and the ground-truth labels (GT) through the Hungarian matching algorithm (Matcher) to select the optimal HOI instance set with the lowest matching cost, as shown in <xref ref-type="disp-formula" rid="e13">Equation 13</xref>.<disp-formula id="e13">
<mml:math id="m46">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>H</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mtext>ins</mml:mtext>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mover accent="true">
<mml:mi>g</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>M</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>h</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mtext>ins</mml:mtext>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>g</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>,</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
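<p>The Matcher in <xref ref-type="disp-formula" rid="e13">Equation 13</xref> can be realized with the Hungarian algorithm from SciPy, as in DETR-style detectors. The sketch below uses a simplified cost combining a classification term and an L1 box term; the exact cost weighting of the proposed method is not reproduced here.</p>
<preformat># Hungarian matching of HOI predictions to ground truth (Equation 13, sketch).
# The cost combines classification and box terms as in DETR-style matchers;
# the paper's exact weighting is not specified here.
import torch
from scipy.optimize import linear_sum_assignment

def match(pred_logits, pred_boxes, gt_labels, gt_boxes):
    # pred_logits: (Nq, C), pred_boxes: (Nq, 4)
    # gt_labels: (Ng,) class ids, gt_boxes: (Ng, 4)
    prob = pred_logits.softmax(-1)                      # (Nq, C)
    cost_cls = -prob[:, gt_labels]                      # (Nq, Ng)
    cost_box = torch.cdist(pred_boxes, gt_boxes, p=1)   # pairwise L1 distance
    cost = (cost_cls + cost_box).detach().numpy()
    rows, cols = linear_sum_assignment(cost)            # minimal-cost pairing
    return rows, cols   # matched prediction / ground-truth indices

rows, cols = match(torch.randn(100, 5), torch.rand(100, 4),
                   torch.tensor([0, 2]), torch.rand(2, 4))
print(list(zip(rows, cols)))
</preformat>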
<p>Finally, the overall loss function <italic>F</italic> is designed to capture the relationship among the predicted instance set <inline-formula id="inf34">
<mml:math id="m47">
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf35">
<mml:math id="m48">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mover accent="true">
<mml:mi>g</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, and the ground truth labels, as formulated in <xref ref-type="disp-formula" rid="e14">Equation 14</xref>.<disp-formula id="e14">
<mml:math id="m49">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:mi>F</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>o</mml:mi>
<mml:mover accent="true">
<mml:mi>g</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mi>i</mml:mi>
<mml:mi>t</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>B</mml:mi>
<mml:mi>C</mml:mi>
<mml:mi>E</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>H</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mspace width="2.2em"/>
<mml:mo>&#x2b;</mml:mo>
<mml:mrow>
<mml:mi>L</mml:mi>
<mml:mn>1</mml:mn>
<mml:mo>_</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>H</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>I</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>U</mml:mi>
<mml:mo>_</mml:mo>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="&#x7c;">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi>H</mml:mi>
<mml:mo>&#x5e;</mml:mo>
</mml:mover>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mi>G</mml:mi>
<mml:mi>T</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
</mml:mrow>
</mml:math>
<label>(14)</label>
</disp-formula>where CE_Loss denotes the cross-entropy loss used to supervise the interaction classification based on the similarity logits, BCE_Loss represents the binary cross-entropy loss for HOI instance existence prediction, L1_Loss measures the regression error between the predicted and ground-truth bounding boxes, and GIoU_Loss further constrains the spatial alignment between predicted and ground-truth boxes by optimizing their geometric overlap.</p>
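<p>A hedged sketch of the overall objective in <xref ref-type="disp-formula" rid="e14">Equation 14</xref> is given below. The unit weighting of the four terms and the use of torchvision&#x2019;s generalized_box_iou_loss (available in recent torchvision releases) for the GIoU term are assumptions; boxes are taken in (x1, y1, x2, y2) format.</p>
<preformat># Overall training objective of Equation 14 (unweighted sketch).
# Assumptions: the four terms are summed with unit weights, and torchvision's
# generalized_box_iou_loss stands in for GIOU_Loss.
import torch
import torch.nn.functional as F
from torchvision.ops import generalized_box_iou_loss

def total_loss(logits, gt_cls, exist_logit, exist_gt, pred_boxes, gt_boxes):
    ce = F.cross_entropy(logits, gt_cls)                             # CE_Loss
    bce = F.binary_cross_entropy_with_logits(exist_logit, exist_gt)  # BCE_Loss
    l1 = F.l1_loss(pred_boxes, gt_boxes)                             # L1_Loss
    giou = generalized_box_iou_loss(pred_boxes, gt_boxes,
                                    reduction="mean")                # GIOU_Loss
    return ce + bce + l1 + giou

boxes = torch.tensor([[0.0, 0.0, 1.0, 1.0]] * 8)
loss = total_loss(torch.randn(8, 5), torch.randint(0, 5, (8,)),
                  torch.randn(8), torch.rand(8), boxes, boxes)
print(loss.item())
</preformat>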
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<label>4</label>
<title>Discussion</title>
<sec id="s4-1">
<label>4.1</label>
<title>Dataset construction</title>
<p>This paper constructs a representative human-object interaction dataset based on typical operation and inspection scenarios in a certain city, covering five types of interaction relationships as detection targets: climbing ladder, holding ladder, lifting ladder, electricity checking, and grasping test rod. To meet the requirements of interaction risk identification in operational scenarios, the dataset follows the HICO-DET format and was built in three steps: (1) to ensure data coverage and representativeness, key-frame images were captured at a rate of one frame every 30 s, prioritizing views that are relatively complete and minimally occluded as positive samples and excluding frames with no interaction relationships or severe occlusion; (2) to further improve sample quality, brightness enhancement and other preprocessing operations were applied to some images; (3) following the HICO-DET structure, people, tools, and their interaction relationships were manually labeled with annotation tools, producing standard person-object-interaction triplets. The final annotation files, trainval_hico_ann.json and test_hico_ann.json, were obtained.</p>
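<p>For illustration, step (1) can be implemented with OpenCV as in the following sketch; the video path, output naming, and fallback frame rate are assumptions, and the manual screening of positive and negative samples described above is not reproduced.</p>
<preformat># Step (1): key-frame sampling at one frame every 30 seconds (OpenCV sketch).
# Paths and naming are illustrative; manual screening is not reproduced.
import cv2

def sample_keyframes(video_path, out_dir, interval_s=30):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0   # fall back to 25 fps if unknown
    step = int(fps * interval_s)
    idx, saved = 0, 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        if idx % step == 0:
            cv2.imwrite(f"{out_dir}/frame_{saved:05d}.jpg", frame)
            saved += 1
        idx += 1
    cap.release()
    return saved
</preformat>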
<p>The final dataset contains 1,800 sample images; each image typically contains one or more interaction targets, for a total of 2,096 annotated interaction targets, as detailed in <xref ref-type="table" rid="T1">Table 1</xref>. To improve recognition performance, the model is pre-trained on the public HICO-DET dataset, and the resulting weights are used for initialization to enhance its feature perception capabilities. For dataset partitioning, 80% of the self-built dataset was randomly selected for training and the remainder for testing: 1,440 images form the training set and 360 images form the test set.</p>
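<p>The 80/20 random partition can be reproduced along the lines of the following sketch; the fixed seed and file naming are illustrative rather than the authors&#x2019; actual split.</p>
<preformat># Random 80/20 partition of the 1,800 self-built images (illustrative).
import random

images = [f"img_{i:04d}.jpg" for i in range(1800)]  # placeholder file names
random.seed(0)
random.shuffle(images)
n_train = int(0.8 * len(images))                    # 1,440 training images
train, test = images[:n_train], images[n_train:]    # 360 test images
print(len(train), len(test))
</preformat>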
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Dataset composition.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Category</th>
<th colspan="2" align="center">Train</th>
<th colspan="2" align="center">Test</th>
</tr>
<tr>
<th align="center">Images</th>
<th align="center">Labels</th>
<th align="center">Images</th>
<th align="center">Labels</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Climb ladder</td>
<td align="center">604</td>
<td align="center">604</td>
<td align="center">176</td>
<td align="center">176</td>
</tr>
<tr>
<td align="center">Hold ladder</td>
<td align="center">260</td>
<td align="center">260</td>
<td align="center">84</td>
<td align="center">84</td>
</tr>
<tr>
<td align="center">Lift ladder</td>
<td align="center">144</td>
<td align="center">164</td>
<td align="center">132</td>
<td align="center">132</td>
</tr>
<tr>
<td align="center">Electricity check</td>
<td align="center">576</td>
<td align="center">576</td>
<td align="center">66</td>
<td align="center">66</td>
</tr>
<tr>
<td align="center">Grasp test rod</td>
<td align="center">52</td>
<td align="center">52</td>
<td align="center">4</td>
<td align="center">4</td>
</tr>
<tr>
<td align="center">Total</td>
<td align="center">1,440</td>
<td align="center">1,656</td>
<td align="center">360</td>
<td align="center">440</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s4-2">
<label>4.2</label>
<title>Experimental environment</title>
<p>The experimental platform is based on a 64-bit Windows 11 operating system with an NVIDIA GeForce RTX 4060 Ti graphics card featuring 8 GB of video memory. The Python version is 3.8, and the deep learning framework is PyTorch 1.10.0, with cuDNN 8.2.0 and CUDA 11.3 as GPU acceleration libraries. The model was trained for a total of 200 epochs with a batch size of 4 and an initial learning rate of 0.0001, using the Adam optimizer with decoupled weight decay regularization. Additionally, the hyperparameter controlling the fusion ratio of global and local features was set to 0.7.</p>
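<p>In PyTorch terms, this configuration corresponds to the following sketch; the placeholder model and data, as well as the weight-decay coefficient, are assumptions, while the optimizer (AdamW, i.e., Adam with decoupled weight decay), learning rate, batch size, and epoch count follow the text.</p>
<preformat># Training configuration from Section 4.2 (sketch with placeholder model/data).
import torch
from torch.utils.data import DataLoader

model = torch.nn.Linear(512, 5)                          # placeholder module
train_set = [(torch.randn(512), torch.tensor(0))] * 16   # placeholder data
loader = DataLoader(train_set, batch_size=4, shuffle=True)

# Adam with decoupled weight decay regularisation corresponds to AdamW;
# the weight-decay value 1e-2 is an assumed default, not from the paper.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-2)

for epoch in range(200):                                 # 200 training epochs
    for x, y in loader:
        optimizer.zero_grad()
        loss = torch.nn.functional.cross_entropy(model(x), y)
        loss.backward()
        optimizer.step()
</preformat>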
</sec>
<sec id="s4-3">
<label>4.3</label>
<title>Experimental results and analysis</title>
<sec id="s4-3-1">
<label>4.3.1</label>
<title>Ablation studies</title>
<p>The multimodal semantic prompt module enhances the semantic interpretation ability of key operational behaviors by facilitating the interaction of prompt information across different modalities and providing targeted semantic guidance for each modality, thereby resolving the problem of modality dimension inconsistency. <xref ref-type="table" rid="T2">Table 2</xref> presents the experimental comparison of different prompt learning methods on the self-built human-object interaction dataset. Here, Model I in <xref ref-type="table" rid="T2">Table 2</xref> denotes the configuration with only the visual prompt, while Model II corresponds to the multimodal semantic prompt configuration constructed by learning unified prompt vectors across visual and language modalities. The Rare, NonRare, and Full columns report the mean Average Precision (mAP) on interaction categories with few samples, frequent samples, and the complete dataset, respectively.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Impact of different prompting methods on model performance.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th colspan="2" align="center">Model</th>
<th rowspan="2" align="center">Rare</th>
<th rowspan="2" align="center">NonRare</th>
<th rowspan="2" align="center">Full</th>
</tr>
<tr>
<th align="center">I</th>
<th align="center">II</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left"/>
<td align="left"/>
<td align="center">79.90</td>
<td align="center">82.53</td>
<td align="center">81.76</td>
</tr>
<tr>
<td align="center">&#x2713;</td>
<td align="left"/>
<td align="center">79.12</td>
<td align="center">84.20</td>
<td align="center">83.87</td>
</tr>
<tr>
<td align="left"/>
<td align="center">&#x2713;</td>
<td align="center">82.55</td>
<td align="center">85.97</td>
<td align="center">84.26</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>To conduct the ablation study, we first constructed a baseline model that removes the multimodal semantic prompt module and retains only the locally-aware enhanced Transformer. For this baseline, performance on the Rare subset dropped to 79.90%, and the decline on the NonRare subset was more pronounced: the mean Average Precision (mAP), which measures the detection accuracy of human-object interaction triplets by jointly considering human bounding boxes, object bounding boxes, and interaction category predictions, fell from 85.97% to 82.53%. This indicates that, without semantic prompts, the model struggles to distinguish highly similar actions in operational tasks, such as climbing and holding a ladder. Next, an independent prompt vector was introduced for the visual modality on top of the baseline. With the visual prompt, performance improved on the NonRare and Full subsets, reaching 84.20% and 83.87%, respectively, but decreased by 0.78 percentage points on the Rare subset. This result suggests that relying solely on the visual prompt may limit the model&#x2019;s generalization to low-sample interaction categories. In contrast, the complete configuration of the multimodal semantic prompt module achieved the highest mAP on all evaluation subsets and showed particularly strong low-shot learning capability on the Rare subset. These results demonstrate that the multimodal semantic prompt module effectively enhances multi-element semantic parsing between the visual and language modalities, and they validate the soundness and effectiveness of its Unified Prompts design for complex operation scenarios.</p>
<p>To further validate the effectiveness of the locally-aware enhanced Transformer (LAET) module in different modalities, this paper evaluates the LAET module introduced for the language and visual modalities separately and jointly. The symbol &#x201c;&#x2713;&#x201d; indicates that the enhancement mechanism is enabled for the corresponding modality. As shown in <xref ref-type="table" rid="T3">Table 3</xref>, when only the visual modality is enhanced, the overall interaction-detection mAP improves from 78.79% to 80.98%; when only the language modality is enhanced, the mAP reaches 82.23%, a further 1.25 percentage points above the visual-only configuration. When the LAET module is introduced for both modalities, performance improves further, with the overall mAP reaching 84.26%, an increase of 5.47 percentage points over the baseline. These results demonstrate that the LAET module significantly enhances the model&#x2019;s ability to recognize fine-grained interaction features.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Experimental results of LAET module enhancement in different modalities.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Visual modality</th>
<th rowspan="2" align="center">Linguistic modality</th>
<th colspan="5" align="left">Accuracy of different interactions in operational scenarios (%)</th>
<th rowspan="2" align="center">Accuracy (%)</th>
</tr>
<tr>
<th align="center">Climb ladder</th>
<th align="center">Hold ladder</th>
<th align="center">Lift ladder</th>
<th align="center">Electricity check</th>
<th align="center">Grasp testrod</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left"/>
<td align="left"/>
<td align="center">78.02</td>
<td align="center">76.58</td>
<td align="center">79.18</td>
<td align="center">77.78</td>
<td align="center">82.39</td>
<td align="center">78.79</td>
</tr>
<tr>
<td align="center">&#x2713;</td>
<td align="left"/>
<td align="center">81.44</td>
<td align="center">79.91</td>
<td align="center">81.03</td>
<td align="center">78.38</td>
<td align="center">84.15</td>
<td align="center">80.98</td>
</tr>
<tr>
<td align="left"/>
<td align="center">&#x2713;</td>
<td align="center">82.39</td>
<td align="center">80.45</td>
<td align="center">82.70</td>
<td align="center">80.29</td>
<td align="center">85.30</td>
<td align="center">82.23</td>
</tr>
<tr>
<td align="center">&#x2713;</td>
<td align="center">&#x2713;</td>
<td align="center">84.52</td>
<td align="center">81.90</td>
<td align="center">83.81</td>
<td align="center">81.64</td>
<td align="center">89.42</td>
<td align="center">84.26</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Based on the power operation scenario dataset constructed in this study, the LAET module demonstrates excellent application adaptability and feature extraction capabilities. Experimental results show that the module can accurately identify the key interaction areas between workers and tools in typical operational scenarios such as &#x201c;climb ladder,&#x201d; &#x201c;hold ladder,&#x201d; &#x201c;electricity check,&#x201d; and &#x201c;grasp test rod.&#x201d; This significantly enhances the model&#x2019;s performance in fine-grained behavior recognition, semantic differentiation, and risk assessment. <xref ref-type="fig" rid="F3">Figure 3</xref> illustrates the trends of training loss and validation accuracy with respect to the number of training epochs. The training loss exhibits a generally monotonic decrease, while the validation accuracy steadily increases and stabilizes after approximately the 150th epoch. This indicates good convergence and a consistently high confidence level during validation, further confirming the stability and effectiveness of the proposed multimodal structure during iterative optimization.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Training loss and validation mAP change curve.</p>
</caption>
<graphic xlink:href="fenrg-14-1739244-g003.tif">
<alt-text content-type="machine-generated">Line chart showing training loss in blue and validation mean average precision (mAP) in red plotted against epochs. Training loss decreases steadily while validation mAP increases and stabilizes around epoch 150.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-3-2">
<label>4.3.2</label>
<title>Comparative experiments</title>
<p>To further validate the effectiveness and superiority of the proposed method in the human-object interaction recognition task within power operation scenarios, three representative comparison models were selected for evaluation: the typical single-stage HOI detection model HOI Transformer (HOTR), the improved Transformer-based THID model, and the HOICLIP model that integrates visual-language multimodal information. All models were trained and tested on the dataset containing 1,800 sample images from power operation scenarios, with comparative analysis conducted under identical evaluation metrics and parameter settings. The evaluation results for each method are shown in <xref ref-type="table" rid="T4">Table 4</xref>, while typical interaction scenarios, such as multi-person ladder collaboration and electricity checking operations, are visualized in <xref ref-type="fig" rid="F4">Figure 4</xref>.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Comparative results of different methods in interaction recognition.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th rowspan="2" align="center">Model</th>
<th colspan="5" align="center">Accuracy of different interactions in operational scenarios (%)</th>
<th rowspan="2" align="center">Accuracy (%)</th>
</tr>
<tr>
<th align="center">Climb ladder</th>
<th align="center">Hold ladder</th>
<th align="center">Lift ladder</th>
<th align="center">Electricity check</th>
<th align="center">Grasp testrod</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">HOTR</td>
<td align="center">78.41</td>
<td align="center">76.23</td>
<td align="center">77.55</td>
<td align="center">75.63</td>
<td align="center">80.12</td>
<td align="center">77.59</td>
</tr>
<tr>
<td align="center">HOICLIP</td>
<td align="center">82.33</td>
<td align="center">79.64</td>
<td align="center">81.32</td>
<td align="center">80.81</td>
<td align="center">85.77</td>
<td align="center">81.97</td>
</tr>
<tr>
<td align="center">THID</td>
<td align="center">80.75</td>
<td align="center">77.83</td>
<td align="center">79.16</td>
<td align="center">78.91</td>
<td align="center">82.34</td>
<td align="center">79.80</td>
</tr>
<tr>
<td align="center">Proposed method</td>
<td align="center">84.52</td>
<td align="center">81.90</td>
<td align="center">83.81</td>
<td align="center">81.64</td>
<td align="center">89.42</td>
<td align="center">84.26</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Visualization results of different methods in interaction recognition. <bold>(A)</bold> HOTR; <bold>(B)</bold> THID; <bold>(C)</bold> HOICLIP; <bold>(D)</bold> Proposed method.</p>
</caption>
<graphic xlink:href="fenrg-14-1739244-g004.tif">
<alt-text content-type="machine-generated">Four surveillance images labeled A, B, C, and D show workers at an industrial site interacting with electrical equipment and ladders. Each image contains colored bounding boxes and labels marking objects or actions such as &#x201C;person,&#x201D; &#x201C;ladder,&#x201D; &#x201C;testrod,&#x201D; &#x201C;climb,&#x201D; and safety-related warnings like &#x201C;No one holding ladder&#x201D; and &#x201C;No safety personnel during checking.&#x201D; Images illustrate an AI-based object detection and safety warning system in an outdoor work environment.</alt-text>
</graphic>
</fig>
<p>The comparison results indicate that the single-stage HOTR model performs the worst, often making misjudgments in highly similar interaction scenarios. For instance, in the &#x201c;climb ladder&#x201d; scenario shown in <xref ref-type="fig" rid="F4">Figure 4A</xref>, it frequently misidentifies the action as &#x201c;hold ladder&#x201d; because the two actions are visually similar. The HOICLIP model, which incorporates pre-trained visual-language multimodal knowledge, improves recognition of rare interactions and achieves accuracy 4.38 percentage points higher than HOTR. Compared with HOICLIP, the proposed method demonstrates higher prediction confidence and more stable interaction recognition in complex operation scenarios. For example, in the ladder climbing scenario, HOICLIP fails to detect the standing operator on the left side, resulting in incomplete modeling of human&#x2013;ladder interactions, whereas the proposed method simultaneously localizes multiple operators and correctly identifies their interaction states. In the electricity inspection scenario, HOICLIP produces misaligned operator bounding boxes, which degrades the subsequent interaction category judgment, while the proposed method achieves more accurate human localization and interaction association. The THID model, although able to accurately identify interactions between people and tools, suffers from missed person detections. In contrast, the method proposed in this paper, which introduces semantic prompts and the locally-aware enhanced Transformer, strengthens discriminative ability for cross-category, high-similarity interactions while also providing risk alerts; its overall mAP reaches 84.26%, the best performance across all interaction types.</p>
<p>To further analyze the model&#x2019;s ability to discriminate semantically similar interaction categories, we plot the confusion matrix on the test set, as shown in <xref ref-type="fig" rid="F5">Figure 5</xref>. The proposed method exhibits a clearly diagonal-dominant distribution across most categories, indicating that it distinguishes the different operation interaction types stably. Only a small amount of mutual misclassification exists between the semantically similar &#x201c;climb ladder&#x201d; and &#x201c;hold ladder&#x201d; categories. Although some confusion remains between &#x201c;lift ladder&#x201d; and &#x201c;hold ladder,&#x201d; the overall proportion of correct recognition stays high, indicating that the model extracts discriminative feature representations even under similar action semantics. In addition, all samples of the low-sample &#x201c;grasp test rod&#x201d; category are correctly recognized, further verifying the model&#x2019;s stability in long-tail scenarios.</p>
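<p>A confusion matrix such as <xref ref-type="fig" rid="F5">Figure 5</xref> can be produced with scikit-learn as sketched below; the label and prediction arrays are placeholders for the actual test-set outputs.</p>
<preformat># Confusion matrix over the five interaction categories (as in Figure 5).
# y_true / y_pred are placeholders for test-set labels and predictions.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

classes = ["climb ladder", "hold ladder", "lift ladder",
           "electricity check", "grasp test rod"]
y_true = [0, 0, 1, 2, 3, 4, 1, 2]
y_pred = [0, 1, 1, 2, 3, 4, 1, 1]
cm = confusion_matrix(y_true, y_pred, labels=list(range(5)))
ConfusionMatrixDisplay(cm, display_labels=classes).plot(cmap="Blues")
</preformat>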
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Confusion matrix of power operation interaction behaviors.</p>
</caption>
<graphic xlink:href="fenrg-14-1739244-g005.tif">
<alt-text content-type="machine-generated">Confusion matrix heatmap showing five activity classes: Climb ladder, Hold ladder, Lift ladder, Electricity check, and Grasp test rod, with most values concentrated along the diagonal, indicating high accuracy, and a blue color scale bar on the right.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4-3-3">
<label>4.3.3</label>
<title>Interaction recognition test in operational scenarios</title>
<p>To validate the proposed method&#x2019;s ability to identify typical interactive behaviors and risks in actual work scenarios, this paper takes the distribution network work site as the background and selects five representative interactive actions, namely, climbing ladder, holding ladder, lifting ladder, electricity checking, and grasping test rod, to conduct multi-scenario interaction and risk identification tests. The test results are shown in <xref ref-type="fig" rid="F6">Figure 6</xref>. It should be noted that the risk alerts shown in <xref ref-type="fig" rid="F6">Figure 6</xref> are generated based on the human&#x2013;tool interaction results recognized by the model, followed by semantic state mapping and rule-based reasoning. Specifically, the model first outputs the key interaction types and their spatial localization in the operation scene. Subsequently, the interaction set is semantically mapped according to predefined distribution network safety regulations, and risk inference is further performed through rule constraints, thereby producing the corresponding safety alert information.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Detection results for various scenarios. <bold>(A)</bold> Collaborative ladder climbing and holding operation scenario; <bold>(B)</bold> Climbing ladder operation scenario; <bold>(C)</bold> Lifting ladder operation scenario; <bold>(D)</bold> Electricity checking operation scenario; <bold>(E)</bold> Grasping test rod scenario.</p>
</caption>
<graphic xlink:href="fenrg-14-1739244-g006.tif">
<alt-text content-type="machine-generated">Two rows of images display workers at an electrical substation performing tasks involving ladders and tools, with computer vision boxes highlighting objects and workers. Some images indicate safety compliance labels such as &#x22;Safety,&#x22; while others show warning labels like &#x201C;No one holding ladder&#x201D; or &#x201C;No safety personnel during checking.&#x201D; Each image has colored boxes and text overlays identifying worker actions and ladder or tool status for occupational safety monitoring.</alt-text>
</graphic>
</fig>
<p>The results indicate that the proposed method can effectively recognize the various target objects and interaction actions under complex conditions such as multiple personnel, multiple interaction relationships, and occlusion, demonstrating strong risk recognition performance. For example, in <xref ref-type="fig" rid="F6">Figure 6A</xref>, the operation is considered safe only when ladder climbing and ladder holding are detected simultaneously; otherwise, the system triggers a &#x201c;Warning: No one holding ladder.&#x201d; In <xref ref-type="fig" rid="F6">Figure 6C</xref>, a single person lifting a ladder is flagged with a &#x201c;Warning: Only one person lifting ladder.&#x201d; In <xref ref-type="fig" rid="F6">Figure 6D</xref>, a comparison of two electricity checking scenarios shows that on-site safety personnel must be present during the electricity checking operation; otherwise, a &#x201c;Warning: No safety personnel during checking&#x201d; is triggered. These experimental results validate the proposed method&#x2019;s capability for intelligent recognition of key interaction behaviors in typical power operation scenarios, providing a reliable technological foundation for operation compliance determination and safety risk warning, and demonstrating its effectiveness and application potential.</p>
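<p>The rule-based reasoning layer can be sketched as a simple mapping from the recognized interaction set to alert strings, mirroring the three warnings above; the predicate names, including the assumed &#x201c;safety supervision&#x201d; detection, are illustrative.</p>
<preformat># Rule-based risk reasoning over recognised interactions (sketch mirroring
# the warnings of Figure 6; predicate names are illustrative).
def risk_alerts(interactions):
    """interactions: list of (person_id, action) pairs from the detector."""
    actions = [a for _, a in interactions]
    alerts = []
    if "climb ladder" in actions and "hold ladder" not in actions:
        alerts.append("Warning: No one holding ladder")
    if actions.count("lift ladder") == 1:
        alerts.append("Warning: Only one person lifting ladder")
    if "electricity check" in actions and "safety supervision" not in actions:
        alerts.append("Warning: No safety personnel during checking")
    return alerts or ["Safety"]

print(risk_alerts([(1, "climb ladder")]))  # ['Warning: No one holding ladder']
</preformat>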
</sec>
</sec>
</sec>
<sec sec-type="conclusion" id="s5">
<label>5</label>
<title>Conclusion</title>
<p>To address the challenges in power operation and inspection scenarios, including complex action semantics, highly similar postures, and the difficulty of accurately interpreting multi-element interaction relationships, this paper proposes a multi-element interaction relationship parsing method based on semantic prompts and a locally-aware enhanced Transformer. The main research conclusions are as follows:<list list-type="simple">
<list-item>
<label>1.</label>
<p>Learnable prompt vectors are introduced into both the visual and linguistic modalities to construct a multimodal semantic prompt mechanism, enabling deep fusion of cross-modal semantic information. This design effectively distinguishes highly similar actions such as &#x201c;climbing vs. holding ladder&#x201d; and &#x201c;electricity checking vs. grasping test rod,&#x201d; thereby enhancing the model&#x2019;s semantic interpretation capability for complex interaction behaviors.</p>
</list-item>
<list-item>
<label>2.</label>
<p>A locally-aware enhanced Transformer structure is proposed to strengthen fine-grained feature extraction of key human body parts and local regions of tools in operational scenarios. By integrating semantic cues from textual prompts, the model achieves more precise localization in multi-person collaborative and multi-element interaction settings.</p>
</list-item>
<list-item>
<label>3.</label>
<p>A contrastive learning mechanism is employed to map human action semantics and tool interaction semantics into a shared semantic space, thereby promoting deep alignment of multimodal features. This approach enables accurate interaction recognition and joint parsing of risk factors under conditions of multi-element coupling.</p>
</list-item>
</list>
</p>
<p>Although this study has achieved promising results in improving multimodal interaction recognition capabilities, real-world operational scenarios often involve the coexistence of multiple risk factors, with potential semantic relationships and causal influences between these risks. Therefore, future research will focus on constructing a hierarchical recognition framework for scenarios with multiple coexisting risks, advancing the perception of power operation risks from &#x201c;visible&#x201d; to &#x201c;controllable,&#x201d; and realizing a more accurate and adaptive intelligent safety management system.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec sec-type="ethics-statement" id="s7">
<title>Ethics statement</title>
<p>Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p>
</sec>
<sec sec-type="author-contributions" id="s8">
<title>Author contributions</title>
<p>HZ: Conceptualization, Methodology, Investigation, Writing &#x2013; review and editing, Software. XZ: Conceptualization, Methodology, Software, Writing &#x2013; original draft, Visualization. FM: Data curation, Methodology, Conceptualization, Writing &#x2013; review and editing. QL: Writing &#x2013; review and editing, Visualization, Formal Analysis. AY: Data curation, Writing &#x2013; review and editing, Investigation. ZF: Investigation, Visualization, Writing &#x2013; review and editing.</p>
</sec>
<ack>
<title>Acknowledgements</title>
<p>The authors sincerely acknowledge Professor Bo Wang for his invaluable support and guidance throughout this research, and also thank HZ for his assistance. Appreciation is extended to Wuhan University for their support in this study, and to Fuqi Ma from Xi&#x2019;an University of Technology for his valuable suggestions regarding the research work.</p>
</ack>
<sec sec-type="COI-statement" id="s10">
<title>Conflict of interest</title>
<p>The author(s) declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="s11">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was not used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Cao</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Identification of causative factors for fatal accidents in the electric power industry using text categorization and catastrophe association analysis techniques</article-title>. <source>Alex. Eng. J.</source> <volume>102</volume>, <fpage>290</fpage>&#x2013;<lpage>308</lpage>. <pub-id pub-id-type="doi">10.1016/j.aej.2024.05.100</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Cheng</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>Skeleton-based action recognition with shift graph convolutional network</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition</source>, <fpage>183</fpage>&#x2013;<lpage>192</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00026</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book">
<collab>Energy Foundation China</collab> (<year>2022</year>). <source>China&#x2019;s 14th five-year plans on renewable energy development and modern energy system</source>.</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Feng</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Wan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Research on behavior recognition method of live working personnel based on human-object interaction detection</article-title>. <source>J. Saf. Sci. Technol.</source> <volume>20</volume>, <fpage>205</fpage>&#x2013;<lpage>211</lpage>. <pub-id pub-id-type="doi">10.11731/j.issn.1673-193x.2024.09.025</pub-id>
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Gkioxari</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Doll&#xe1;r</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2018</year>). &#x201c;<article-title>Detecting and recognizing human-object interactions</article-title>,&#x201d; in <source>Proceedings of the IEEE conference on computer vision and pattern recognition (CVPR)</source>, <fpage>8359</fpage>&#x2013;<lpage>8367</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.1704.07333</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Cai</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>DSSF: dynamic semantic sampling and fusion for one-stage human&#x2013;object interaction detection</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>71</volume>, <fpage>1</fpage>&#x2013;<lpage>13</lpage>. <pub-id pub-id-type="doi">10.1109/TIM.2022.3176899</pub-id>
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Toward a unified transformer-based framework for scene graph generation and human-object interaction detection</article-title>. <source>IEEE T. Image Process.</source> <volume>32</volume>, <fpage>6274</fpage>&#x2013;<lpage>6288</lpage>. <pub-id pub-id-type="doi">10.1109/TIP.2023.3330304</pub-id>
<pub-id pub-id-type="pmid">37948145</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Jia</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Scaling up visual and vision-language representation learning with noisy text supervision</article-title>,&#x201d; in <source>Proceedings of the international conference on machine learning (ICML)</source>, <fpage>4904</fpage>&#x2013;<lpage>4916</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2102.05918</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Khattak</surname>
<given-names>M. U.</given-names>
</name>
<name>
<surname>Rasheed</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Maaz</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Maple: multi-modal prompt learning</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>, <fpage>19113</fpage>&#x2013;<lpage>19122</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2210.03117</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Choi</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2020</year>). &#x201c;<article-title>UnionDet: union-level detector towards real-time human-object interaction detection</article-title>,&#x201d; in <source>Computer Vision&#x2013;ECCV 2020: 16th European Conference</source>, <fpage>498</fpage>&#x2013;<lpage>514</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2312.12664</pub-id>
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Kim</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lee</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>E.-S.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>H. J.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>HOTR: end-to-end human-object interaction detection with transformers</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>, <fpage>74</fpage>&#x2013;<lpage>83</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2104.13682</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Selvaraju</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Gotmare</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Align before fuse: vision and language representation learning with momentum distillation</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>34</volume>, <fpage>9694</fpage>&#x2013;<lpage>9705</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2107.07651</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Study on the interaction behaviors identification of construction workers based on ST-GCN and YOLO</article-title>. <source>Sensors</source> <volume>23</volume>, <fpage>6318</fpage>. <pub-id pub-id-type="doi">10.3390/s23146318</pub-id>
<pub-id pub-id-type="pmid">37514613</pub-id>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>M. Alharbi</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>A risk identification method for power operation scenarios using image caption and semantic text similarity analysis</article-title>. <source>IEEE T. Ind. Inf.</source> <volume>21</volume>, <fpage>4488</fpage>&#x2013;<lpage>4498</lpage>. <pub-id pub-id-type="doi">10.1109/TII.2025.3540483</pub-id>
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Liao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Lu</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Gen-VLKT: simplify association and enhance interaction understanding for HOI detection</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>, <fpage>20123</fpage>&#x2013;<lpage>20132</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2203.13954</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Detection method of the seat belt for workers at height based on UAV image and YOLO algorithm</article-title>. <source>Array</source> <volume>22</volume>, <fpage>100340</fpage>. <pub-id pub-id-type="doi">10.1016/j.array.2024.100340</pub-id>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Research on intelligent identification method of distribution grid operation safety risk based on semantic feature parsing</article-title>. <source>Int. J. Elec. Power</source> <volume>160</volume>, <fpage>110139</fpage>. <pub-id pub-id-type="doi">10.1016/j.ijepes.2024.110139</pub-id>
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Ban</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Xi</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2025</year>). <article-title>Active hard sample learning for violation action recognition in power grid operation</article-title>. <source>Information</source> <volume>16</volume>, <fpage>67</fpage>. <pub-id pub-id-type="doi">10.3390/info16010067</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Ning</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Qiu</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>HOICLIP: efficient knowledge transfer for HOI detection with vision-language models</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>, <fpage>23507</fpage>&#x2013;<lpage>23517</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2303.15786</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Peng</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Lei</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>F.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>CORY-Net: contrastive res-YOLOv5 network for intelligent safety monitoring on power grid construction sites</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>160461</fpage>&#x2013;<lpage>160470</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2021.3132301</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Radford</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J. W.</given-names>
</name>
<name>
<surname>Hallacy</surname>
<given-names>C.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Learning transferable visual models from natural language supervision</article-title>,&#x201d; in <source>Proceedings of the international conference on machine learning (ICML)</source>, <fpage>8748</fpage>&#x2013;<lpage>8763</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2103.00020</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tang</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Roberts</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Golparvar-Fard</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Human-object interaction recognition for automatic construction site safety inspection</article-title>. <source>Autom. Constr.</source> <volume>120</volume>, <fpage>103356</fpage>. <pub-id pub-id-type="doi">10.1016/j.autcon.2020.103356</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Tu</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhai</surname>
<given-names>G.</given-names>
</name>
</person-group> (<year>2023</year>). &#x201c;<article-title>Agglomerative transformer for human-object interaction detection</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF international conference on computer vision (ICCV)</source>, <fpage>21614</fpage>&#x2013;<lpage>21624</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2308.08370</pub-id>
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>X.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Skeleton-based violation action recognition method for safety supervision in operation field of distribution network based on graph convolutional network</article-title>. <source>CSEE J. Power Energy Syst.</source> <volume>9</volume>, <fpage>2179</fpage>&#x2013;<lpage>2187</lpage>. <pub-id pub-id-type="doi">10.17775/CSEEJPES.2020.03000</pub-id>
</mixed-citation>
</ref>
<ref id="B25">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xue</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Gao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Fang</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>M.</given-names>
</name>
<etal/>
</person-group> (<year>2024</year>). <article-title>Adaptive multimodal prompt for human-object interaction with local feature enhanced transformer</article-title>. <source>Appl. Intell.</source> <volume>54</volume>, <fpage>12492</fpage>&#x2013;<lpage>12504</lpage>. <pub-id pub-id-type="doi">10.1007/s10489-024-05774-7</pub-id>
</mixed-citation>
</ref>
<ref id="B26">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yang</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>SGI-YOLOv9: an effective method for crucial components detection in the power distribution network</article-title>. <source>Front. Phys.</source> <volume>12</volume>, <fpage>1517177</fpage>. <pub-id pub-id-type="doi">10.3389/fphy.2024.1517177</pub-id>
</mixed-citation>
</ref>
<ref id="B27">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>K.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Unified vision and language prompt learning</article-title>. <source>arXiv Preprint arXiv:2210.07225</source>. <pub-id pub-id-type="doi">10.48550/arXiv.2210.07225</pub-id>
</mixed-citation>
</ref>
<ref id="B28">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Liao</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Mining the benefits of two-stage and one-stage HOI detection</article-title>. <source>Adv. Neural Inf. Process. Syst.</source> <volume>34</volume>, <fpage>17209</fpage>&#x2013;<lpage>17220</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2108.05077</pub-id>
</mixed-citation>
</ref>
<ref id="B29">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhao</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Lan</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Niu</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Qian</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>W.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Detection and location of safety protective wear in power substation operation using wear-enhanced YOLOv3 algorithm</article-title>. <source>IEEE Access</source> <volume>9</volume>, <fpage>125540</fpage>&#x2013;<lpage>125549</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2021.3104731</pub-id>
</mixed-citation>
</ref>
<ref id="B30">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Ho</surname>
<given-names>E. S. L.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Shum</surname>
<given-names>H. P. H.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Geometric features enhanced human-object interaction detection</article-title>. <source>IEEE Trans. Instrum. Meas.</source> <volume>73</volume>, <fpage>5026104</fpage>. <pub-id pub-id-type="doi">10.1109/TIM.2024.3427800</pub-id>
</mixed-citation>
</ref>
<ref id="B31">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name>
<surname>Zou</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>End-to-end human-object interaction detection with HOI transformer</article-title>,&#x201d; in <source>Proceedings of the IEEE/CVF conference on computer vision and pattern recognition (CVPR)</source>, <fpage>11825</fpage>&#x2013;<lpage>11834</lpage>. <pub-id pub-id-type="doi">10.48550/arXiv.2103.04503</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn fn-type="custom" custom-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/661595/overview">Constantinos S. Psomopoulos</ext-link>, University of West Attica, Greece</p>
</fn>
<fn fn-type="custom" custom-type="reviewed-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3264570/overview">Dimitrios Barkas</ext-link>, University of West Attica, Greece</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3320327/overview">Konstantinos Kalkanis</ext-link>, University of West Attica, Greece</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3359857/overview">Chen Lv</ext-link>, China Jiliang University, China</p>
</fn>
</fn-group>
</back>
</article>