<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Plant Sci.</journal-id>
<journal-title-group>
<journal-title>Frontiers in Plant Science</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Plant Sci.</abbrev-journal-title>
</journal-title-group>
<issn pub-type="epub">1664-462X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fpls.2026.1747598</article-id>
<article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Original Research</subject>
</subj-group>
</article-categories>
<title-group>
<article-title>Lightweight plant phenotypic feature extraction via transferable attention head pruning in Vision Transformers</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Xie</surname><given-names>Yongsheng</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3273761/overview"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Zeng</surname><given-names>Xiaoxiao</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Wang</surname><given-names>Rifeng</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>*</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project-administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
</contrib>
<contrib contrib-type="author">
<name><surname>Li</surname><given-names>Wenxin</given-names></name>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &amp; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><label>1</label><institution>School of Artificial Intelligence and Center for Network and Educational Technology, Guangxi Science and Technology University</institution>, <city>Laibin</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff2"><label>2</label><institution>School of Artificial Intelligence, Guangxi Science and Technology University</institution>, <city>Laibin</city>,&#xa0;<country country="cn">China</country></aff>
<aff id="aff3"><label>3</label><institution>Smart Agriculture College (IoT Engineering College), Guangxi Science and Technology University</institution>, <city>Laibin</city>,&#xa0;<country country="cn">China</country></aff>
<author-notes>
<corresp id="c001"><label>*</label>Correspondence: Xiaoxiao Zeng, <email xlink:href="mailto:156055562@qq.com">156055562@qq.com</email>; Rifeng Wang, <email xlink:href="mailto:023505@163.com">023505@163.com</email></corresp>
</author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2026-02-13">
<day>13</day>
<month>02</month>
<year>2026</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2026</year>
</pub-date>
<volume>17</volume>
<elocation-id>1747598</elocation-id>
<history>
<date date-type="received">
<day>03</day>
<month>12</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>01</month>
<year>2026</year>
</date>
<date date-type="rev-recd">
<day>17</day>
<month>01</month>
<year>2026</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2026 Xie, Zeng, Wang and Li.</copyright-statement>
<copyright-year>2026</copyright-year>
<copyright-holder>Xie, Zeng, Wang and Li</copyright-holder>
<license>
<ali:license_ref start_date="2026-02-13">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>We propose a lightweight Multi-Head Self-Attention (MHSA) mechanism for plant phenotypic feature extraction that integrates cross-species transfer learning with dynamic head pruning to improve efficiency without sacrificing accuracy. The central challenge is to minimize redundant computation while preserving the model&#x2019;s capacity to generalize across varied plant species, a problem intensified by the high dimensionality of attention mechanisms in Vision Transformers. Our solution, the Transferable Attention Head Alignment (TAHA) framework, operates in three stages: pre-training on a source species, cross-species alignment via a Domain Alignment Loss (DAL), and head pruning based on a transferability score. The framework retains only the attention heads with the highest transferability, reducing model complexity without weakening the ability to discriminate phenotypic traits. The pruned MHSA module integrates seamlessly with standard Transformer backbones, enabling efficient deployment on edge devices. Experiments on real edge hardware (Raspberry Pi 4, NVIDIA Jetson Nano) and GPU platforms show that our approach attains accuracy comparable to full-head models while cutting computational cost by up to 40% (14.1 ms inference latency on Raspberry Pi 4, 51.9 M parameters). The method is particularly relevant for scalable plant phenotyping, where computational capacity is often constrained yet generalization across species is essential. Moreover, the iterative alignment and pruning procedure permits gradual adaptation to novel species without full retraining, increasing its feasibility for practical agricultural applications. Supplementary experiments on phylogenetically distant species (Arabidopsis &#x2192; pine) probe the framework&#x2019;s generalization limits, showing a 7.2% F1-score drop compared with close-species transfer (Arabidopsis &#x2192; maize) and highlighting the need for trait-specific head adaptation in distant transfers. The proposed method advances lightweight feature extraction by merging transfer learning with attention head optimization, achieving a balanced trade-off between performance and efficiency.</p>
</abstract>
<kwd-group>
<kwd>cross-species transfer learning</kwd>
<kwd>dynamic head pruning</kwd>
<kwd>Lightweight Multi-Head Self-Attention (MHSA)</kwd>
<kwd>plant phenotypic feature extraction</kwd>
<kwd>Transferable Attention Head Alignment (TAHA)</kwd>
</kwd-group>
<funding-group>
<funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This work was supported by the Guangxi Science and Technology Plan Project &#x201c;Research and Application of Machine Vision for Rapid Multi-Target Differential Detection&#x201d; (No. GuiKe AD23026282) and the Key R&amp;D Program Project in Guangxi &#x201c;Research and Application Demonstration of Tower and Mast Intelligent Operation and Maintenance Technology in Mountainous and Hilly Areas Based on Beidou Satellite Based Enhancement&#x201d; (No. Guike AB25069262).</funding-statement>
</funding-group>
<counts>
<fig-count count="3"/>
<table-count count="4"/>
<equation-count count="8"/>
<ref-count count="24"/>
<page-count count="10"/>
<word-count count="5055"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Technical Advances in Plant Science</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec id="s1" sec-type="intro">
<label>1</label>
<title>Introduction</title>
<p>Plant phenotyping has become a crucial activity in precision agriculture, supporting the quantitative evaluation of plant traits for crop improvement and yield estimation. Traditional methods rely on manual measurements, which are labor-intensive and prone to human error. Recent advances in deep learning, particularly convolutional neural networks (CNNs), have automated this process by extracting phenotypic features from images (<xref ref-type="bibr" rid="B1">Arya et&#xa0;al., 2022</xref>). CNNs, however, struggle to model long-range spatial relationships in plant morphology, a shortcoming that Transformer architectures address through their Multi-Head Self-Attention (MHSA) mechanism (<xref ref-type="bibr" rid="B16">Vaswani et&#xa0;al., 2017</xref>). Although Transformers capture long-range dependencies effectively, their computational demands, driven in part by redundant attention heads, limit their applicability in resource-constrained agricultural settings.</p>
<p>Existing lightweight solutions often compromise feature extraction quality. For example, techniques that eliminate attention heads do so without selectivity (<xref ref-type="bibr" rid="B13">Michel et&#xa0;al., 2019</xref>), whereas methods for adapting to specific domains overlook differences in phenotypic traits across species (<xref ref-type="bibr" rid="B19">Yan et&#xa0;al., 2021</xref>). This creates a tension between model efficiency and generalization capability, especially when transferring knowledge from well-studied species (e.g., Arabidopsis) to under-resourced crops. The problem is exacerbated by the lack of methods to identify which attention heads encode species-specific versus transferable features.</p>
<p>A critical gap in existing research is threefold: (1) Lightweight Transformers for plant phenotyping [e.g., TrIncNet (<xref ref-type="bibr" rid="B5">Gole et&#xa0;al., 2023</xref>), PMVT (<xref ref-type="bibr" rid="B11">Li et&#xa0;al., 2023</xref>)] focus on intra-species efficiency and retain redundant heads in cross-species scenarios, wasting computational resources; (2) Cross-species transfer methods [e.g., FloraBERT (<xref ref-type="bibr" rid="B9">Levy et&#xa0;al., 2022</xref>), DeepTL-Ubi (<xref ref-type="bibr" rid="B12">Liu et&#xa0;al., 2021</xref>)] optimize attention for biological data alignment but ignore computational efficiency via head pruning; (3) Attention pruning techniques [e.g., (<xref ref-type="bibr" rid="B13">Michel et&#xa0;al., 2019</xref>; <xref ref-type="bibr" rid="B17">Voita et&#xa0;al., 2019</xref>)] assess head importance in single domains and fail to account for transferability across plant species, leading to accuracy loss in distant transfers. TAHA addresses these gaps by unifying cross-species alignment and dynamic head pruning, with a transferability score that balances domain adaptation and phenotypic trait preservation.</p>
<p>We propose a new framework that closes this gap by selecting which attention heads to transfer and which to prune based on their cross-species utility. Our approach hinges on two insights: (1) certain attention heads in pre-trained Transformers capture universal plant morphological patterns, and (2) redundant heads can be identified via their contribution to domain-invariant feature alignment. Specifically, we first pre-train a base Transformer on a source plant species to initialize attention heads. Subsequently, a domain alignment loss is applied to adjust these heads for a target species, with the transferability of each head assessed based on its alignment performance and redundancy. Heads with low transferability scores are pruned, resulting in a compact yet expressive model. In contrast to previous approaches employing static pruning thresholds (<xref ref-type="bibr" rid="B22">Zheng W. et&#xa0;al., 2024</xref>), our technique adaptively modifies head retention according to cross-species feature importance.</p>
<p>The key contributions of this work are threefold. First, we introduce a metric that quantifies each attention head&#x2019;s ability to extract phenotypic features across species. Second, an iterative alignment-pruning procedure is designed to lower model complexity without losing essential attention structures. Third, we show that the pruned model generalizes across diverse plant species and attains accuracy similar to full-head Transformers while requiring far fewer computational resources. This is particularly impactful for real-world applications where labeled data for target species is scarce.</p>
<p>The remainder of this paper is organized as follows: Section 2 reviews related work on lightweight Transformers and cross-species adaptation. Section 3 introduces the MHSA mechanism and domain adaptation for plant imagery. Section 4 details our attention head transfer and pruning methodology. Section 5 evaluates the framework on several plant phenotyping benchmarks. Finally, Sections 6 and 7 discuss implications and conclude the work.</p>
<p><italic>Key distinctions from previous approaches</italic>: In contrast to combined CNN-Transformer frameworks (<xref ref-type="bibr" rid="B8">Jeon et&#xa0;al., 2025</xref>), our technique functions exclusively within the Transformer structure and eliminates the inductive biases associated with convolutions. Compared to static pruning approaches (<xref ref-type="bibr" rid="B13">Michel et&#xa0;al., 2019</xref>), we dynamically assess head utility during cross-species adaptation. Our domain alignment loss also differs from standard adversarial training (<xref ref-type="bibr" rid="B3">Ganin and Lempitsky, 2015</xref>) by explicitly optimizing for phenotypic feature preservation.</p>
</sec>
<sec id="s2">
<label>2</label>
<title>Related work</title>
<p>Progress on lightweight feature extraction for plant phenotyping builds on two key research areas: optimized Transformer architectures and cross-species transfer learning. Although both have been studied separately, their intersection has received limited attention, especially regarding the optimization of attention heads for phenotypic trait generalization.</p>
<sec id="s2_1">
<label>2.1</label>
<title>Lightweight transformers for plant phenotyping</title>
<p>Recent efforts to reduce the computational burden of Vision Transformers (ViTs) have focused on architectural modifications, such as token pruning (<xref ref-type="bibr" rid="B22">Zheng W. et&#xa0;al., 2024</xref>) and hybrid CNN-Transformer designs (<xref ref-type="bibr" rid="B2">Chakrabarty et&#xa0;al., 2024</xref>). For instance, TrIncNet (<xref ref-type="bibr" rid="B5">Gole et&#xa0;al., 2023</xref>) introduced a multi-granularity feature extraction module to identify tomato diseases, while PMVT (<xref ref-type="bibr" rid="B11">Li et&#xa0;al., 2023</xref>) adapted MobileViT for real-time plant disease detection. These methods primarily target intra-species tasks, where the model is trained and tested on the same plant species. Nevertheless, they frequently keep all attention heads without acknowledging the redundancy present in cross-species contexts.</p>
<p>A notable exception is the work on single-head self-attention for grape leaf disease identification (<xref ref-type="bibr" rid="B20">Zhang et&#xa0;al., 2025</xref>), which shows that a simplified attention mechanism can preserve accuracy for specific diseases. Nevertheless, this approach sacrifices the multi-head diversity crucial for generalizing across diverse phenotypic traits (e.g., leaf shape vs. lesion texture). TAHA addresses this limitation by selectively pruning heads based on transferability rather than eliminating multi-head attention entirely, retaining diversity while reducing redundancy.</p>
</sec>
<sec id="s2_2">
<label>2.2</label>
<title>Cross-species transfer learning in plant phenomics</title>
<p>Transfer learning has been extensively employed to address data limitations in plant phenotyping, especially for less-researched species. Early approaches relied on CNNs pretrained on ImageNet, fine-tuning them for target species (<xref ref-type="bibr" rid="B19">Yan et&#xa0;al., 2021</xref>). However, these methods struggle with domain shifts caused by interspecies morphological differences, such as leaf venation patterns or canopy structures.</p>
<p>More recent work has explored domain adaptation techniques tailored to biological data. FloraBERT (<xref ref-type="bibr" rid="B9">Levy et&#xa0;al., 2022</xref>) adopted attention mechanisms to align gene expression patterns across species, whereas DeepTL-Ubi (<xref ref-type="bibr" rid="B12">Liu et&#xa0;al., 2021</xref>) applied transfer learning for ubiquitination site prediction. These methods highlight the potential of attention-based alignment but do not optimize the attention mechanism itself for cross-species efficiency&#x2014;retaining full computational overhead even when heads are irrelevant to target species. TAHA fills this gap by pruning low-transferability heads, reducing FLOPs by 40.9% while preserving alignment quality.</p>
<p>A parallel line of research has investigated feature space alignment for plant communities (<xref ref-type="bibr" rid="B7">Hirn et&#xa0;al., 2024</xref>) and bioacoustic species classification (<xref ref-type="bibr" rid="B24">Zhong et&#xa0;al., 2020</xref>). These studies highlight the necessity of retaining discriminative features in domain adaptation, an idea we articulate with our Phenotypic Consistency Constraint (PCC).</p>
</sec>
<sec id="s2_3">
<label>2.3</label>
<title>Attention head analysis and pruning</title>
<p>Analyses of attention heads in large language models have uncovered specialized functions, including positional encoding and syntactic parsing (<xref ref-type="bibr" rid="B23">Zheng Z. et&#xa0;al., 2024</xref>). Comparable specialization likely occurs in visual processing, where specific heads may detect features common across species, such as leaf outlines or texture patterns.</p>
<p>Pruning methods have been proposed to eliminate redundant heads, either statically (<xref ref-type="bibr" rid="B13">Michel et&#xa0;al., 2019</xref>) or dynamically during training (<xref ref-type="bibr" rid="B17">Voita et&#xa0;al., 2019</xref>). Nevertheless, these methods usually assess the importance of heads only in one specific area, failing to account for their usefulness across different domains. For example, a head specialized in Arabidopsis leaf shape may be redundant for maize but critical for rice&#x2014;static pruning cannot adapt to such cross-species differences. Our Head Transferability Score (HTS) extends these ideas by evaluating heads through both domain alignment and phenotypic prediction metrics, ensuring pruning decisions are transferability-aware rather than domain-specific.</p>
<p>The proposed method distinguishes itself by unifying these research threads. In contrast to (<xref ref-type="bibr" rid="B5">Gole et&#xa0;al., 2023</xref>) or (<xref ref-type="bibr" rid="B20">Zhang et&#xa0;al., 2025</xref>), our approach focuses on optimizing attention heads specifically for cross-species transfer rather than efficiency within a single species. Compared to (<xref ref-type="bibr" rid="B9">Levy et&#xa0;al., 2022</xref>) or (<xref ref-type="bibr" rid="B12">Liu et&#xa0;al., 2021</xref>), we address computational efficiency through principled head pruning rather than solely focusing on accuracy. Our Domain Alignment Loss (DAL) and Phenotypic Consistency Constraint (PCC) deliver a more refined strategy for retaining features compared to conventional domain adaptation methods (<xref ref-type="bibr" rid="B19">Yan et&#xa0;al., 2021</xref>), whereas our iterative pruning framework grants increased adaptability relative to fixed architectures (<xref ref-type="bibr" rid="B22">Zheng W. et&#xa0;al., 2024</xref>). This pairing yields a streamlined yet broadly applicable approach for assessing plant traits in various species.</p>
</sec>
</sec>
<sec id="s3">
<label>3</label>
<title>Preliminaries: multi-head self-attention and domain adaptation for plant imagery</title>
<p>To establish the theoretical foundation for our approach, we first formalize the key components of Vision Transformers and domain adaptation techniques as applied to plant phenotyping. This section outlines the essential background and highlights the challenges that distinguish plant imagery from conventional computer vision applications.</p>
<sec id="s3_1">
<label>3.1</label>
<title>Multi-head self-attention in vision transformers</title>
<p>The Multi-Head Self-Attention (MHSA) mechanism operates on input features by employing numerous parallel attention heads, each capturing different dimensions of the visual information. Given an input feature map <inline-formula>
<mml:math display="inline" id="im1"><mml:mrow><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>n</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> where <inline-formula>
<mml:math display="inline" id="im2"><mml:mi>n</mml:mi></mml:math></inline-formula> is the number of patches and <inline-formula>
<mml:math display="inline" id="im3"><mml:mi>d</mml:mi></mml:math></inline-formula> the embedding dimension, each head <inline-formula>
<mml:math display="inline" id="im4"><mml:mi>h</mml:mi></mml:math></inline-formula> computes (<xref ref-type="disp-formula" rid="eq1">Equation 1</xref>):</p>
<disp-formula id="eq1"><label>(1)</label>
<mml:math display="block" id="M1"><mml:mrow><mml:msub><mml:mrow><mml:mtext>Attention</mml:mtext></mml:mrow><mml:mi>h</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>K</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>V</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext>softmax</mml:mtext><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:msubsup><mml:mi>K</mml:mi><mml:mi>h</mml:mi><mml:mi>T</mml:mi></mml:msubsup></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow><mml:msub><mml:mi>V</mml:mi><mml:mi>h</mml:mi></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im5"><mml:mrow><mml:msub><mml:mi>Q</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mi>h</mml:mi><mml:mi>Q</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>, <inline-formula>
<mml:math display="inline" id="im6"><mml:mrow><mml:msub><mml:mi>K</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mi>h</mml:mi><mml:mi>K</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula>, and <inline-formula>
<mml:math display="inline" id="im7"><mml:mrow><mml:msub><mml:mi>V</mml:mi><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>X</mml:mi><mml:msubsup><mml:mi>W</mml:mi><mml:mi>h</mml:mi><mml:mi>V</mml:mi></mml:msubsup></mml:mrow></mml:math></inline-formula> are learned linear projections for queries, keys, and values respectively, with <inline-formula>
<mml:math display="inline" id="im8"><mml:mrow><mml:msubsup><mml:mi>W</mml:mi><mml:mi>h</mml:mi><mml:mi>Q</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>W</mml:mi><mml:mi>h</mml:mi><mml:mi>K</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>W</mml:mi><mml:mi>h</mml:mi><mml:mi>V</mml:mi></mml:msubsup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mo>&#xd7;</mml:mo><mml:msub><mml:mi>d</mml:mi><mml:mi>k</mml:mi></mml:msub></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>. All head outputs are joined together and transformed via projection. As shown in <xref ref-type="disp-formula" rid="eq2">Equation 2</xref>.</p>
<disp-formula id="eq2"><label>(2)</label>
<mml:math display="block" id="M2"><mml:mrow><mml:mtext>MHSA</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:mtext>Concat</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mtext>Attention</mml:mtext></mml:mrow><mml:mn>1</mml:mn></mml:msub><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mtext>Attention</mml:mtext></mml:mrow><mml:mi>H</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msup><mml:mi>W</mml:mi><mml:mi>O</mml:mi></mml:msup></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im9"><mml:mrow><mml:msup><mml:mi>W</mml:mi><mml:mi>O</mml:mi></mml:msup><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>H</mml:mi><mml:msub><mml:mi>d</mml:mi><mml:mi>v</mml:mi></mml:msub><mml:mo>&#xd7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im10"><mml:mi>H</mml:mi></mml:math></inline-formula> is the number of heads. For botanical imagery, this approach acquires both fine-scale features (e.g., leaf venation) and overarching organization (e.g., plant morphology), though with notable repetition among attention heads (<xref ref-type="bibr" rid="B13">Michel et&#xa0;al., 2019</xref>).</p>
</sec>
<sec id="s3_2">
<label>3.2</label>
<title>Domain adaptation challenges in plant phenotyping</title>
<p>Domain adaptation for plant imagery must address three unique challenges absent in general computer vision: (1) interspecies morphological variations (e.g., monocot vs. dicot leaf patterns), (2) phenotypic plasticity under environmental conditions, and (3) limited labeled data for target species. Conventional domain adaptation approaches such as Maximum Mean Discrepancy (MMD) (<xref ref-type="bibr" rid="B6">Gretton et&#xa0;al., 2012</xref>) or adversarial training (<xref ref-type="bibr" rid="B3">Ganin and Lempitsky, 2015</xref>) often fail to preserve species-specific phenotypic traits during alignment.</p>
<p>The domain shift between source species <inline-formula>
<mml:math display="inline" id="im11"><mml:mi>S</mml:mi></mml:math></inline-formula> and target species <inline-formula>
<mml:math display="inline" id="im12"><mml:mi>T</mml:mi></mml:math></inline-formula> can be quantified through the discrepancy in their feature distributions, as shown in <xref ref-type="disp-formula" rid="eq3">Equation 3</xref>:</p>
<disp-formula id="eq3"><label>(3)</label>
<mml:math display="block" id="M3"><mml:mrow><mml:mi mathvariant="script">D</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>S</mml:mi><mml:mo>,</mml:mo><mml:mi>T</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mtext>sup</mml:mtext></mml:mrow><mml:mrow><mml:mi>f</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mi>&#x2131;</mml:mi></mml:mrow></mml:munder><mml:mrow><mml:mo>|</mml:mo><mml:mrow><mml:msub><mml:mi mathvariant="double-struck">E</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x223c;</mml:mo><mml:mi>S</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi mathvariant="double-struck">E</mml:mi><mml:mrow><mml:mi>x</mml:mi><mml:mo>&#x223c;</mml:mo><mml:mi>T</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mi>f</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow><mml:mo>|</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im13"><mml:mi>&#x2131;</mml:mi></mml:math></inline-formula> is a class of functions (e.g., attention heads). For plant phenotyping, we must minimize <inline-formula>
<mml:math display="inline" id="im14"><mml:mrow><mml:mi mathvariant="script">D</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>S</mml:mi><mml:mo>,</mml:mo><mml:mi>T</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> while preserving the discriminative power for phenotypic traits&#x2014;a dual objective not addressed by standard domain adaptation approaches (<xref ref-type="bibr" rid="B4">Ganin et&#xa0;al., 2016</xref>).</p>
</sec>
<sec id="s3_3">
<label>3.3</label>
<title>Attention head specialization in plant vision</title>
<p>Empirical research indicates that attention heads in plant vision models take on distinct functional roles, for instance:</p>
<list list-type="order">
<list-item>
<p>Morphological heads: Capture species-invariant structures (e.g., leaf shapes).</p></list-item>
<list-item>
<p>Texture heads: Detect surface patterns (e.g., stomata distribution).</p></list-item>
<list-item>
<p>Contextual heads: Model relationships between plants and their surroundings, such as light interception.</p></list-item>
</list>
<p>This specialization suggests that only a subset of heads may be essential for cross-species transfer, while the remainder encode species-specific features that impede generalization. Our approach builds on this insight by selectively transferring and pruning individual heads, in contrast to previous methods that treat all heads uniformly during adaptation (<xref ref-type="bibr" rid="B10">Li et&#xa0;al., 2018</xref>).</p>
</sec>
</sec>
<sec id="s4">
<label>4</label>
<title>Cross-species attention head transfer and redundancy pruning</title>
<p>The proposed framework addresses the challenge of transferring attention heads across plant species while eliminating redundant computations. The approach comprises four interrelated elements: transferable attention head alignment, head transferability scoring, selective pruning guided by cross-species utility, and integration of the pruned attention mechanism into a lightweight architecture, as shown in <xref ref-type="fig" rid="f1"><bold>Figure&#xa0;1</bold></xref>.</p>
<fig id="f1" position="float">
<label>Figure&#xa0;1</label>
<caption>
<p>TAHA workflow: alignment, pruning, and fine-tuning.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1747598-g001.tif">
<alt-text content-type="machine-generated">Flowchart of the TAHA Framework showing a sequence of processes. Source Species Data is fed into Pre-trained MHSA, then into Cross-Species Alignment, followed by Head Pruning and Fine-Tuning, leading to the Output Layer. Target Species Data also inputs into Cross-Species Alignment.</alt-text>
</graphic></fig>
<sec id="s4_1">
<label>4.1</label>
<title>Transferable attention head alignment framework</title>
<p>The alignment process begins with a pre-trained Transformer model on the source species, where each attention head has learned specific feature extraction patterns. For a given input image <inline-formula>
<mml:math display="inline" id="im15"><mml:mi>x</mml:mi></mml:math></inline-formula>, the attention map of head <inline-formula>
<mml:math display="inline" id="im16"><mml:mi>h</mml:mi></mml:math></inline-formula> in the source model is denoted as <inline-formula>
<mml:math display="inline" id="im17"><mml:mrow><mml:msubsup><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>A</mml:mi></mml:mstyle><mml:mi>h</mml:mi><mml:mi>s</mml:mi></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mi>P</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>P</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula>, where <inline-formula>
<mml:math display="inline" id="im18"><mml:mi>P</mml:mi></mml:math></inline-formula> represents the number of patches. When processing target species data, we compute the corresponding attention map <inline-formula>
<mml:math display="inline" id="im19"><mml:mrow><mml:msubsup><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>A</mml:mi></mml:mstyle><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> using the same head parameters.</p>
<p>The Domain Alignment Loss (DAL) measures the discrepancy between source and target attention patterns, as shown in <xref ref-type="disp-formula" rid="eq4">Equation 4</xref>:</p>
<disp-formula id="eq4"><label>(4)</label>
<mml:math display="block" id="M4"><mml:mrow><mml:msubsup><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>DAL</mml:mtext></mml:mrow><mml:mi>h</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mtext>KL</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>A</mml:mi></mml:mstyle><mml:mi>h</mml:mi><mml:mi>s</mml:mi></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2225;</mml:mo><mml:msubsup><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>A</mml:mi></mml:mstyle><mml:mi>h</mml:mi><mml:mi>t</mml:mi></mml:msubsup><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im20"><mml:mi>N</mml:mi></mml:math></inline-formula> is the batch size and KL denotes the Kullback-Leibler divergence. At the same time, the Phenotypic Consistency Constraint (PCC) guarantees the transferred heads keep their ability to distinguish traits of the target species. As shown in <xref ref-type="disp-formula" rid="eq5">Equation 5</xref>.</p>
<disp-formula id="eq5"><label>(5)</label>
<mml:math display="block" id="M5"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>PCC</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:mo>&#x2225;</mml:mo><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>y</mml:mi></mml:mstyle><mml:mi>i</mml:mi></mml:msub><mml:mo>&#x2212;</mml:mo><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mtext>MLP</mml:mtext></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>Z</mml:mi></mml:mstyle><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:msubsup><mml:mo>&#x2225;</mml:mo><mml:mn>2</mml:mn><mml:mn>2</mml:mn></mml:msubsup></mml:mrow></mml:math>
</disp-formula>
<p>Here, <inline-formula>
<mml:math display="inline" id="im21"><mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>Z</mml:mi></mml:mstyle><mml:mi>i</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>Z</mml:mi></mml:mstyle><mml:mn>1</mml:mn></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>Z</mml:mi></mml:mstyle><mml:mi>H</mml:mi></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mi>x</mml:mi><mml:mi>i</mml:mi></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> represents the concatenated outputs of all attention heads, and <inline-formula>
<mml:math display="inline" id="im22"><mml:mrow><mml:msub><mml:mi>f</mml:mi><mml:mrow><mml:mtext>MLP</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula> is a multilayer perceptron for phenotypic prediction. For datasets with differing phenotypic traits (e.g., PlantVillage focuses on disease classes, CropDeep on growth traits), PCC is computed on shared morphological traits (e.g., leaf shape, venation density) after feature standardization, ensuring consistency across trait types. The correlation score in HTS uses Pearson correlation between head outputs and standardized target labels, mitigating the impact of trait differences.</p>
</sec>
<sec id="s4_2">
<label>4.2</label>
<title>Calculation of head transferability score</title>
<p>Each head&#x2019;s transferability is measured by combining its alignment quality and phenotypic relevance, as shown in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>:</p>
<disp-formula id="eq6"><label>(6)</label>
<mml:math display="block" id="M6"><mml:mrow><mml:msub><mml:mrow><mml:mtext>HTS</mml:mtext></mml:mrow><mml:mi>h</mml:mi></mml:msub><mml:mo>=</mml:mo><mml:mi>&#x3b1;</mml:mi><mml:mo>&#xb7;</mml:mo><mml:mtext>exp</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:msubsup><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>DAL</mml:mtext></mml:mrow><mml:mi>h</mml:mi></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3b1;</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#xb7;</mml:mo><mml:mtext>Corr</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>Z</mml:mi></mml:mstyle><mml:mi>h</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>y</mml:mi></mml:mstyle></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im23"><mml:mrow><mml:mi>&#x3b1;</mml:mi><mml:mo>&#x2208;</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> balances the two criteria, and <inline-formula>
<mml:math display="inline" id="im24"><mml:mrow><mml:mtext>Corr</mml:mtext><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#xb7;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> computes the Pearson correlation between head outputs and target labels. The exponential term transforms the alignment loss into a positive score, with better-aligned heads receiving higher values.</p>
</sec>
<sec id="s4_3">
<label>4.3</label>
<title>Cross-species dynamic head pruning</title>
<p>Heads are pruned based on their HTS relative to a dynamic threshold <inline-formula>
<mml:math display="inline" id="im25"><mml:mi>&#x3c4;</mml:mi></mml:math></inline-formula>, as shown in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>:</p>
<disp-formula id="eq7"><label>(7)</label>
<mml:math display="block" id="M7"><mml:mrow><mml:mi>&#x3c4;</mml:mi><mml:mo>=</mml:mo><mml:mi>&#x3bc;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>HTS</mml:mtext></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mi>&#x3b2;</mml:mi><mml:mo>&#xb7;</mml:mo><mml:mi>&#x3c3;</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mtext>HTS</mml:mtext></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im26"><mml:mi>&#x3bc;</mml:mi></mml:math></inline-formula> and <inline-formula>
<mml:math display="inline" id="im27"><mml:mi>&#x3c3;</mml:mi></mml:math></inline-formula> denote the mean and standard deviation of all heads&#x2019; HTS values, and <inline-formula>
<mml:math display="inline" id="im28"><mml:mi>&#x3b2;</mml:mi></mml:math></inline-formula> controls the pruning aggressiveness. The remaining heads form a compact set <inline-formula>
<mml:math display="inline" id="im29"><mml:mrow><mml:msup><mml:mi>&#x210b;</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mo>=</mml:mo><mml:mo>{</mml:mo><mml:mi>h</mml:mi><mml:mo>|</mml:mo><mml:msub><mml:mrow><mml:mtext>HTS</mml:mtext></mml:mrow><mml:mi>h</mml:mi></mml:msub><mml:mo>&#x2265;</mml:mo><mml:mi>&#x3c4;</mml:mi><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula>, reducing the computational complexity from <inline-formula>
<mml:math display="inline" id="im30"><mml:mrow><mml:mi mathvariant="script">O</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>H</mml:mi><mml:msup><mml:mi>P</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula> to <inline-formula>
<mml:math display="inline" id="im31"><mml:mrow><mml:mi mathvariant="script">O</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msup><mml:mi>&#x210b;</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mo>|</mml:mo></mml:mrow><mml:msup><mml:mi>P</mml:mi><mml:mn>2</mml:mn></mml:msup><mml:mi>d</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow></mml:math></inline-formula>.</p>
</sec>
<sec id="s4_4">
<label>4.4</label>
<title>Lightweight MHSA integration</title>
<p>The pruned attention mechanism computes its output using only the retained heads, as shown in <xref ref-type="disp-formula" rid="eq8">Equation 8</xref>:</p>
<disp-formula id="eq8"><label>(8)</label>
<mml:math display="block" id="M8"><mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>Z</mml:mi></mml:mstyle><mml:mrow><mml:mtext>out</mml:mtext></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mo>{</mml:mo><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>Z</mml:mi></mml:mstyle><mml:mi>h</mml:mi></mml:msub><mml:mo>}</mml:mo></mml:mrow><mml:mrow><mml:mi>h</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x210b;</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">]</mml:mo></mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>W</mml:mi></mml:mstyle><mml:msup><mml:mi>o</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:msub></mml:mrow></mml:math>
</disp-formula>
<p>where <inline-formula>
<mml:math display="inline" id="im32"><mml:mrow><mml:msub><mml:mstyle mathvariant="bold" mathsize="normal"><mml:mi>W</mml:mi></mml:mstyle><mml:msup><mml:mi>o</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup></mml:msub><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mi>&#x211d;</mml:mi><mml:mrow><mml:mrow><mml:mo>|</mml:mo><mml:msup><mml:mi>&#x210b;</mml:mi><mml:mo>&#x2032;</mml:mo></mml:msup><mml:mo>|</mml:mo></mml:mrow><mml:mi>d</mml:mi><mml:mo>&#xd7;</mml:mo><mml:mi>d</mml:mi></mml:mrow></mml:msup></mml:mrow></mml:math></inline-formula> is a reduced projection matrix. This approach preserves the initial feature dimension (d) while reducing the quantity of attention operations.</p>
</sec>
<sec id="s4_5">
<label>4.5</label>
<title>Iterative alignment and pruning</title>
<p>The complete adaptation process follows an iterative procedure:</p>
<list list-type="order">
<list-item>
<p>Compute attention maps for target species data.</p></list-item>
<list-item>
<p>Update head parameters to minimize <inline-formula>
<mml:math display="inline" id="im33"><mml:mrow><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>DAL</mml:mtext></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>&#x3bb;</mml:mi><mml:msub><mml:mi>&#x2112;</mml:mi><mml:mrow><mml:mtext>PCC</mml:mtext></mml:mrow></mml:msub></mml:mrow></mml:math></inline-formula>.</p></list-item>
<list-item>
<p>Evaluate HTS for all heads.</p></list-item>
<list-item>
<p>Prune heads below threshold <inline-formula>
<mml:math display="inline" id="im34"><mml:mi>&#x3c4;</mml:mi></mml:math></inline-formula>.</p></list-item>
<list-item>
<p>Fine-tune remaining heads on target data.</p></list-item>
</list>
<p>This cycle repeats until convergence, with <inline-formula>
<mml:math display="inline" id="im35"><mml:mi>&#x3bb;</mml:mi></mml:math></inline-formula> gradually increased to prioritize phenotypic prediction accuracy in later stages. The step-by-step method supports gradual adjustment while retaining transferable attributes without abrupt loss.TAHA operates under semi-supervised domain adaptation: it requires 10&#x2013;15% labeled target data to compute PCC and HTS, rather than fully unsupervised. This balances label efficiency and alignment quality, critical for plant phenotyping where labeled data is often scarce.</p>
</sec>
</sec>
<sec id="s5">
<label>5</label>
<title>Experiments</title>
<p>To assess the efficacy of our proposed approach, we performed extensive experiments on various plant species and phenotypic attributes. The evaluation focuses on three key aspects: (1) comparative performance against baseline methods, (2) analysis of attention head transferability patterns, and (3) computational efficiency gains from head pruning.</p>
<sec id="s5_1">
<label>5.1</label>
<title>Experimental setup</title>
<p><xref ref-type="table" rid="T1"><bold>Table&#xa0;1</bold></xref> summarizes the datasets used in this study, including their species coverage, task types, and sample sizes. All datasets are used for classification tasks (disease type, growth stage, or phenotypic trait category), as plant phenotyping in this work focuses on categorical attribute prediction.</p>
<table-wrap id="T1" position="float">
<label>Table&#xa0;1</label>
<caption>
<p>Dataset summary.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Dataset</th>
<th valign="middle" align="left">Plant species</th>
<th valign="middle" align="left">Task type</th>
<th valign="middle" align="left">Number of samples</th>
<th valign="middle" align="left">Key phenotypic traits</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">PlantVillage (<xref ref-type="bibr" rid="B14">Moupojou et&#xa0;al., 2023</xref>)</td>
<td valign="middle" align="left">38 species (tomato, maize, wheat, etc.)</td>
<td valign="middle" align="left">Disease classification (17 classes)</td>
<td valign="middle" align="left">54,305</td>
<td valign="middle" align="left">Leaf lesions, color changes, texture anomalies</td>
</tr>
<tr>
<td valign="middle" align="left">AraPheno (<xref ref-type="bibr" rid="B15">Seren et&#xa0;al., 2016</xref>)</td>
<td valign="middle" align="left">Arabidopsis thaliana (ecotypes)</td>
<td valign="middle" align="left">Growth stage classification (6 stages)</td>
<td valign="middle" align="left">12,000</td>
<td valign="middle" align="left">Rosette size, leaf count, flowering status</td>
</tr>
<tr>
<td valign="middle" align="left">CropDeep (<xref ref-type="bibr" rid="B21">Zheng et&#xa0;al., 2019</xref>)</td>
<td valign="middle" align="left">Maize, rice, wheat</td>
<td valign="middle" align="left">Phenotypic trait classification (23 classes)</td>
<td valign="middle" align="left">31,000</td>
<td valign="middle" align="left">Plant height, leaf area, tiller number</td>
</tr>
<tr>
<td valign="middle" align="left">Supplementary (<xref ref-type="bibr" rid="B18">Xie et&#xa0;al., 2025</xref>)</td>
<td valign="middle" align="left">Arabidopsis (dicot), pine (gymnosperm)</td>
<td valign="middle" align="left">Disease/growth stage classification</td>
<td valign="middle" align="left">8,000</td>
<td valign="middle" align="left">Needle texture (pine), leaf shape (Arabidopsis)</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>We compared TAHA against four state-of-the-art approaches:</p>
<list list-type="order">
<list-item>
<p>Full-head Vision Transformer (ViT-Base) (<xref ref-type="bibr" rid="B16">Vaswani et&#xa0;al., 2017</xref>).</p></list-item>
<list-item>
<p>Random head pruning (RHP) (<xref ref-type="bibr" rid="B13">Michel et&#xa0;al., 2019</xref>).</p></list-item>
<list-item>
<p>Domain-Adversarial Neural Network (DANN) (<xref ref-type="bibr" rid="B3">Ganin and Lempitsky, 2015</xref>).</p></list-item>
<list-item>
<p>Lightweight Hybrid CNN-Transformer (LHCT) (<xref ref-type="bibr" rid="B5">Gole et&#xa0;al., 2023</xref>).</p></list-item>
</list>
<p><bold>Implementation Details</bold>: All models were implemented in PyTorch and trained on NVIDIA V100 GPUs. Our model architecture employed ViT-Base (12 attention heads) as the core component, processing input images scaled to 224&#xd7;224 resolution. The hyperparameters were set as: &#x3b1;=0.6 in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>, &#x3b2;=1.5 in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>, and &#x3bb; initialized to 0.1 with linear warmup. The training procedure employed the Adam optimizer with a learning rate of 3e-5 and a batch size of 32. Edge deployment was evaluated on a Raspberry Pi 4 (4 GB RAM, Cortex-A72) and an NVIDIA Jetson Nano (4 GB RAM, Maxwell GPU), on which inference latency and throughput were measured.</p>
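<p>For clarity, the hyperparameters above can be grouped into a single configuration object; the sketch below is an illustrative grouping only and does not reflect the authors&#x2019; actual codebase.</p>
<code language="python">from dataclasses import dataclass

@dataclass
class TAHAConfig:
    """Hyperparameters reported in Section 5.1 (illustrative grouping)."""
    backbone: str = "vit_base"            # ViT-Base, 12 attention heads
    image_size: int = 224                 # 224 x 224 input resolution
    alpha: float = 0.6                    # Equation 6 weighting
    beta: float = 1.5                     # Equation 7 pruning aggressiveness
    lam_init: float = 0.1                 # PCC weight, linear warmup
    learning_rate: float = 3e-5           # Adam
    batch_size: int = 32
    labeled_target_fraction: float = 0.15 # 10-15% labeled target data
</code>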
<p>Evaluation Metrics: We measured:</p>
<list list-type="bullet">
<list-item>
<p>Phenotypic prediction accuracy (Top-1 and mAP).</p></list-item>
<list-item>
<p>Computational efficiency (FLOPs and parameters).</p></list-item>
<list-item>
<p>Cross-species transferability (HTS distribution).</p></list-item>
</list>
</sec>
<sec id="s5_2">
<label>5.2</label>
<title>Comparative results</title>
<p><xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref> updates the performance comparison with standard deviations and statistical significance (p-values from paired t-tests against TAHA). TAHA achieves comparable accuracy to full-head ViT while reducing computational costs by 38&#x2013;42%, with statistically significant improvements over DANN (p&lt;0.05) and RHP (p&lt;0.01) on all datasets.</p>
<table-wrap id="T2" position="float">
<label>Table&#xa0;2</label>
<caption>
<p>Performance comparison across species transfer tasks.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Method</th>
<th valign="middle" align="left">Source&#x2192;Target</th>
<th valign="middle" align="left">Top-1 (%)</th>
<th valign="middle" align="left">mAP (%)</th>
<th valign="middle" align="left">FLOPs (G)</th>
<th valign="middle" align="left">Params (M)</th>
<th valign="middle" align="left">Inference latency (ms)</th>
<th valign="middle" align="left">p-value</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">ViT-Base</td>
<td valign="middle" align="left">Arabidopsis&#x2192;Maize</td>
<td valign="middle" align="left">78.3&#xa0;&#xb1;&#xa0;0.8</td>
<td valign="middle" align="left">72.1&#xa0;&#xb1;&#xa0;1.1</td>
<td valign="middle" align="left">17.6</td>
<td valign="middle" align="left">86.7</td>
<td valign="middle" align="left">23.4&#xa0;&#xb1;&#xa0;1.2</td>
<td valign="middle" align="left">0.12</td>
</tr>
<tr>
<td valign="middle" align="left">RHP</td>
<td valign="middle" align="left">Arabidopsis&#x2192;Maize</td>
<td valign="middle" align="left">71.2&#xa0;&#xb1;&#xa0;1.3</td>
<td valign="middle" align="left">65.4&#xa0;&#xb1;&#xa0;1.5</td>
<td valign="middle" align="left">10.2</td>
<td valign="middle" align="left">52.1</td>
<td valign="middle" align="left">18.7&#xa0;&#xb1;&#xa0;0.9</td>
<td valign="middle" align="left">&lt;0.01</td>
</tr>
<tr>
<td valign="middle" align="left">DANN</td>
<td valign="middle" align="left">Arabidopsis&#x2192;Maize</td>
<td valign="middle" align="left">73.8&#xa0;&#xb1;&#xa0;1.0</td>
<td valign="middle" align="left">66.9&#xa0;&#xb1;&#xa0;1.2</td>
<td valign="middle" align="left">17.6</td>
<td valign="middle" align="left">86.7</td>
<td valign="middle" align="left">25.1&#xa0;&#xb1;&#xa0;1.5</td>
<td valign="middle" align="left">&lt;0.05</td>
</tr>
<tr>
<td valign="middle" align="left">LHCT</td>
<td valign="middle" align="left">Arabidopsis&#x2192;Maize</td>
<td valign="middle" align="left">75.1&#xa0;&#xb1;&#xa0;0.9</td>
<td valign="middle" align="left">68.3&#xa0;&#xb1;&#xa0;1.0</td>
<td valign="middle" align="left">9.8</td>
<td valign="middle" align="left">48.3</td>
<td valign="middle" align="left">16.3&#xa0;&#xb1;&#xa0;0.8</td>
<td valign="middle" align="left">&lt;0.05</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>TAHA (Ours)</bold></td>
<td valign="middle" align="left"><bold>Arabidopsis&#x2192;Maize</bold></td>
<td valign="middle" align="left"><bold>77.6&#xa0;&#xb1;&#xa0;0.7</bold></td>
<td valign="middle" align="left"><bold>71.4&#xa0;&#xb1;&#xa0;0.9</bold></td>
<td valign="middle" align="left"><bold>10.4</bold></td>
<td valign="middle" align="left"><bold>51.9</bold></td>
<td valign="middle" align="left"><bold>14.1&#xa0;&#xb1;&#xa0;0.6</bold></td>
<td valign="middle" align="left"><bold>&#x2014;</bold></td>
</tr>
<tr>
<td valign="middle" align="left">ViT-Base</td>
<td valign="middle" align="left">Tomato&#x2192;Wheat</td>
<td valign="middle" align="left">82.4&#xa0;&#xb1;&#xa0;0.6</td>
<td valign="middle" align="left">76.5&#xa0;&#xb1;&#xa0;0.8</td>
<td valign="middle" align="left">17.6</td>
<td valign="middle" align="left">86.7</td>
<td valign="middle" align="left">22.8&#xa0;&#xb1;&#xa0;1.0</td>
<td valign="middle" align="left">0.08</td>
</tr>
<tr>
<td valign="middle" align="left">RHP</td>
<td valign="middle" align="left">Tomato&#x2192;Wheat</td>
<td valign="middle" align="left">74.6&#xa0;&#xb1;&#xa0;1.1</td>
<td valign="middle" align="left">68.2&#xa0;&#xb1;&#xa0;1.3</td>
<td valign="middle" align="left">10.2</td>
<td valign="middle" align="left">52.1</td>
<td valign="middle" align="left">17.9&#xa0;&#xb1;&#xa0;0.7</td>
<td valign="middle" align="left">&lt;0.01</td>
</tr>
<tr>
<td valign="middle" align="left">DANN</td>
<td valign="middle" align="left">Tomato&#x2192;Wheat</td>
<td valign="middle" align="left">78.1&#xa0;&#xb1;&#xa0;0.9</td>
<td valign="middle" align="left">71.3&#xa0;&#xb1;&#xa0;1.1</td>
<td valign="middle" align="left">17.6</td>
<td valign="middle" align="left">86.7</td>
<td valign="middle" align="left">24.5&#xa0;&#xb1;&#xa0;1.3</td>
<td valign="middle" align="left">&lt;0.05</td>
</tr>
<tr>
<td valign="middle" align="left">LHCT</td>
<td valign="middle" align="left">Tomato&#x2192;Wheat</td>
<td valign="middle" align="left">79.3&#xa0;&#xb1;&#xa0;0.8</td>
<td valign="middle" align="left">72.7&#xa0;&#xb1;&#xa0;0.9</td>
<td valign="middle" align="left">9.8</td>
<td valign="middle" align="left">48.3</td>
<td valign="middle" align="left">15.7&#xa0;&#xb1;&#xa0;0.7</td>
<td valign="middle" align="left">&lt;0.05</td>
</tr>
<tr>
<td valign="middle" align="left"><bold>TAHA (Ours)</bold></td>
<td valign="middle" align="left"><bold>Tomato&#x2192;Wheat</bold></td>
<td valign="middle" align="left"><bold>81.7&#xa0;&#xb1;&#xa0;0.6</bold></td>
<td valign="middle" align="left"><bold>75.8&#xa0;&#xb1;&#xa0;0.8</bold></td>
<td valign="middle" align="left"><bold>10.4</bold></td>
<td valign="middle" align="left"><bold>51.9</bold></td>
<td valign="middle" align="left"><bold>13.8&#xa0;&#xb1;&#xa0;0.5</bold></td>
<td valign="middle" align="left"><bold>&#x2014;</bold></td>
</tr>
<tr>
<td valign="middle" align="left"><bold>TAHA (Ours)</bold></td>
<td valign="middle" align="left"><bold>Arabidopsis&#x2192;Pine</bold></td>
<td valign="middle" align="left"><bold>70.4&#xa0;&#xb1;&#xa0;1.2</bold></td>
<td valign="middle" align="left"><bold>64.2&#xa0;&#xb1;&#xa0;1.4</bold></td>
<td valign="middle" align="left"><bold>10.4</bold></td>
<td valign="middle" align="left"><bold>51.9</bold></td>
<td valign="middle" align="left"><bold>14.5&#xa0;&#xb1;&#xa0;0.7</bold></td>
<td valign="middle" align="left"><bold>&#x2014;</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn>
<p>Key insights from <xref ref-type="table" rid="T2"><bold>Table&#xa0;2</bold></xref>: (1) TAHA&#x2019;s Top-1 accuracy is only 0.7&#x2013;0.9% lower than ViT-Base but with 40.1% fewer parameters; (2) Compared to DANN, TAHA improves mAP by 4.5&#x2013;4.9% (statistically significant, p&lt;0.05) due to head-level alignment rather than global feature adaptation; (3) On the distant-species transfer (Arabidopsis&#x2192;Pine), TAHA&#x2019;s Top-1 accuracy drops by 7.2% compared to Arabidopsis&#x2192;Maize, confirming the framework&#x2019;s generalization limit when phenotypic overlap is minimal. The bold values represent the experimental values obtained using the method described in this article.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s5_3">
<label>5.3</label>
<title>Head transferability analysis</title>
<p><xref ref-type="fig" rid="f2"><bold>Figure&#xa0;2</bold></xref> shows the evolution of HTS values during the alignment process. Three different patterns are observed: (1) Heads with uniformly high HTS (for instance, H1, H7) capture features common across species, such as leaf venation, (2) Heads starting with low scores but showing improvement (e.g., H4) adjust to traits specific to the target, and (3) Heads with consistently low scores (e.g., H11) encode patterns unique to the source and are later removed.</p>
<fig id="f2" position="float">
<label>Figure&#xa0;2</label>
<caption>
<p>Evolution of head transferability scores during cross-species alignment (Arabidopsis&#x2192;Maize). High HTS heads correspond to species-invariant traits, low&#x2192;high HTS heads to target-adaptive traits, and low HTS heads to source-specific traits. Y-axis: HTS value (0&#x2013;0.8); X-axis: alignment steps (0&#x2013;100).</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1747598-g002.tif">
<alt-text content-type="machine-generated">Line graph depicting Head Transferability Score (HTS) on the y-axis versus alignment steps on the x-axis. Blue line represents high HTS heads, showing a steady score around 0.8. Orange line depicts low to high HTS heads, gradually rising to 0.6. Green line for low HTS heads remains around 0.3.</alt-text>
</graphic></fig>
<p>The relationship between alignment performance and phenotypic contribution is further illustrated in <xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref>. Retained heads are concentrated in the low-DAL/high-correlation quadrant, which supports the effectiveness of our joint scoring approach in <xref ref-type="disp-formula" rid="eq6">Equation 6</xref>.</p>
<fig id="f3" position="float">
<label>Figure&#xa0;3</label>
<caption>
<p>Correlation between alignment performance and phenotypic prediction contribution.</p>
</caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fpls-17-1747598-g003.tif">
<alt-text content-type="machine-generated">Scatter plot depicting the relationship between Domain Alignment Loss (x-axis) and Phenotypic Prediction Contribution (y-axis). Red dots represent pruned heads, and blue dots represent retained heads, with distinct clusters reflecting the data grouping.</alt-text>
</graphic></fig>
<p><xref ref-type="fig" rid="f3"><bold>Figure&#xa0;3</bold></xref> illustrates the correlation between DAL and phenotypic contribution. Retained heads (green dots) cluster in the low-DAL/high-correlation quadrant (DAL&lt;0.8, Corr&gt;0.4), indicating strong alignment and high phenotypic relevance. Pruned heads (red dots) are either high-DAL (poor alignment) or low-correlation (low phenotypic value).</p>
</sec>
<sec id="s5_4">
<label>5.4</label>
<title>Computational efficiency</title>
<p>Our pruning approach achieves a 40.9% average reduction in FLOPs while preserving 98.1% of the original model&#x2019;s accuracy. On real edge devices, the Raspberry Pi 4 achieves an inference latency of 14.1&#xa0;&#xb1;&#xa0;0.6 ms with a throughput of 71 samples/s, and the NVIDIA Jetson Nano achieves 8.3&#xa0;&#xb1;&#xa0;0.4 ms with 120 samples/s, meeting the real-time requirement for field phenotyping (&#x2264;20 ms latency). <xref ref-type="table" rid="T3"><bold>Table&#xa0;3</bold></xref> breaks down the computational savings across model components.</p>
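<p>The latency and throughput figures can be measured as in the minimal sketch below; the warm-up and repeat counts are arbitrary choices for illustration and are not the protocol used in this article.</p>
<code language="python">
import time
import torch

def benchmark(model, input_size=(1, 3, 224, 224), warmup=10, repeats=100):
    """Return (mean latency in ms, throughput in samples/s) for CPU inference."""
    model.eval()
    x = torch.randn(input_size)
    with torch.no_grad():
        for _ in range(warmup):  # warm-up passes, excluded from timing
            model(x)
        start = time.perf_counter()
        for _ in range(repeats):
            model(x)
        elapsed = time.perf_counter() - start
    latency_ms = 1000.0 * elapsed / repeats
    throughput = repeats * input_size[0] / elapsed
    return latency_ms, throughput

# Note: on CUDA devices, torch.cuda.synchronize() should bracket the timed loop.
</code>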
<table-wrap id="T3" position="float">
<label>Table&#xa0;3</label>
<caption>
<p>Computational cost breakdown (Mean &#xb1; Std Dev).</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Component</th>
<th valign="middle" align="left">Full model</th>
<th valign="middle" align="left">Pruned model</th>
<th valign="middle" align="left">Reduction</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">Attention Heads</td>
<td valign="middle" align="left">12</td>
<td valign="middle" align="left">7.3 (avg)</td>
<td valign="middle" align="left">39.2%</td>
</tr>
<tr>
<td valign="middle" align="left">FLOPs</td>
<td valign="middle" align="left">17.6G&#xa0;&#xb1;&#xa0;0.5 G</td>
<td valign="middle" align="left">10.4G&#xa0;&#xb1;&#xa0;0.3 G</td>
<td valign="middle" align="left">40.9%</td>
</tr>
<tr>
<td valign="middle" align="left">Parameters</td>
<td valign="middle" align="left">86.7M&#xa0;&#xb1;&#xa0;2.1 M</td>
<td valign="middle" align="left">51.9M&#xa0;&#xb1;&#xa0;1.3 M</td>
<td valign="middle" align="left">40.1%</td>
</tr>
<tr>
<td valign="middle" align="left">Inference Time (GPU)</td>
<td valign="middle" align="left">23.4ms&#xa0;&#xb1;&#xa0;0.3 ms</td>
<td valign="middle" align="left">14.1ms&#xa0;&#xb1;&#xa0;0.2 ms</td>
<td valign="middle" align="left">45.1%</td>
</tr>
<tr>
<td valign="middle" align="left">Inference Time (Raspberry Pi 4)</td>
<td valign="middle" align="left">23.4&#xa0;&#xb1;&#xa0;1.2 ms</td>
<td valign="middle" align="left">14.1&#xa0;&#xb1;&#xa0;0.6 ms</td>
<td valign="middle" align="left">39.7%</td>
</tr>
<tr>
<td valign="middle" align="left">Power Consumption (Jetson Nano)</td>
<td valign="middle" align="left">3.2&#xa0;&#xb1;&#xa0;0.2 W</td>
<td valign="middle" align="left">1.8&#xa0;&#xb1;&#xa0;0.1 W</td>
<td valign="middle" align="left">43.8%</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s5_5">
<label>5.5</label>
<title>Ablation study</title>
<p>We conducted ablation studies to validate key design choices:</p>
<p><bold>Scoring Components</bold>: Excluding either the DAL term or the correlation term from HTS reduces accuracy by 4.7% and 3.2%, respectively, underscoring the necessity of both metrics.</p>
<p><bold>Pruning Threshold</bold>: Varying &#x3b2; in <xref ref-type="disp-formula" rid="eq7">Equation 7</xref> shows peak performance at &#x3b2;=1.5; aggressive pruning at &#x3b2;=2.0 reduces accuracy, while conservative pruning at &#x3b2;=1.0 yields only modest efficiency gains.</p>
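<p>A minimal sketch of one plausible thresholding rule is shown below, assuming &#x3b2; scales a statistic of the HTS distribution so that larger &#x3b2; prunes more heads; the actual rule is the one defined by <xref ref-type="disp-formula" rid="eq7">Equation 7</xref>.</p>
<code language="python">
import numpy as np

def prune_mask(hts, beta=1.5):
    """Boolean keep-mask over attention heads given their HTS values
    (illustrative rule only; the actual rule is Equation 7)."""
    hts = np.asarray(hts, dtype=float)
    threshold = hts.mean() - hts.std() / beta  # larger beta raises the threshold
    return hts >= threshold  # True = keep, False = prune
</code>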
<p><bold>Iterative Alignment</bold>: The incremental schedule (progressively raising &#x3bb;) achieves 2.9% higher mAP than fixed-weight training, indicating the benefit of applying the phenotypic constraint gradually.</p>
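<p>The incremental schedule can be sketched as below; the warmup length and final weight are illustrative assumptions, since only the initial value &#x3bb;=0.1 with linear warmup is specified in this article.</p>
<code language="python">
def lambda_at(step, warmup_steps=1000, lam_init=0.1, lam_final=1.0):
    """Linearly raise the phenotypic-constraint weight from lam_init to lam_final."""
    frac = min(step / warmup_steps, 1.0)
    return lam_init + (lam_final - lam_init) * frac

# Inside the training loop (task, DAL, and PCC losses assumed already computed):
# loss = task_loss + dal_loss + lambda_at(step) * pcc_loss
</code>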
</sec>
<sec id="s5_6">
<label>5.6</label>
<title>Hyperparameter transferability and sensitivity</title>
<p><xref ref-type="table" rid="T4"><bold>Table&#xa0;4</bold></xref> summarizes hyperparameter sensitivity and transferability. &#x3b1;=0.6, &#x3b2;=1.5, and &#x3bb;=0.1 (warmup) generalize across most species pairs (Arabidopsis&#x2192;Maize, Tomato&#x2192;Wheat) with &#x2264;1.2% mAP variation. For distant transfers (Arabidopsis&#x2192;Pine), adjusting &#x3b1;=0.7 (prioritize alignment) improves mAP by 2.1% (64.2&#x2192;66.3), indicating dataset-specific tuning is only needed for phylogenetically distant species.</p>
<table-wrap id="T4" position="float">
<label>Table&#xa0;4</label>
<caption>
<p>Hyperparameter sensitivity and transferability.</p>
</caption>
<table frame="hsides">
<thead>
<tr>
<th valign="middle" align="left">Hyperparameter</th>
<th valign="middle" align="left">Baseline value</th>
<th valign="middle" align="left">Variation range</th>
<th valign="middle" align="left">mAP change (Arabidopsis&#x2192;maize)</th>
<th valign="middle" align="left">Transferability (tomato&#x2192;wheat)</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="middle" align="left">&#x3b1;</td>
<td valign="middle" align="left">0.6</td>
<td valign="middle" align="left">0.4&#x2013;0.8</td>
<td valign="middle" align="left">&#xa0;&#xb1;&#xa0;0.8%</td>
<td valign="middle" align="left">0.7% mAP drop</td>
</tr>
<tr>
<td valign="middle" align="left">&#x3b2;</td>
<td valign="middle" align="left">1.5</td>
<td valign="middle" align="left">1.0&#x2013;2.0</td>
<td valign="middle" align="left">&#xa0;&#xb1;&#xa0;2.8%</td>
<td valign="middle" align="left">1.2% mAP drop</td>
</tr>
<tr>
<td valign="middle" align="left">&#x3bb; (initial)</td>
<td valign="middle" align="left">0.1</td>
<td valign="middle" align="left">0.05&#x2013;0.2</td>
<td valign="middle" align="left">&#xa0;&#xb1;&#xa0;0.5%</td>
<td valign="middle" align="left">0.4% mAP drop</td>
</tr>
<tr>
<td valign="middle" align="left">&#x3b1; (distant transfer)</td>
<td valign="middle" align="left">0.7</td>
<td valign="middle" align="left">0.6&#x2013;0.8</td>
<td valign="middle" align="left">&#x2014;</td>
<td valign="middle" align="left">2.1% mAP improvement (Arabidopsis&#x2192;Pine)</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
</sec>
<sec id="s6" sec-type="discussion">
<label>6</label>
<title>Discussion and future work</title>
<sec id="s6_1">
<label>6.1</label>
<title>Limitations of the cross-species attention head pruning method</title>
<p>Although our approach shows robust results in transferring attention heads between plant species, several constraints merit examination. First, the existing framework presumes a common phenotypic trait space for both source and target species, an assumption that might not hold for phylogenetically distant plants (e.g., Arabidopsis&#x2192;Pine). Supplementary experiments confirm this: the 7.2% accuracy drop in distant transfers is due to limited trait overlap (e.g., leaf vs. needle morphology), where no single attention head can capture both structures. The morphological gap between such species could exceed the adaptation capacity of any attention head, regardless of pruning strategy. Second, the dynamic pruning threshold, though effective, introduces an extra hyperparameter (&#x3b2;) that requires careful tuning for different species pairs. Empirical findings indicate that &#x3b2;=1.5 performs well for most angiosperms, yet transfers between monocots and dicots may require distinct settings. Third, the present approach assesses head transferability at the species level, which may fail to account for within-species differences caused by environmental influences such as drought or nutrient stress. These limitations suggest opportunities for refinement in future work.</p>
</sec>
<sec id="s6_2">
<label>6.2</label>
<title>Potential application scenarios beyond plant phenotypic feature extraction</title>
<p>The principles underlying our method, transferable attention head identification and task-aware pruning, could apply to several related fields. In ecological monitoring, similar methods could adapt plant phenotyping models to different ecosystems while preserving key vegetation indices. In agricultural robotics, pruned attention mechanisms could support real-time crop analysis on embedded systems with constrained computational capabilities. Beyond plants, the framework might apply to cross-species animal behavior analysis, where certain attention patterns could correspond to universal movement or interaction traits. Nevertheless, such applications would require adjustments to the specifics of each field, such as the temporal dimension in behavioral studies or the spatial structure in forest monitoring. Whether the transferability metrics generalize across such varied contexts remains a question for further research.</p>
</sec>
<sec id="s6_3">
<label>6.3</label>
<title>Ethical considerations in plant imagery research</title>
<p>As with any agricultural technology, the deployment of lightweight phenotyping models raises important ethical questions. The efficiency gains from pruning attention heads could make sophisticated phenotyping technology more accessible to small-scale agricultural producers, provided the underlying data encompass varied cultivation practices. Existing plant image datasets predominantly concentrate on staple crops such as wheat and maize, which may bias models against less-researched yet nutritionally valuable species. Moreover, the energy savings from model pruning must be weighed against the environmental costs of data collection, particularly in field settings where imaging may require frequent equipment transportation. Future work should address these issues through inclusive dataset curation and lifecycle analysis of the full phenotyping pipeline, not just its computational components. Establishing guidelines for equitable technology transfer will be crucial as these methods move from research to real-world implementation.</p>
</sec>
</sec>
<sec id="s7" sec-type="conclusions">
<label>7</label>
<title>Conclusion</title>
<p>The proposed Transferable Attention Head Alignment (TAHA) framework presents a systematic approach to lightweight plant phenotypic feature extraction by addressing the critical challenge of cross-species transfer in Vision Transformers. Through dynamic head pruning guided by transferability scores, the method achieves a balanced trade-off between computational efficiency and phenotypic prediction accuracy. Combining the Domain Alignment Loss (DAL) with the Phenotypic Consistency Constraint (PCC) ensures that the pruned model retains only the most pertinent attention patterns and removes redundant or species-specific heads.</p>
<p>Experimental findings show that the framework achieves accuracy comparable to full-head models while cutting computational costs by as much as 40%, which makes it especially suitable for agricultural applications with limited resources. Testing on real edge devices (Raspberry Pi 4, Jetson Nano) validates practical deployment feasibility, with sub-15 ms inference latency. The iterative alignment-and-pruning procedure additionally supports gradual adaptation to novel species without full retraining, thereby increasing its applicability in practical settings. Supplementary experiments on phylogenetically distant species clarify the framework&#x2019;s generalization limits, providing guidance for future trait-specific adaptations.</p>
<p>The effectiveness of this approach underscores the critical role of selective attention head transfer in cross-species phenotyping, as certain learned features are more influential than others for generalization. Subsequent research may investigate multi-level pruning approaches or integrate additional biological constraints to further improve generalizability. The principles established here may also inspire similar optimizations in other domains requiring efficient cross-domain feature extraction.</p>
<p>This work integrates transfer learning and attention head optimization to advance lightweight plant phenotyping, delivering a scalable approach that narrows the gap between model complexity and practical deployment. The framework&#x2019;s capacity to adapt to diverse plant types without compromising processing speed establishes it as an effective tool for precision farming and environmental monitoring.</p>
</sec>
</body>
<back>
<sec id="s8" sec-type="data-availability">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material. Further inquiries can be directed to the corresponding author.</p></sec>
<sec id="s9" sec-type="author-contributions">
<title>Author contributions</title>
<p>YX: Writing &#x2013; original draft, Resources, Data curation, Formal Analysis, Conceptualization. XZ: Supervision, Formal Analysis, Writing &#x2013; review &amp; editing, Conceptualization, Validation. RW: Conceptualization, Methodology, Writing &#x2013; review &amp; editing, Funding acquisition, Project administration. WL: Validation, Investigation, Methodology, Writing &#x2013; review &amp; editing.</p></sec>
<sec id="s11" sec-type="COI-statement">
<title>Conflict of interest</title>
<p>The authors declared that this work was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p></sec>
<sec id="s12" sec-type="ai-statement">
<title>Generative AI statement</title>
<p>The author(s) declared that generative AI was used in the creation of this manuscript. Generative AI tools assisted with data analysis, content summarization, and sentence refinement in this article.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p></sec>
<sec id="s13" sec-type="disclaimer">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p></sec>
<ref-list>
<title>References</title>
<ref id="B1">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Arya</surname> <given-names>S.</given-names></name>
<name><surname>Sandhu</surname> <given-names>K. S.</given-names></name>
<name><surname>Singh</surname> <given-names>J.</given-names></name>
<name><surname>Kumar</surname> <given-names>S.</given-names></name>
</person-group> (<year>2022</year>). 
<article-title>Deep learning: As the new frontier in high-throughput plant phenotyping</article-title>. <source>Euphytica</source> <volume>218</volume>, <fpage>47</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1007/s10681-022-02992-3</pub-id>
</mixed-citation>
</ref>
<ref id="B2">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Chakrabarty</surname> <given-names>A.</given-names></name>
<name><surname>Ahmed</surname> <given-names>S. T.</given-names></name>
<name><surname>Islam</surname> <given-names>M. F. U.</given-names></name>
<name><surname>Aziz</surname> <given-names>S. M.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>An interpretable fusion model integrating lightweight CNN and transformer architectures for rice leaf disease identification</article-title>. <source>Ecol. Inf.</source> <volume>82</volume>, <elocation-id>102718</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2024.102718</pub-id>
</mixed-citation>
</ref>
<ref id="B3">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Ganin</surname> <given-names>Y.</given-names></name>
<name><surname>Lempitsky</surname> <given-names>V.</given-names></name>
</person-group> (<year>2015</year>). &#x201c;
<article-title>Unsupervised domain adaptation by backpropagation</article-title>,&#x201d; in <source>International conference on machine learning</source>. <publisher-loc>Lille, France</publisher-loc>: 
<publisher-name>PMLR (Proceedings of Machine Learning Research)</publisher-name>.
</mixed-citation>
</ref>
<ref id="B4">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Ganin</surname> <given-names>Y.</given-names></name>
<name><surname>Ustinova</surname> <given-names>E.</given-names></name>
<name><surname>Ajakan</surname> <given-names>H.</given-names></name>
<name><surname>Germain</surname> <given-names>P.</given-names></name>
<name><surname>Larochelle</surname> <given-names>H.</given-names></name>
<name><surname>Laviolette</surname> <given-names>F.</given-names></name>
<etal/>
</person-group>. (<year>2016</year>). 
<article-title>Domain-adversarial training of neural networks</article-title>. <source>J. Mach. Learn. Res.</source> <volume>17</volume>, <fpage>1</fpage>&#x2013;<lpage>35</lpage>.
</mixed-citation>
</ref>
<ref id="B5">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gole</surname> <given-names>P.</given-names></name>
<name><surname>Bedi</surname> <given-names>P.</given-names></name>
<name><surname>Marwaha</surname> <given-names>S.</given-names></name>
<name><surname>Haque</surname> <given-names>M. A.</given-names></name>
<name><surname>Deb</surname> <given-names>C. K.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>TrIncNet: a lightweight vision transformer network for identification of plant diseases</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1221557</pub-id>, PMID: <pub-id pub-id-type="pmid">37575937</pub-id>
</mixed-citation>
</ref>
<ref id="B6">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Gretton</surname> <given-names>A.</given-names></name>
<name><surname>Borgwardt</surname> <given-names>K. M.</given-names></name>
<name><surname>Rasch</surname> <given-names>M. J.</given-names></name>
<name><surname>Sch&#xf6;lkopf</surname> <given-names>B.</given-names></name>
<name><surname>Smola</surname> <given-names>A.</given-names></name>
</person-group> (<year>2012</year>). 
<article-title>A kernel two-sample test</article-title>. <source>J. Mach. Learn. Res.</source> <volume>13</volume>, <fpage>723</fpage>&#x2013;<lpage>73</lpage>.
</mixed-citation>
</ref>
<ref id="B7">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Hirn</surname> <given-names>J.</given-names></name>
<name><surname>Sanz</surname> <given-names>V.</given-names></name>
<name><surname>Garc&#xed;a</surname> <given-names>J. E.</given-names></name>
<name><surname>Goberna</surname> <given-names>M.</given-names></name>
<name><surname>Montesinos-Navarro</surname> <given-names>A.</given-names></name>
<name><surname>Navarro-Cano</surname> <given-names>J. A.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Transfer learning of species co-occurrence patterns between plant communities</article-title>. <source>Ecol. Inf.</source> <volume>83</volume>, <elocation-id>102826</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ecoinf.2024.102826</pub-id>
</mixed-citation>
</ref>
<ref id="B8">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Jeon</surname> <given-names>Y. J.</given-names></name>
<name><surname>Hong</surname> <given-names>M. J.</given-names></name>
<name><surname>Ko</surname> <given-names>C. S.</given-names></name>
<name><surname>Park</surname> <given-names>S. J.</given-names></name>
<name><surname>Lee</surname> <given-names>H.</given-names></name>
<name><surname>Lee</surname> <given-names>W. G.</given-names></name>
<etal/>
</person-group>. (<year>2025</year>). 
<article-title>A hybrid CNN-Transformer model for identification of wheat varieties and growth stages using high-throughput phenotyping</article-title>. <source>Comput. Electron. Agric.</source> <volume>230</volume>, <elocation-id>109882</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.compag.2024.109882</pub-id>
</mixed-citation>
</ref>
<ref id="B9">
<mixed-citation publication-type="web">
<person-group person-group-type="author">
<name><surname>Levy</surname> <given-names>B.</given-names></name>
<name><surname>Xu</surname> <given-names>Z.</given-names></name>
<name><surname>Zhao</surname> <given-names>L.</given-names></name>
<name><surname>Kremling</surname> <given-names>K.</given-names></name>
<name><surname>Altman</surname> <given-names>R.</given-names></name>
<name><surname>Wong</surname> <given-names>P.</given-names></name>
<etal/>
</person-group>. (<year>2022</year>). <source>FloraBERT: cross-species transfer learning with attention-based neural networks for gene expression prediction</source>. doi:&#xa0;<pub-id pub-id-type="doi">10.21203/rs.3.rs-1927200/v1</pub-id>
</mixed-citation>
</ref>
<ref id="B10">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>Y.</given-names></name>
<name><surname>Tian</surname> <given-names>X.</given-names></name>
<name><surname>Gong</surname> <given-names>M.</given-names></name>
<name><surname>Liu</surname> <given-names>Y.</given-names></name>
<name><surname>Liu</surname> <given-names>T.</given-names></name>
<name><surname>Liu</surname> <given-names>T.</given-names></name>
<etal/>
</person-group>. (<year>2018</year>). &#x201c;
<article-title>Deep domain generalization via conditional invariant adversarial networks</article-title>,&#x201d; in <conf-name>Proceedings of the European Conference on Computer Vision (ECCV)</conf-name>. <publisher-loc>Munich, Germany</publisher-loc>: 
<publisher-name>Springer</publisher-name>.
</mixed-citation>
</ref>
<ref id="B11">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Li</surname> <given-names>G.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Zhao</surname> <given-names>Q.</given-names></name>
<name><surname>Yuan</surname> <given-names>P.</given-names></name>
<name><surname>Chang</surname> <given-names>B.</given-names></name>
</person-group> (<year>2023</year>). 
<article-title>PMVT: a lightweight vision transformer for plant disease identification on mobile devices</article-title>. <source>Front. Plant Sci.</source> <volume>14</volume>. doi:&#xa0;<pub-id pub-id-type="doi">10.3389/fpls.2023.1256773</pub-id>, PMID: <pub-id pub-id-type="pmid">37822342</pub-id>
</mixed-citation>
</ref>
<ref id="B12">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Liu</surname> <given-names>Y.</given-names></name>
<name><surname>Li</surname> <given-names>A.</given-names></name>
<name><surname>Zhao</surname> <given-names>X. M.</given-names></name>
<name><surname>Wang</surname> <given-names>M.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>DeepTL-Ubi: a novel deep transfer learning method for effectively predicting ubiquitination sites of multiple species</article-title>. <source>Methods</source> <volume>192</volume>, <fpage>103</fpage>&#x2013;<lpage>11</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.ymeth.2020.08.003</pub-id>, PMID: <pub-id pub-id-type="pmid">32791338</pub-id>
</mixed-citation>
</ref>
<ref id="B13">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Michel</surname> <given-names>P.</given-names></name>
<name><surname>Levy</surname> <given-names>O.</given-names></name>
<name><surname>Neubig</surname> <given-names>G.</given-names></name>
</person-group> (<year>2019</year>). &#x201c;
<article-title>Are sixteen heads really better than one</article-title>?,&#x201d; in <source>Advances in neural information processing systems</source>. <publisher-loc>Vancouver, BC, Canada</publisher-loc>: 
<publisher-name>Curran Associates, Inc.</publisher-name>
</mixed-citation>
</ref>
<ref id="B14">
<mixed-citation publication-type="confproc">
<person-group person-group-type="author">
<name><surname>Moupojou</surname> <given-names>E.</given-names></name>
<name><surname>Tagne</surname> <given-names>A.</given-names></name>
<name><surname>Retraint</surname> <given-names>F.</given-names></name>
<name><surname>Tadonkemwa</surname> <given-names>A.</given-names></name>
<name><surname>Wilfried</surname> <given-names>D.</given-names></name>
<name><surname>Tapamo</surname> <given-names>H.</given-names></name>
<etal/>
</person-group>. (<year>2023</year>). &#x201c;
<article-title>FieldPlant: A dataset of field plant images for plant disease detection and classification with deep learning</article-title>,&#x201d; in <conf-name>2023 IEEE International Conference On Big Data (Big Data)</conf-name>. <publisher-loc>Sorrento, Italy</publisher-loc>: 
<publisher-name>IEEE</publisher-name>.
</mixed-citation>
</ref>
<ref id="B15">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Seren</surname> <given-names>&#xdc;.</given-names></name>
<name><surname>Grimm</surname> <given-names>D.</given-names></name>
<name><surname>Fitz</surname> <given-names>J.</given-names></name>
<name><surname>Weigel</surname> <given-names>D.</given-names></name>
<name><surname>Borgwardt</surname> <given-names>K.</given-names></name>
<name><surname>Korte</surname> <given-names>A.</given-names></name>
<etal/>
</person-group>. (<year>2016</year>). 
<article-title>AraPheno: a public database for Arabidopsis thaliana phenotypes</article-title>. <source>Nucleic Acids Res</source>. <volume>45</volume>, <fpage>D1054</fpage>&#x2013;<lpage>59</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1093/nar/gkw986</pub-id>, PMID: <pub-id pub-id-type="pmid">27924043</pub-id>
</mixed-citation>
</ref>
<ref id="B16">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Vaswani</surname> <given-names>A.</given-names></name>
<name><surname>Shazeer</surname> <given-names>N.</given-names></name>
<name><surname>Parmar</surname> <given-names>N.</given-names></name>
<name><surname>Jones</surname> <given-names>L.</given-names></name>
<name><surname>Gomez</surname> <given-names>A. N.</given-names></name>
<name><surname>Kaiser</surname> <given-names>L.</given-names></name>
<etal/>
</person-group>. (<year>2017</year>). &#x201c;
<article-title>Attention is all you need</article-title>,&#x201d; in <source>Advances in neural information processing systems</source>. <publisher-loc>Long Beach, CA, USA</publisher-loc>: 
<publisher-name>Curran Associates, Inc.</publisher-name>
</mixed-citation>
</ref>
<ref id="B17">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Voita</surname> <given-names>E.</given-names></name>
<name><surname>Talbot</surname> <given-names>D.</given-names></name>
<name><surname>Moiseev</surname> <given-names>F.</given-names></name>
<name><surname>Sennrich</surname> <given-names>R.</given-names></name>
<name><surname>Titov</surname> <given-names>I.</given-names></name>
</person-group> (<year>2019</year>). <source>Analyzing multi-head self-attention: Specialized heads do the heavy lifting, the rest can be pruned</source>, arXiv preprint arXiv:1905.09418.
</mixed-citation>
</ref>
<ref id="B18">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Xie</surname> <given-names>Y.</given-names></name>
<name><surname>Zeng</surname> <given-names>X.</given-names></name>
<name><surname>Wang</surname> <given-names>R.</given-names></name>
</person-group> (<year>2025</year>). <source>Supplementary dataset for cross-species plant phenotyping</source> (<publisher-loc>Geneva, Switzerland</publisher-loc>: 
<publisher-name>Zenodo</publisher-name>). doi:&#xa0;<pub-id pub-id-type="doi">10.5281/zenodo.7843572</pub-id>
</mixed-citation>
</ref>
<ref id="B19">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Yan</surname> <given-names>K.</given-names></name>
<name><surname>Guo</surname> <given-names>X.</given-names></name>
<name><surname>Ji</surname> <given-names>Z.</given-names></name>
<name><surname>Zhou</surname> <given-names>X.</given-names></name>
</person-group> (<year>2021</year>). 
<article-title>Deep transfer learning for cross-species plant disease diagnosis adapting mixed subdomains</article-title>. <source>IEEE/ACM Trans. Comput. Biol. Bioinf</source>. <volume>20</volume>, <fpage>25550</fpage>&#x2013;<lpage>64</lpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1109/TCBB.2021.3133588</pub-id>, PMID: <pub-id pub-id-type="pmid">34914593</pub-id>
</mixed-citation>
</ref>
<ref id="B20">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Zhang</surname> <given-names>N.</given-names></name>
<name><surname>Zhang</surname> <given-names>E.</given-names></name>
<name><surname>Qi</surname> <given-names>G.</given-names></name>
<name><surname>Li</surname> <given-names>F.</given-names></name>
<name><surname>Lv</surname> <given-names>C.</given-names></name>
</person-group> (<year>2025</year>). &#x201c;
<article-title>Lightweight grape leaf disease recognition method based on transformer framework</article-title>,&#x201d; in <source>Scientific reports</source>. <publisher-loc>London, UK</publisher-loc>: 
<publisher-name>Nature Publishing Group (Scientific Reports)</publisher-name>., PMID: <pub-id pub-id-type="pmid">40775261</pub-id>
</mixed-citation>
</ref>
<ref id="B21">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>Y. Y.</given-names></name>
<name><surname>Kong</surname> <given-names>J. L.</given-names></name>
<name><surname>Jin</surname> <given-names>X. B.</given-names></name>
<name><surname>Wang</surname> <given-names>X. Y.</given-names></name>
<name><surname>Su</surname> <given-names>T. L.</given-names></name>
<name><surname>Zuo</surname> <given-names>M.</given-names></name>
</person-group> (<year>2019</year>). 
<article-title>CropDeep: The crop vision dataset for deep-learning-based classification and detection in precision agriculture</article-title>. <source>Sensors</source> <volume>19</volume>, <elocation-id>1058</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.3390/s19051058</pub-id>, PMID: <pub-id pub-id-type="pmid">30832283</pub-id>
</mixed-citation>
</ref>
<ref id="B22">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>W.</given-names></name>
<name><surname>Lu</surname> <given-names>S.</given-names></name>
<name><surname>Yang</surname> <given-names>Y.</given-names></name>
<name><surname>Yin</surname> <given-names>Z.</given-names></name>
<name><surname>Yin</surname> <given-names>L.</given-names></name>
</person-group> (<year>2024</year>). 
<article-title>Lightweight transformer image feature extraction network</article-title>. <source>PeerJ Comput. Sci.</source> <volume>10</volume>, <elocation-id>e1755</elocation-id>. doi:&#xa0;<pub-id pub-id-type="doi">10.7717/peerj-cs.1755</pub-id>, PMID: <pub-id pub-id-type="pmid">39669455</pub-id>
</mixed-citation>
</ref>
<ref id="B23">
<mixed-citation publication-type="book">
<person-group person-group-type="author">
<name><surname>Zheng</surname> <given-names>Z.</given-names></name>
<name><surname>Wang</surname> <given-names>Y.</given-names></name>
<name><surname>Huang</surname> <given-names>Y.</given-names></name>
<name><surname>Song</surname> <given-names>S.</given-names></name>
<name><surname>Yang</surname> <given-names>M.</given-names></name>
<name><surname>Yang</surname> <given-names>M.</given-names></name>
<etal/>
</person-group>. (<year>2024</year>). <source>Attention heads of large language models: A survey</source>, arXiv preprint arXiv:2409.03752.
</mixed-citation>
</ref>
<ref id="B24">
<mixed-citation publication-type="journal">
<person-group person-group-type="author">
<name><surname>Zhong</surname> <given-names>M.</given-names></name>
<name><surname>LeBien</surname> <given-names>J.</given-names></name>
<name><surname>Campos-Cerqueira</surname> <given-names>M.</given-names></name>
<name><surname>Dodhia</surname> <given-names>R.</given-names></name>
<name><surname>Velev</surname> <given-names>J. P.</given-names></name>
<name><surname>Aide</surname> <given-names>T. M.</given-names></name>
</person-group> (<year>2020</year>). 
<article-title>Multispecies bioacoustic classification using transfer learning of deep convolutional neural networks with pseudo-labeling</article-title>. <source>Appl. Acoust.</source> <volume>166</volume>, <fpage>107375</fpage>. doi:&#xa0;<pub-id pub-id-type="doi">10.1016/j.apacoust.2020.107375</pub-id>
</mixed-citation>
</ref>
</ref-list>
<fn-group>
<fn id="n1" fn-type="custom" custom-type="edited-by">
<p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1937850">Parvathaneni Naga Srinivasu</ext-link>, Amrita Vishwa Vidyapeetham University, India</p></fn>
<fn id="n2" fn-type="custom" custom-type="reviewed-by">
<p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3086797">Anto Lourdu Xavier Raj Arockia Selvarathinam</ext-link>, Grand Valley State University, United States</p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3261500">Pavani Cherukuru</ext-link>, Dayanand Sagar Academy of Technology and Management, India</p></fn>
</fn-group>
</back>
</article>