<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Publishing DTD v1.3 20210610//EN" "JATS-journalpublishing1-3-mathml3.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:ali="http://www.niso.org/schemas/ali/1.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" dtd-version="1.3" xml:lang="EN">
<front>
<journal-meta><journal-id journal-id-type="pmc">fcomp</journal-id><journal-id journal-id-type="publisher-id">Front. Comput. Sci.</journal-id><journal-title-group><journal-title>Frontiers in Computer Science</journal-title><abbrev-journal-title abbrev-type="pubmed">Front. Comput. Sci.</abbrev-journal-title></journal-title-group><issn pub-type="epub">2624-9898</issn><publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher></journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fcomp.2025.1626346</article-id><article-version article-version-type="Version of Record" vocab="NISO-RP-8-2008"/>
<article-categories>
<subj-group subj-group-type="heading"><subject>Original Research</subject></subj-group>
</article-categories>
<title-group>
<article-title>LLaVA-GM: lightweight LLaVA multimodal architecture</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Han</surname>
<given-names>Zhiyin</given-names>
</name><xref ref-type="aff" rid="aff1"/>
<uri xlink:href="https://loop.frontiersin.org/people/3063214"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Data curation" vocab-term-identifier="https://credit.niso.org/contributor-roles/data-curation/">Data curation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="resources" vocab-term-identifier="https://credit.niso.org/contributor-roles/resources/">Resources</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; original draft" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-original-draft/">Writing &#x2013; original draft</role>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Xiaoqun</given-names>
</name><xref ref-type="aff" rid="aff1"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="conceptualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/conceptualization/">Conceptualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Funding acquisition" vocab-term-identifier="https://credit.niso.org/contributor-roles/funding-acquisition/">Funding acquisition</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Project administration" vocab-term-identifier="https://credit.niso.org/contributor-roles/project-administration/">Project administration</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="validation" vocab-term-identifier="https://credit.niso.org/contributor-roles/validation/">Validation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Hao</surname>
<given-names>Juan</given-names>
</name><xref ref-type="aff" rid="aff1"/>
<xref ref-type="corresp" rid="c001"><sup>&#x002A;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/3120266"/>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Formal analysis" vocab-term-identifier="https://credit.niso.org/contributor-roles/formal-analysis/">Formal analysis</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="investigation" vocab-term-identifier="https://credit.niso.org/contributor-roles/investigation/">Investigation</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="methodology" vocab-term-identifier="https://credit.niso.org/contributor-roles/methodology/">Methodology</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="software" vocab-term-identifier="https://credit.niso.org/contributor-roles/software/">Software</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="supervision" vocab-term-identifier="https://credit.niso.org/contributor-roles/supervision/">Supervision</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="visualization" vocab-term-identifier="https://credit.niso.org/contributor-roles/visualization/">Visualization</role>
<role vocab="credit" vocab-identifier="https://credit.niso.org/" vocab-term="Writing &#x2013; review &amp; editing" vocab-term-identifier="https://credit.niso.org/contributor-roles/writing-review-editing/">Writing &#x2013; review &#x0026; editing</role>
</contrib>
</contrib-group>
<aff id="aff1"><institution>College of Information Engineering, Hebei University of Architecture</institution>, <city>Zhangjiakou</city>, <country>China</country></aff>
<author-notes><corresp id="c001"><label>&#x002A;</label>Correspondence: Juan Hao, <email xlink:href="mailto:hj2008@hebiace.edu.cn">hj2008@hebiace.edu.cn</email></corresp></author-notes>
<pub-date publication-format="electronic" date-type="pub" iso-8601-date="2025-09-01">
<day>01</day>
<month>09</month>
<year>2025</year>
</pub-date>
<pub-date publication-format="electronic" date-type="collection">
<year>2025</year>
</pub-date>
<volume>7</volume>
<elocation-id>1626346</elocation-id>
<history>
<date date-type="received">
<day>10</day>
<month>05</month>
<year>2025</year>
</date>
<date date-type="accepted">
<day>13</day>
<month>08</month>
<year>2025</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x00A9; 2025 Han, Liu and Hao.</copyright-statement>
<copyright-year>2025</copyright-year>
<copyright-holder>Han, Liu and Hao</copyright-holder>
<license><ali:license_ref start_date="2025-09-01">https://creativecommons.org/licenses/by/4.0/</ali:license_ref>
<license-p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</license-p>
</license>
</permissions>
<abstract>
<p>Multimodal large-scale language modeling has become the mainstream approach in natural language processing tasks and has been applied to various cross-modal fields such as image description and visual question answering. However, large-scale language modeling has high computational complexity and a large operational scale, which presents significant challenges for deployment in many resource-constrained scenarios. To address such problems, a lightweight multimodal framework based on LLaVA, LLaVA-GM, is proposed; it has greatly reduced model parameters and can be deployed on devices with limited resources. It is evaluated on common VQA tasks and achieves good performance. The main contributions are as follows. First, it is found that the Vicuna language-model backbone in LLaVA is highly redundant: when fine-tuning on downstream tasks, a very small dataset has little effect on such a large language model. It is therefore replaced with the smaller Gemma language model, thereby achieving fast task-specific adaptation with fewer parameters and less data. Second, in response to the problem of information redundancy, a mixture-of-experts (MoE) model is introduced and combined with Gemma to reduce the amount of computation while maintaining performance. Third, because directly training the entire model leads to a decline in performance, a multi-stage training strategy is adopted: first, the MLP layer is trained for visual adaptation; then the entire Gemma model is trained to improve multimodal capabilities; and finally only the MoE layers are trained for sparsification, ensuring a smooth transition from the dense model to the sparse model. The model was evaluated on multiple VQA datasets and achieved good performance, confirming the potential of this compact model in downstream multimodal applications.</p>
</abstract>
<kwd-group>
<kwd>lightweight</kwd>
<kwd>LLaVA</kwd>
<kwd>Gemma</kwd>
<kwd>sparse expert</kwd>
<kwd>deep learning</kwd>
</kwd-group><funding-group><funding-statement>The author(s) declare that financial support was received for the research and/or publication of this article. This research was funded by Hebei Provincial Science and Technology Program (grant number 20470302D).</funding-statement></funding-group>
<counts>
<fig-count count="7"/>
<table-count count="3"/>
<equation-count count="7"/>
<ref-count count="43"/>
<page-count count="10"/>
<word-count count="6334"/>
</counts>
<custom-meta-group>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Computer Vision</meta-value>
</custom-meta>
</custom-meta-group>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="sec1">
<label>1</label>
<title>Introduction</title>
<p>Large language models have shown excellent performance in various natural language processing tasks, such as text generation (<xref ref-type="bibr" rid="ref36">Stiennon et al., 2020</xref>), machine translation (<xref ref-type="bibr" rid="ref40">Zhao et al., 2023</xref>), and question-answering systems (<xref ref-type="bibr" rid="ref21">Kolomiyets and Moens, 2011</xref>). With technological advancements, their applications have expanded into cross-modal fields, including image captioning (<xref ref-type="bibr" rid="ref10">Dong et al., 2021</xref>) and visual question answering (VQA). To further enhance the performance of LLMs, researchers have adopted multiple strategies. On one hand, increasing the model&#x2019;s parameter scale and training data volume can enhance its expressive and generalization abilities. On the other hand, leveraging techniques such as image encoders (<xref ref-type="bibr" rid="ref3">Alsayed et al., 2023</xref>) and visual projection layers strengthens the visual perception of language models, enabling more effective processing of visual-language fusion tasks. However, large-scale models bring about issues such as high computational complexity and resource demands. Training and deploying these models require substantial computing resources, resulting in high hardware costs and the need for specialized parallel computing equipment and optimization techniques. This poses a significant challenge for applications requiring rapid iteration and updates. For specific downstream tasks like specialized Q&#x0026;A or image captioning, a smaller, specialized model may be more suitable. It can be optimized for particular tasks through training on relevant data, achieving efficient and precise processing. This approach not only reduces computational costs but also enhances the model&#x2019;s relevance and practicality, offering more effective solutions for specific domains. We chose the LLaVA large model as our research entry point. 
It consists of a pre-trained visual encoder and a large language model, connected by a simple linear layer that maps to the language embedding space. This modular design reduces architectural and training complexity, making the model easy to implement and extend. To optimize LLaVA for VQA tasks and achieve a lightweight model for easy deployment in downstream tasks, we made the following improvements:</p>
<list list-type="order">
<list-item>
<p>We found that Vicuna, the language model in LLaVA, has a large architecture that is not conducive to specialized task improvement. Through experiments, we replaced it with the smaller Gemma language model and trained it. We discovered that this smaller model responds more quickly to fine-tuning for specific tasks.</p>
</list-item>
<list-item>
<p>We gradually replaced FFNN layers with MoE layers in LLaVA and combined them with the Gemma language model to introduce sparsity, exploring the potential of small models in multimodal tasks, especially in resource-constrained scenarios. This approach maintains most of the model&#x2019;s performance while improving efficiency.</p>
</list-item>
<list-item>
<p>We used a multi-stage training strategy. First, we trained only the MLP layers to adapt to visual inputs. Then, we trained the entire Gemma model to build multimodal capabilities. Finally, we trained just the MoE layers to achieve sparsification, ensuring a smooth transition from a dense to a sparse model and leveraging Gemma&#x2019;s compactness to boost training efficiency. We&#x2019;ve developed a lightweight LLaVA multimodal architecture for VQA tasks.</p>
</list-item>
</list>
<p>As shown in <xref ref-type="fig" rid="fig1">Figure 1</xref>, the LLaVA-GM-2B model achieves a high object hallucination benchmark score with few active parameters; models closer to the top-left corner of the plot achieve better performance with fewer activated parameters.</p>
<fig position="float" id="fig1">
<label>Figure 1</label>
<caption>
<p>Comparison of LLaVA-GM with open-source LVLMs on the object hallucination benchmark and active parameter size.</p>
</caption>
<graphic xlink:href="fcomp-07-1626346-g001.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Scatter plot showing the relationship between hallucination average performance and the number of activated parameters for various models. Data points include LLaVA, MoE-LLaVA, Qwen-VL, Shikra, and BLIP, ranging from 1.6B to 13B parameters. Performance scores vary between 81 and 88.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec2">
<label>2</label>
<title>Related work</title>
<p>In recent years, large vision-language models (LVLMs) have made significant progress in visual-language tasks by integrating powerful language models with visual encoders. Models like OpenAI&#x2019;s GPT series (<xref ref-type="bibr" rid="ref33">Roumeliotis and Tselikas, 2023</xref>), DeepSeek (<xref ref-type="bibr" rid="ref14">Guo et al., 2024</xref>) from Hugging Face, and Google&#x2019;s Gemini (<xref ref-type="bibr" rid="ref31">Menger and Keiper, 2000</xref>) series have been widely used in image captioning (<xref ref-type="bibr" rid="ref4">Bernardi et al., 2016</xref>), visual question answering (VQA) (<xref ref-type="bibr" rid="ref22">Lan et al., 2023</xref>), and cross-modal reasoning (<xref ref-type="bibr" rid="ref13">Guan et al., 2023</xref>). CLIP (<xref ref-type="bibr" rid="ref32">Radford et al., 2021</xref>) laid the foundation for multimodal tasks through contrastive learning on large-scale image-text pairs, but focuses on global feature alignment. BLIP (<xref ref-type="bibr" rid="ref24">Li et al., 2022</xref>) improved this by jointly optimizing image and text encoders, enhancing performance in image captioning and VQA. BLIP-2 (<xref ref-type="bibr" rid="ref23">Li et al., 2023</xref>) further reduced computational costs by using a pre-trained image encoder [e.g., ViT (<xref ref-type="bibr" rid="ref20">Khan et al., 2022</xref>)] and a few visual mapping layers to inject visual features into large language models, while LLaVA (<xref ref-type="bibr" rid="ref29">Liu et al., 2024</xref>) achieved efficient collaboration in visual-language tasks using a pre-trained visual encoder (CLIP-ViT) (<xref ref-type="bibr" rid="ref38">Yang et al., 2024</xref>) and a large language model (e.g., LLaMA) (<xref ref-type="bibr" rid="ref37">Touvron et al., 2023</xref>), projecting image features into the language model&#x2019;s embedding space. 
However, LLaVA-OneVision (<xref ref-type="bibr" rid="ref25">Li et al., 2024</xref>), though demonstrating strong video understanding and cross-scene capabilities, comes with high costs. The performance improvements of these models often depend on expanding model size and dataset scale. For example, increasing the parameters of the visual encoder or using a larger language model [like GPT-4 (<xref ref-type="bibr" rid="ref1">Achiam et al., 2023</xref>)] can achieve higher accuracy in downstream tasks, but also increases computational costs. These dense models require full forward propagation for each token, leading to high inference costs and making deployment difficult in resource-constrained scenarios. Moreover, the computational complexity grows exponentially in per-pixel tasks. The introduction of MoE sparse matrix models offers a solution. MoE&#x2019;s (<xref ref-type="bibr" rid="ref17">Hwang et al., 2023</xref>) core idea is to divide the model into multiple expert subnetworks and dynamically select active experts via a routing mechanism, achieving computational sparsity. For instance, the Switch Transformer (<xref ref-type="bibr" rid="ref11">Fedus et al., 2022</xref>) activates only the top 1 expert for each input, successfully scaling the model to trillion-parameter levels while maintaining inference efficiency. GShard and GLaM further optimized the MoE architecture by introducing expert parallelism (<xref ref-type="bibr" rid="ref42">Zhou et al., 2022</xref>) and load balancing strategies, enabling sparse models to perform well in large-scale language tasks. LLaVA-Gemma (<xref ref-type="bibr" rid="ref15">Hinck et al., 2024</xref>) was the first to integrate Gemma into the LLaVA multimodal model, reducing the model size but not the total parameters. 
MoE-LLaVA (<xref ref-type="bibr" rid="ref27">Lin et al., 2024</xref>) fine-tuned the LLaVA model with MoE, and DeepSeekMoE (<xref ref-type="bibr" rid="ref9">Dai et al., 2024</xref>) also adopted this architecture, laying the foundation for lightweight model design. <xref ref-type="bibr" rid="ref19">Jin et al. (2024)</xref> presented various algorithms for hardware-efficient multimodal LLMs. At the same time, the keyword enhancement and self-supervised contrastive learning techniques of <xref ref-type="bibr" rid="ref7">Chen et al. (2024)</xref>, <xref ref-type="bibr" rid="ref6">Chen and Zhu (2025)</xref>, and <xref ref-type="bibr" rid="ref8">Chen et al. (2025)</xref> have also helped advance this line of research. Based on this, we propose the LLaVA-GM model architecture, which retains most of the model&#x2019;s performance while significantly reducing parameters.</p>
</sec>
<sec sec-type="methods" id="sec3">
<label>3</label>
<title>Methods</title>
<sec id="sec4">
<label>3.1</label>
<title>Language models</title>
<p>Our work aims to boost model performance in diverse tasks, especially VQA and multimodal benchmarks, through lightweight design and sparse computation. To lighten the model, we compared major multimodal architectures like FLAVA (<xref ref-type="bibr" rid="ref35">Singh et al., 2022</xref>), LAMM (<xref ref-type="bibr" rid="ref39">Yin et al., 2023</xref>), CLIP (<xref ref-type="bibr" rid="ref32">Radford et al., 2021</xref>), and BLIP (<xref ref-type="bibr" rid="ref24">Li et al., 2022</xref>). CLIP excels in image-text matching but is large. FLAVA offers multimodal pre-training advantages, yet it is computationally intensive. LAMM integrates image and text information effectively, but underperforms in specific tasks. Consequently, we chose LLaVA as the main framework due to its efficient and simple mapping mechanism that reduces model complexity while maintaining performance. After determining the main model framework, we focus on streamlining the language model to make the model even more lightweight while preserving its original architecture as much as possible. We conduct a comprehensive evaluation of several candidate models, including Llama (<xref ref-type="bibr" rid="ref37">Touvron et al., 2023</xref>) and the GPT (<xref ref-type="bibr" rid="ref1">Achiam et al., 2023</xref>) series, and finally select Google&#x2019;s Gemma (<xref ref-type="bibr" rid="ref21">Kolomiyets and Moens, 2011</xref>), which offers a smaller size and decent performance with flexible options of 2b and 7b scales. To assess Gemma&#x2019;s performance across tasks, we design experiments covering GQA (<xref ref-type="bibr" rid="ref2">Ainslie et al., 2023</xref>), VQAv2.0 (<xref ref-type="bibr" rid="ref12">Goyal et al., 2017</xref>), ScienceQA-IMG (<xref ref-type="bibr" rid="ref30">Lu et al., 2022</xref>), MMBench (<xref ref-type="bibr" rid="ref28">Liu et al., 2024</xref>), and MME (<xref ref-type="bibr" rid="ref26">Liang et al., 2024</xref>). 
By experimenting with and evaluating Gemma-2b and Gemma-7b, we analyze the impact of model size on performance. As shown in <xref ref-type="fig" rid="fig2">Figure 2</xref>, the new language model Gemma is integrated into LLaVA. Image inputs are processed by the CLIP ViT-L/336px visual encoder. Text inputs are tokenized, fed into Gemma, and mapped to a 2-layer MLP for processing.</p>
<fig position="float" id="fig2">
<label>Figure 2</label>
<caption>
<p>Overall sketch of the model.</p>
</caption>
<graphic xlink:href="fcomp-07-1626346-g002.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram showing the process of combining image and text inputs in a model. An image is processed by a vision encoder labeled "CLIP ViT-L/336px." This connects to a vision-language connector labeled "MLP&#x002A;2," which links to a language model called "Gemma." In parallel, text is tokenized and also fed into the language model.</alt-text>
</graphic>
</fig>
</sec>
<sec id="sec5">
<label>3.2</label>
<title>Sparse architecture</title>
<p>In multimodal tasks, we aim to maintain performance while reducing computational costs through sparse activation for efficient inference, avoiding the full parameter activation of dense models. However, directly replacing the feedforward networks in the Transformer with MoE layers can significantly degrade model performance. Therefore, based on experimental findings, we gradually replace some FFNNs with MoE layers to reduce activated parameters. We also use CLIP-ViT-L-14 to process image inputs for tasks like VQA. The model&#x2019;s processing flow starts with the input layer. Given an RGB image <inline-formula>
<mml:math id="M1">
<mml:mi>v</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>H</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>W</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, where H and W are the original resolutions (usually 336&#x202F;&#x00D7;&#x202F;336), the visual encoder processes the input image to obtain a sequence of visual tokens</p>
<disp-formula id="EQ1">
<mml:math id="M2">
<mml:mi>Z</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo stretchy="true">[</mml:mo>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22EF;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>z</mml:mi>
<mml:mi>P</mml:mi>
</mml:msub>
<mml:mo stretchy="true">]</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(1)</label></disp-formula>
<p>Subsequently, in <xref ref-type="disp-formula" rid="EQ1">Equation 1</xref>, the visual projection layer &#x201C;f&#x201D; maps <inline-formula>
<mml:math id="M3">
<mml:mi>Z</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>C</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula> to <inline-formula>
<mml:math id="M4">
<mml:mi>V</mml:mi>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
</inline-formula>, where P is the visual token sequence length, C is the encoder output dimension, and D is the LLM&#x2019;s hidden dimension. Meanwhile, text inputs (e.g., &#x201C;What&#x2019;s in the picture?&#x201D;) pass through the word embedding layer g to obtain projected sequence tokens</p>
<disp-formula id="EQ2">
<mml:math id="M5">
<mml:mi>T</mml:mi>
<mml:mo>=</mml:mo>
<mml:mo stretchy="true">[</mml:mo>
<mml:msub>
<mml:mi>t</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>t</mml:mi>
<mml:mn>2</mml:mn>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>&#x22EF;</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>t</mml:mi>
<mml:mi>N</mml:mi>
</mml:msub>
<mml:mo stretchy="true">]</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mi>N</mml:mi>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(2)</label></disp-formula>
<p>Here, in <xref ref-type="disp-formula" rid="EQ2">Equation 2</xref>, N denotes the sequence length of text tokens. Then, the visual tokens V and text tokens T are combined to form the input sequence:</p>
<disp-formula id="EQ3">
<mml:math id="M6">
<mml:mo stretchy="true">[</mml:mo>
<mml:mi>V</mml:mi>
<mml:mo>;</mml:mo>
<mml:mi>T</mml:mi>
<mml:mo stretchy="true">]</mml:mo>
<mml:mo>&#x2208;</mml:mo>
<mml:msup>
<mml:mi>&#x211D;</mml:mi>
<mml:mrow>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>+</mml:mo>
<mml:mi>N</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>&#x00D7;</mml:mo>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:msup>
</mml:math>
<label>(3)</label></disp-formula>
<p>In <xref ref-type="disp-formula" rid="EQ3">Equation 3</xref>, we only train the visual projection layer f, while keeping the LLM and embedding layer g in their pre-trained states. The LLM is composed of stacked multi-head self-attention (MHSA) and feedforward network (FFN) blocks, each with layer normalization (LN) and residual connections. The formula is:</p>
<disp-formula id="EQ4">
<mml:math id="M7">
<mml:mtext mathvariant="italic">Layer</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:mi mathvariant="italic">LN</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext mathvariant="italic">MHSA</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>&#x2192;</mml:mo>
<mml:mi mathvariant="italic">LN</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mtext>FFNN</mml:mtext>
<mml:mo stretchy="true">(</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
<mml:mspace width="0.33em"/>
<mml:mtext>or</mml:mtext>
<mml:mspace width="0.33em"/>
<mml:mi>MoE</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>+</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(4)</label></disp-formula>
<p>We replace some FFNNs with MoE layers, each containing a router and four experts in <xref ref-type="disp-formula" rid="EQ4">Equation 4</xref>. The router, implemented as a linear layer, calculates expert scores based on the input x. Where &#x2026; denotes omitted additional parameters, including visual features, language embeddings, attention masks, and model configuration parameters (such as the number of heads and hidden layer dimensions). Together, these parameters define the multimodal fusion process.</p>
<disp-formula id="EQ5">
<mml:math id="M8">
<mml:mi>f</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:msub>
<mml:mi>W</mml:mi>
<mml:mtext>router</mml:mtext>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:mi>x</mml:mi>
</mml:math>
<label>(5)</label></disp-formula>
<p>In <xref ref-type="disp-formula" rid="EQ5">Equation 5</xref>, the router calculates expert scores based on input x and uses softmax to obtain probabilities.</p>
<disp-formula id="EQ6">
<mml:math id="M9">
<mml:mi mathvariant="script">P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>=</mml:mo>
<mml:mfrac>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
<mml:mrow>
<mml:munder>
<mml:mo movablelimits="false">&#x2211;</mml:mo>
<mml:mi>j</mml:mi>
</mml:munder>
<mml:mspace width="0.25em"/>
<mml:msup>
<mml:mi>e</mml:mi>
<mml:mrow>
<mml:mi>f</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msup>
</mml:mrow>
</mml:mfrac>
</mml:math>
<label>(6)</label></disp-formula>
<p>In <xref ref-type="disp-formula" rid="EQ6">Equation 6</xref>, the router selects the top-k experts via softmax. Each expert is an FFNN, and the output is a weighted sum:</p>
<disp-formula id="EQ7">
<mml:math id="M10">
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="italic">MoE</mml:mi>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
<mml:mo>=</mml:mo>
<mml:munderover>
<mml:mo movablelimits="false">&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>=</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:munderover>
<mml:mspace width="0.25em"/>
<mml:mi mathvariant="script">P</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:mrow>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x00B7;</mml:mo>
<mml:msub>
<mml:mi>e</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo stretchy="true">(</mml:mo>
<mml:mi>x</mml:mi>
<mml:mo stretchy="true">)</mml:mo>
</mml:math>
<label>(7)</label></disp-formula>
<p>In <xref ref-type="disp-formula" rid="EQ7">Equation 7</xref>, a linear transformation maps the output back to the vocabulary. Only the top-k experts are activated during inference to reduce parameters. The alternate replacement of FFNN and MoE balances generality and task specificity. As a submodule of LLN, the feedforward neural network (FNN) contains two layers of linear transformation and ReLU activation to enhance the expressiveness of unimodal features. The MoE module is integrated after LLN and dynamically selects the expert network to process the output of FNN through the gating network. Dynamic routing enables a flexible and efficient model, combining high performance with low cost. As shown in <xref ref-type="fig" rid="fig3">Figure 3</xref>, both image and text tokens are processed by MHSA and feedforward networks.</p>
<fig position="float" id="fig3">
<label>Figure 3</label>
<caption>
<p>MoE mixing treatment stage flow.</p>
</caption>
<graphic xlink:href="fcomp-07-1626346-g003.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram depicting a model architecture. At the top, labeled "Text," is a green bar. Below, layers labeled "Expert" lead to "Router" sections with blue and pink blocks. Orange layers below are labeled "MHSA," leading from an "Image" and ending with "FFNN&#x0026;MoE" pointing to a "Text" section.</alt-text>
</graphic>
</fig>
<p>However, half of the feedforward networks are replaced with MoE. After normalization, the gating network selects appropriate routes for processing, and finally, the top-k selection determines the corresponding expert outputs.</p>
</sec>
<sec id="sec6">
<label>3.3</label>
<title>Training strategies</title>
<p>As mentioned in the previous section, we have designed a three-stage training process for our model to achieve lightweight and high-accuracy goals, because direct MoE layer replacement or direct model training fails to meet these requirements.</p>
<p><italic>Stage 1</italic>: Focus on adapting visual inputs. Only the MLP visual mapping layer is trained, with Gemma and the word embedding layer frozen. The aim is to align image tokens V with the LLM&#x2019;s input space, treating them as pseudo-text tokens. Training data consists of image descriptions, and f is optimized to generate a compatible V.</p>
<p><italic>Stage 2</italic>: Train the entire Gemma model to build multimodal capabilities. Unfreeze the MLP layer f and the word embedding layer g and use multimodal instruction data for training. Outputs are generated through a linear layer, transforming Gemma from a language model to a multimodal LVLM and establishing visual-language fusion.</p>
<p><italic>Stage 3</italic>: Achieve sparsification by only training MoE layers. Replace half of the FFNN layers with MoE layers, freeze the remaining FFNN layers and the MHSA, and only train the routers and experts of the MoE layers, with top-2 expert outputs selected. The other experts remain inactive, and this sparse path design improves efficiency.</p>
<p>As shown in <xref ref-type="fig" rid="fig4">Figure 4</xref>, the model contains four stages: visual encoding, language encoding, language-visual fusion layer (LLN), and mixture of experts (MoE) module. Visual encoding uses pre-trained CLIP-ViT-L-336px, which has a high number of parameters but does not require retraining. The input image is 336&#x202F;&#x00D7;&#x202F;336&#x202F;px, and the language encoding is based on Gemma-2B/7B. The amount of calculation increases with the size of the model. The fusion module (LLN) fuses multi-modal features through multi-head attention, and the FNN submodule enhances feature expression. The MoE module reduces the amount of calculation through sparse activation and optimizes the gating network to ensure expert utilization and load balancing. We conduct training in three stages. Image information is split and fed into the visual encoder in blocks. We simplify the text encoder to directly input tokens into the MLP, highlighting the alignment of image tokens and clarifying the structure. In the second stage, we unfreeze the MLP and train Gemma to build its multimodal capabilities. Finally, we use sparse expert selection to generate outputs.</p>
<fig position="float" id="fig4">
<label>Figure 4</label>
<caption>
<p>Three stages of training.</p>
</caption>
<graphic xlink:href="fcomp-07-1626346-g004.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Diagram illustrating a three-stage process for vision-language transformation. Stage 1 freezes an image into smaller patches, processed by a Vision Encoder. Stage 2 involves a Vision-language connector feeding into decoder layers. Stage 3 uses MoE and FFNN blocks to output through Experts, leading to the final result.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec id="sec7">
<label>4</label>
<title>Experiment</title>
<sec id="sec8">
<label>4.1</label>
<title>Data set and evaluation index</title>
<p>VQAv2.0 (<xref ref-type="bibr" rid="ref12">Goyal et al., 2017</xref>) is a classic visual question answering dataset based on COCO and abstract scene images, featuring 265,016 images and over 1.1 million open-ended questions. Each question comes with 10 real answers and 3 plausible but potentially wrong options, testing the model&#x2019;s understanding of image content, language, and common sense.</p>
<p>ScienceQA-IMG (<xref ref-type="bibr" rid="ref30">Lu et al., 2022</xref>) focuses on multimodal questions in scientific fields, containing 21,208 multiple-choice questions across 26 subjects, including natural, language, and social sciences. Each question is accompanied by detailed explanations, making it the first large-scale annotated dataset with lectures and explanations. It emphasizes scientific knowledge and reasoning.</p>
<p>GQA (<xref ref-type="bibr" rid="ref2">Ainslie et al., 2023</xref>), based on Visual Genome, offers over 1 million questions and scene graph annotations with about 260,000 images. Questions are linked to objects, attributes, and relationships in images, with controlled and balanced question generation to reduce language ambiguity.</p>
<p>MMBench (<xref ref-type="bibr" rid="ref28">Liu et al., 2024</xref>) is a newer benchmark with around 3,000 multiple-choice questions, covering 20 fine-grained ability dimensions (e.g., identity and attribute reasoning). Derived from an extension of ScienceQA, it innovatively assesses model predictions against options using ChatGPT for a more robust evaluation.</p>
<p>MME (<xref ref-type="bibr" rid="ref26">Liang et al., 2024</xref>) aims to comprehensively evaluate multimodal models with various task types, such as perception and reasoning. Despite its small size, it features diverse tasks like &#x201C;Does the object in the picture exist?&#x201D; and &#x201C;Answer questions based on charts,&#x201D; emphasizing objectivity and reproducibility.</p>
<p>Together, these datasets provide a comprehensive test of model performance, from basic understanding to complex reasoning, supporting the development of efficient and high-performing multimodal models.</p>
</sec>
<sec id="sec9">
<label>4.2</label>
<title>Experimental settings and analysis</title>
<p>Training uses 3&#x202F;V100 vGPUs with 32&#x202F;GB each (96&#x202F;GB total memory), running on Ubuntu 22.04 with PyTorch 2.1.0, Python 3.10, and CUDA 12.1. The training energy consumption of LLaVA-GM-2B is approximately 32 kWh, and that of LLaVA-GM-7B is approximately 69 kWh, which is a significant advantage over the 78 kWh of the original 7B model, supporting its deployment potential in mobile and embedded systems. During training, we used a learning rate of 2e-5, a batch size of 32, an AdamW (&#x03B2;1&#x202F;=&#x202F;0.9, &#x03B2;2&#x202F;=&#x202F;0.999, wd&#x202F;=&#x202F;0.01) optimizer, and a cross-entropy loss. We loaded the Gemma-2B/7B pre-trained weights from Hugging Face and the CLIP-ViT-L-336px visual encoder and iterated for 3 epochs. Deployment on NVIDIA Orin NX shows that the model benefits from its modular design and MoE sparse activation mechanism. Our method reduces memory access overhead and redundant computation. The model latency of LLaVA-GM 2B is further reduced to 0.5&#x2013;0.6&#x202F;s, with a power consumption of about 5.0&#x202F;W and a video memory occupancy of only 1.9&#x202F;GB. The 7B model occupies 4.2&#x202F;GB of video memory and consumes about 12&#x202F;W of power. By contrast, even after distillation, LLaVA-MoD (<xref ref-type="bibr" rid="ref34">Shu et al., 2024</xref>) still has too many visual tokens and high inference latency; it consumes 9.5&#x202F;W of power and occupies 4.8&#x202F;GB of video memory. Our models reduce resource usage to varying degrees. LLaVA-GM optimizes hardware efficiency while maintaining high accuracy, making it suitable for resource-constrained embedded platforms.</p>
</sec>
<sec id="sec10">
<label>4.3</label>
<title>Experimental results</title>
<p>We found that the original language model in LLaVA, Vicuna, is too bulky for deployment and fine-tuning in downstream tasks. In contrast, Gemma was trained with twice the data volume and incorporates knowledge distillation and architectural improvements, giving it an edge in handling complex tasks. Since our aim is to serve computation-constrained devices, we conducted the experiments shown in <xref ref-type="table" rid="tab1">Table 1</xref>. All reported results are the average of 3 independent experiments, and the standard deviation across them was taken into account.</p>
<table-wrap position="float" id="tab1">
<label>Table 1</label>
<caption>
<p>Generic model comparison.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Methods</th>
<th align="center" valign="top">LLM</th>
<th align="center" valign="top">Finetune size</th>
<th align="center" valign="top">VQA2.0</th>
<th align="center" valign="top">GQA</th>
<th align="center" valign="top">SQA-IMG</th>
<th align="center" valign="top">MMbench</th>
<th align="center" valign="top">MME</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">BLIP-2 (<xref ref-type="bibr" rid="ref23">Li et al., 2023</xref>)</td>
<td align="center" valign="middle">Vicuna-13B</td>
<td align="center" valign="middle">&#x2013;</td>
<td align="center" valign="top">65.0</td>
<td align="center" valign="top">41.0</td>
<td align="center" valign="top">61.0</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">1,293.8</td>
</tr>
<tr>
<td align="left" valign="middle">InstructBLIP (<xref ref-type="bibr" rid="ref16">Huang et al., 2023</xref>)</td>
<td align="center" valign="middle">Vicuna-7B</td>
<td align="center" valign="middle">1.2&#x202F;M</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">49.2</td>
<td align="center" valign="top">60.5</td>
<td align="center" valign="top">36.0</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">Shikra (<xref ref-type="bibr" rid="ref5">Chen et al., 2023</xref>)</td>
<td align="center" valign="middle">Vicuna-13B</td>
<td align="center" valign="middle">5.5&#x202F;M</td>
<td align="center" valign="top">77.4</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">58.8</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">Qwen-VL (<xref ref-type="bibr" rid="ref18">Jiao et al., 2024</xref>)</td>
<td align="center" valign="middle">LLaMA-7B</td>
<td align="center" valign="middle">1.4B</td>
<td align="center" valign="top"><bold>78.8</bold></td>
<td align="center" valign="top">59.2</td>
<td align="center" valign="top"><bold>67.1</bold></td>
<td align="center" valign="top">38.2</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA1.5 (<xref ref-type="bibr" rid="ref29">Liu et al., 2024</xref>)</td>
<td align="center" valign="middle">Vicuna-7B</td>
<td align="center" valign="middle"><bold>665&#x202F;K</bold></td>
<td align="center" valign="top">78.5</td>
<td align="center" valign="top"><bold>62.0</bold></td>
<td align="center" valign="top">66.8</td>
<td align="center" valign="top"><bold>64.3</bold></td>
<td align="center" valign="top"><bold>1,510.7</bold></td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA-GM</td>
<td align="center" valign="middle">Gemma-2B</td>
<td align="center" valign="middle"><bold>665&#x202F;K</bold></td>
<td align="center" valign="top">78.2</td>
<td align="center" valign="top">61.3</td>
<td align="center" valign="top">65.3</td>
<td align="center" valign="top">62.3</td>
<td align="center" valign="top">1,498.3</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold parts represent the optimal values under this indicator.</p>
</table-wrap-foot>
</table-wrap>
<p>We tested several LLaVA-based models using general datasets, with benchmark scores and GFLOPs as evaluation metrics. The visual framework was uniformly set as CLIP. As shown in <xref ref-type="table" rid="tab2">Table 2</xref>, larger LLMs generally yield better performance. Most datasets show that the LLaVA model with Vicuna-13B performs the best. However, our goal is to achieve good performance with fewer parameters. Our LLaVA-GM performs comparably to the 7B model on various datasets while having less than half the GFLOPs, demonstrating its efficiency advantage.</p>
<table-wrap position="float" id="tab2">
<label>Table 2</label>
<caption>
<p>Comparison with similar models.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Methods</th>
<th align="center" valign="top">LLM</th>
<th align="center" valign="top">VQA2.0</th>
<th align="center" valign="top">GQA</th>
<th align="center" valign="top">SQA-IMG</th>
<th align="center" valign="top">MMbench</th>
<th align="center" valign="top">MME</th>
<th align="center" valign="top">GFLOPs</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">LLaVA-1.5 (<xref ref-type="bibr" rid="ref29">Liu et al., 2024</xref>)</td>
<td align="center" valign="middle">Vicuna-7B</td>
<td align="center" valign="top">78.5</td>
<td align="center" valign="top">62.0</td>
<td align="center" valign="top">66.8</td>
<td align="center" valign="top">64.3</td>
<td align="center" valign="top">1510.7</td>
<td align="center" valign="top">8,027</td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA-1.5 (<xref ref-type="bibr" rid="ref29">Liu et al., 2024</xref>)</td>
<td align="center" valign="middle">Vicuna-13B</td>
<td align="center" valign="top"><bold>80.5</bold></td>
<td align="center" valign="top">63.3</td>
<td align="center" valign="top"><bold>71.6</bold></td>
<td align="center" valign="top"><bold>67.7</bold></td>
<td align="center" valign="top"><bold>1531.3</bold></td>
<td align="center" valign="top">14,927</td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA-1.5 (<xref ref-type="bibr" rid="ref29">Liu et al., 2024</xref>)</td>
<td align="center" valign="middle">Phi-2.7B</td>
<td align="center" valign="top">67.5</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">68.4</td>
<td align="center" valign="top">58.8</td>
<td align="center" valign="top">1135.7</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">MoE-LLaVA (<xref ref-type="bibr" rid="ref27">Lin et al., 2024</xref>)</td>
<td align="center" valign="middle">Vicuna-7B</td>
<td align="center" valign="top">78.6</td>
<td align="center" valign="top">59.2</td>
<td align="center" valign="top">67.1</td>
<td align="center" valign="top">65.3</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">8,027</td>
</tr>
<tr>
<td align="left" valign="middle">MoE-LLaVA (<xref ref-type="bibr" rid="ref27">Lin et al., 2024</xref>)</td>
<td align="center" valign="middle">Phi-2.7B</td>
<td align="center" valign="top">71.4</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">68.4</td>
<td align="center" valign="top">65.2</td>
<td align="center" valign="top">1423.5</td>
<td align="center" valign="top">3,754</td>
</tr>
<tr>
<td align="left" valign="middle">TinyLLaVA (<xref ref-type="bibr" rid="ref41">Zhou et al., 2024</xref>)</td>
<td align="center" valign="middle">Phi-2B</td>
<td align="center" valign="top">72.4</td>
<td align="center" valign="top">58.4</td>
<td align="center" valign="top">67.2</td>
<td align="center" valign="top">66.1</td>
<td align="center" valign="top">1434.3</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA-MoD (<xref ref-type="bibr" rid="ref34">Shu et al., 2024</xref>)</td>
<td align="center" valign="middle">Qwen1.5B</td>
<td align="center" valign="top">75.8</td>
<td align="center" valign="top">58.8</td>
<td align="center" valign="top">69.2</td>
<td align="center" valign="top">64.4</td>
<td align="center" valign="top">&#x2013;</td>
<td align="center" valign="top">&#x2013;</td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA-GM</td>
<td align="center" valign="middle">Gemma-2B</td>
<td align="center" valign="top">76.4</td>
<td align="center" valign="top">61.3</td>
<td align="center" valign="top">69.3</td>
<td align="center" valign="top">62.3</td>
<td align="center" valign="top">1498.3</td>
<td align="center" valign="top"><bold>3,623</bold></td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA-GM</td>
<td align="center" valign="middle">Gemma-7B</td>
<td align="center" valign="top">76.8</td>
<td align="center" valign="top"><bold>62.4</bold></td>
<td align="center" valign="top">70.1</td>
<td align="center" valign="top">63.1</td>
<td align="center" valign="top">1502.5</td>
<td align="center" valign="top">7,512</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<p>The bold parts represent the optimal values under this indicator.</p>
</table-wrap-foot>
</table-wrap>
<p>In <xref ref-type="table" rid="tab3">Table 3</xref>, we compare expert-based models ranging from 1.6B to 7B. All models have 4 experts, activate Top-2, and half of their layers are replaced with MoE layers. LLaVA-GM-2B has only 2.2B active parameters, outperforming MoE-LLaVA-1.6B with similar sparsity.</p>
<table-wrap position="float" id="tab3">
<label>Table 3</label>
<caption>
<p>Effect of MoE parameter settings and adjustments on modeling.</p>
</caption>
<table frame="hsides" rules="groups">
<thead>
<tr>
<th align="left" valign="top">Methods</th>
<th align="center" valign="top">Experts</th>
<th align="center" valign="top">Top-k</th>
<th align="center" valign="top">MoE layers</th>
<th align="center" valign="top">Layers</th>
<th align="center" valign="top">Total param</th>
<th align="center" valign="top">Activated param</th>
</tr>
</thead>
<tbody>
<tr>
<td align="left" valign="middle">MoE-LLaVA-1.6B (<xref ref-type="bibr" rid="ref27">Lin et al., 2024</xref>)</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">16</td>
<td align="center" valign="top">32</td>
<td align="center" valign="top">2.9B</td>
<td align="center" valign="top">2.0B</td>
</tr>
<tr>
<td align="left" valign="middle">MoE-LLaVA-1.8B (<xref ref-type="bibr" rid="ref27">Lin et al., 2024</xref>)</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">12</td>
<td align="center" valign="top">24</td>
<td align="center" valign="top">3.1B</td>
<td align="center" valign="top">2.2B</td>
</tr>
<tr>
<td align="left" valign="middle">MoE-LLaVA-2.7B (<xref ref-type="bibr" rid="ref27">Lin et al., 2024</xref>)</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">16</td>
<td align="center" valign="top">32</td>
<td align="center" valign="top">5.3B</td>
<td align="center" valign="top">3.6B</td>
</tr>
<tr>
<td align="left" valign="middle">MoE-LLaVA-7B (<xref ref-type="bibr" rid="ref27">Lin et al., 2024</xref>)</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">16</td>
<td align="center" valign="top">32</td>
<td align="center" valign="top">14.3B</td>
<td align="center" valign="top">8.5B</td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA-GM-2B</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">12</td>
<td align="center" valign="top">24</td>
<td align="center" valign="top">3.3B</td>
<td align="center" valign="top">2.2B</td>
</tr>
<tr>
<td align="left" valign="middle">LLaVA-GM-7B</td>
<td align="center" valign="middle">4</td>
<td align="center" valign="top">2</td>
<td align="center" valign="top">16</td>
<td align="center" valign="top">32</td>
<td align="center" valign="top">13.4B</td>
<td align="center" valign="top">7.9B</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Gemma-2B performs well on simple factual questions, showing competitiveness in specific scenarios. In the VQAv2 task, model size significantly impacts generalization and robustness. Gemma-7B offers more stable performance with diverse images and questions, while Gemma-2B may have larger errors with rare ones. Gemma-7B generally outperforms Gemma-2B, especially in complex reasoning and multimodal fusion tasks. For sentiment analysis, Gemma-7B captures emotional cues more finely and makes more accurate judgments. Considering model size, we recommend Gemma-7B for fine-grained tasks and Gemma-2B for daily tasks. LLaVA-Gemma (<xref ref-type="bibr" rid="ref15">Hinck et al., 2024</xref>) only replaces the language model without introducing sparsity, resulting in high computational cost. MoE-LLaVA (<xref ref-type="bibr" rid="ref27">Lin et al., 2024</xref>) suffers from low accuracy after sparsification. LLaVA-GM significantly improves the accuracy of the architecture and reduces the number of parameters required by the model through a more efficient MoE layer design (12&#x2013;16 MoE layers, 2.2B&#x2013;7.9B activated parameters) and multi-stage fine-tuning training. These improvements make LLaVA-GM more suitable for application on resource-constrained devices.</p>
<p>The results generated by the model are shown below. Although the model size has been compressed to make it suitable for resource-constrained devices, the figures below show that neither the quality of the generated text nor the quality of the VQA instruction answers is inferior.</p>
<p>As shown in <xref ref-type="fig" rid="fig5">Figure 5</xref>, LLaVA-GM can correctly identify the specific location, orientation, and color of objects in the image and introduce the scene, while omitting unimportant information and reducing redundancy.</p>
<fig position="float" id="fig5">
<label>Figure 5</label>
<caption>
<p>Example of the model&#x2019;s ability to perceive the outdoor environment (reproduced from MS COCO datasets, <ext-link xlink:href="https://cocodataset.org/" ext-link-type="uri">https://cocodataset.org/</ext-link>, licensed under <ext-link xlink:href="http://creativecommons.org/licenses/by/4.0/" ext-link-type="uri">CC BY 4.0</ext-link>).</p>
</caption>
<graphic xlink:href="fcomp-07-1626346-g005.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">A rider in equestrian attire, including a black jacket, white pants, black boots, and a helmet, rides a brown horse with white markings in an outdoor arena. Surrounded by green trees and a wooden fence, the dirt surface completes the scene.</alt-text>
</graphic>
</fig>
<p>As shown in <xref ref-type="fig" rid="fig6">Figure 6</xref>, we input a few simple math problems into the model and ask it to answer them. After quick thinking, the model answers the questions in sequential order without any redundant information.</p>
<fig position="float" id="fig6">
<label>Figure 6</label>
<caption>
<p>The model&#x2019;s answers to some simple math questions.</p>
</caption>
<graphic xlink:href="fcomp-07-1626346-g006.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">Math problem asking to solve four equations: Five plus three, eight minus two, four times six, and twelve divided by three. Solutions shown: eight, six, twenty-four, and four.</alt-text>
</graphic>
</fig>
<p>As shown in <xref ref-type="fig" rid="fig7">Figure 7</xref>, this is a virtual driver&#x2019;s license photo, which contains various images of the driver. In the original LLaVA paper, it generates answers in JSON format. The two paragraphs on the left of <xref ref-type="fig" rid="fig7">Figure 7</xref> are descriptions of a driver&#x2019;s license generated by LLaVA and LLaVA1.5, while the right side is a description of the driver&#x2019;s license information by LLaVA-GM. It can be seen that LLaVA and LLaVA1.5 have read errors in height, weight, birthday, and other information, but our model corrected the errors and gave the correct answer. The accuracy rate has increased by nearly 30%, which reflects the accuracy and sophistication of the model in image and text recognition.</p>
<fig position="float" id="fig7">
<label>Figure 7</label>
<caption>
<p>An example of a virtual driver&#x2019;s license recognition [reproduced from <xref ref-type="bibr" rid="ref04">Yang et al. (2023)</xref>, (2309.17421) The Dawn of LMMs: Preliminary Explorations with GPT-4V(ision), licensed under <ext-link xlink:href="http://creativecommons.org/licenses/by/4.0/" ext-link-type="uri">CC-BY 4.0</ext-link>].</p>
</caption>
<graphic xlink:href="fcomp-07-1626346-g007.tif" mimetype="image" mime-subtype="tiff">
<alt-text content-type="machine-generated">A California driver's license with visible personal information. Details include name, address, date of birth, expiration date, class, and physical descriptions like height and hair color. Contains a veteran indicator. Surrounding text provides similar structured data about the license.</alt-text>
</graphic>
</fig>
</sec>
</sec>
<sec sec-type="conclusions" id="sec11">
<label>5</label>
<title>Conclusion</title>
<sec id="sec12">
<label>5.1</label>
<title>Significance of work</title>
<p>Our work explores the trade-off between computational efficiency and multimodal understanding for small models and elucidates how model size affects performance on different tasks. This provides valuable references and benchmarks for future research on small visual language models, contributing to further development and innovation in the field. Our research helps guide the choice of model size for optimal performance and efficiency in different scenarios. Compared to the dense fusion of LLaVA-1.5, LLaVA-GM achieves explicit &#x201C;disentanglement&#x201D; on a small model for the first time through a modular four-stage pipeline (visual encoding &#x2192; language encoding &#x2192; LLN fusion &#x2192; MoE output). The model decomposes multimodal processing into clear stages (visual encoding, language encoding, language-visual fusion layer, and MoE module), and the functions and outputs of each stage can be analyzed independently, which enhances the transparency of model behavior. In the test of the dataset, an average of 2 to 3 experts are activated, reducing the amount of redundant calculation by about 70% while maintaining performance.</p>
</sec>
<sec id="sec13">
<label>5.2</label>
<title>Limitation</title>
<p>
<list list-type="alpha-lower">
<list-item>
<p>Insufficient visual representation: the first stage freezes the weights of CLIP-ViT-L and only trains the MLP adapter, resulting in the loss of fine-grained visual details. It lags behind dynamic segmentation schemes in tasks that require high resolution or document understanding, such as MME and OCRBench.</p>
</list-item>
<list-item>
<p>Generalization bottleneck: the performance is significantly degraded on diverse and cross-domain datasets (such as DocVQA and Video-VQA), exposing the over-reliance of multi-stage training on early visual features.</p>
</list-item>
<list-item>
<p>Latency on extremely low-resource devices: although MoE has reduced GFLOPs by 70%, routing overhead still introduces additional latency on low-end mobile chips, affecting the real-time interactive experience.</p>
</list-item>
</list>
</p>
</sec>
<sec id="sec14">
<label>5.3</label>
<title>Future work</title>
<p>We plan to make improvements in the following three aspects:</p>
<list list-type="alpha-lower">
<list-item>
<p>Optimize visual encoding - enhance detail perception through lightweight fine-tuning or introducing multi-granularity encoders such as DINOv2/SigLIP2;</p>
</list-item>
<list-item>
<p>Expand training data - use a unified protocol similar to LLaVA-MOE to continue fine-tuning on high-resolution, document, and video data to improve cross-domain robustness;</p>
</list-item>
<list-item>
<p>Upgrade gate efficiency - use low-rank routing + quantized expert weights to enable MoE to achieve zero latency penalty on extremely constrained devices.</p>
</list-item>
</list>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="sec15">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material, further inquiries can be directed to the corresponding author.</p>
</sec>
<sec sec-type="ethics-statement" id="sec16">
<title>Ethics statement</title>
<p>Written informed consent was obtained from the individual(s) for the publication of any potentially identifiable images or data included in this article.</p>
</sec>
<sec sec-type="author-contributions" id="sec17">
<title>Author contributions</title>
<p>ZH: Conceptualization, Data curation, Resources, Validation, Writing &#x2013; original draft. XL: Conceptualization, Funding acquisition, Project administration, Validation, Writing &#x2013; review &#x0026; editing. JH: Formal analysis, Investigation, Methodology, Software, Supervision, Visualization, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<ack><title>Acknowledgments</title>
<p>The authors thank researchers from Hebei University of Architecture for their research support.</p>
</ack>
<sec sec-type="COI-statement" id="sec19">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="ai-statement" id="sec20">
<title>Generative AI statement</title>
<p>The authors declare that no Gen AI was used in the creation of this manuscript.</p>
<p>Any alternative text (alt text) provided alongside figures in this article has been generated by Frontiers with the support of artificial intelligence and reasonable efforts have been made to ensure accuracy, including review by the authors wherever possible. If you identify any issues, please contact us.</p>
</sec>
<sec sec-type="disclaimer" id="sec21">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec><ref-list>
<title>References</title>
<ref id="ref1"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Achiam</surname><given-names>J.</given-names></name> <name><surname>Adler</surname><given-names>S.</given-names></name> <name><surname>Agarwal</surname><given-names>S.</given-names></name> <name><surname>Ahmad</surname><given-names>L.</given-names></name> <name><surname>Akkaya</surname><given-names>I.</given-names></name> <name><surname>Aleman</surname><given-names>F. L.</given-names></name></person-group> (<year>2023</year>). <source>Gpt-4 technical report</source>.</mixed-citation></ref>
<ref id="ref2"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Ainslie</surname><given-names>J.</given-names></name> <name><surname>Lee-Thorp</surname><given-names>J.</given-names></name> <name><surname>De Jong</surname><given-names>M.</given-names></name> <name><surname>Zemlyanskiy</surname><given-names>Y.</given-names></name> <name><surname>Lebr&#x00F3;n</surname><given-names>F.</given-names></name> <name><surname>Sanghai</surname><given-names>S.</given-names></name></person-group> (<year>2023</year>). <source>Gqa: Training generalized multi-query transformer models from multi-head checkpoints</source>.</mixed-citation></ref>
<ref id="ref3"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Alsayed</surname><given-names>A.</given-names></name> <name><surname>Arif</surname><given-names>M.</given-names></name> <name><surname>Qadah</surname><given-names>T. M.</given-names></name> <name><surname>Alotaibi</surname><given-names>S.</given-names></name></person-group> (<year>2023</year>). <article-title>A systematic literature review on using the encoder-decoder models for image captioning in English and Arabic languages</article-title>. <source>Appl. Sci.</source> <volume>13</volume>:<fpage>10894</fpage>. doi: <pub-id pub-id-type="doi">10.3390/app131910894</pub-id></mixed-citation></ref>
<ref id="ref4"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Bernardi</surname><given-names>R.</given-names></name> <name><surname>Cakici</surname><given-names>R.</given-names></name> <name><surname>Elliott</surname><given-names>D.</given-names></name> <name><surname>Erdem</surname><given-names>A.</given-names></name> <name><surname>Erdem</surname><given-names>E.</given-names></name> <name><surname>Ikizler-Cinbis</surname><given-names>N.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>Automatic description generation from images: a survey of models, datasets, and evaluation measures</article-title>. <source>J. Artif. Intell. Res.</source> <volume>55</volume>, <fpage>409</fpage>&#x2013;<lpage>442</lpage>. doi: <pub-id pub-id-type="doi">10.1613/jair.4900</pub-id></mixed-citation></ref>
<ref id="ref5"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>K.</given-names></name> <name><surname>Zhang</surname><given-names>Z.</given-names></name> <name><surname>Zeng</surname><given-names>W.</given-names></name> <name><surname>Zhang</surname><given-names>R.</given-names></name> <name><surname>Zhu</surname><given-names>F.</given-names></name> <name><surname>Zhao</surname><given-names>R.</given-names></name></person-group> (<year>2023</year>). <source>Shikra: Unleashing multimodal llm's referential dialogue magic</source>.</mixed-citation></ref>
<ref id="ref6"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>L.</given-names></name> <name><surname>Zhu</surname><given-names>G.</given-names></name></person-group> (<year>2025</year>). <article-title>Self-supervised contrastive learning for itinerary recommendation</article-title>. <source>Expert Syst. Appl.</source> <volume>268</volume>:<fpage>126246</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2024.126246</pub-id></mixed-citation></ref>
<ref id="ref7"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>L.</given-names></name> <name><surname>Zhu</surname><given-names>G.</given-names></name> <name><surname>Liang</surname><given-names>W.</given-names></name> <name><surname>Cao</surname><given-names>J.</given-names></name> <name><surname>Chen</surname><given-names>Y.</given-names></name></person-group> (<year>2024</year>). <article-title>Keywords-enhanced contrastive learning model for travel recommendation</article-title>. <source>Inf. Process. Manag.</source> <volume>61</volume>:<fpage>103874</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ipm.2024.103874</pub-id></mixed-citation></ref>
<ref id="ref8"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Chen</surname><given-names>L.</given-names></name> <name><surname>Zhu</surname><given-names>X.</given-names></name> <name><surname>Zhu</surname><given-names>G.</given-names></name></person-group> (<year>2025</year>). <article-title>Exploiting attributes and keywords for session-based recommendation with multi-view graph neural network</article-title>. <source>Expert Syst. Appl.</source> <volume>13</volume>:<fpage>128990</fpage>. doi: <pub-id pub-id-type="doi">10.1016/j.eswa.2025.128990</pub-id></mixed-citation></ref>
<ref id="ref9"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Dai</surname><given-names>D.</given-names></name> <name><surname>Deng</surname><given-names>C.</given-names></name> <name><surname>Zhao</surname><given-names>C.</given-names></name> <name><surname>Xu</surname><given-names>R. X.</given-names></name> <name><surname>Gao</surname><given-names>H.</given-names></name> <name><surname>Chen</surname><given-names>D.</given-names></name></person-group> (<year>2024</year>). <source>Deepseekmoe: Towards ultimate expert specialization in mixture-of-experts language models</source>.</mixed-citation></ref>
<ref id="ref10"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Dong</surname><given-names>X.</given-names></name> <name><surname>Long</surname><given-names>C.</given-names></name> <name><surname>Xu</surname><given-names>W.</given-names></name> <name><surname>Xiao</surname><given-names>C.</given-names></name></person-group> (<year>2021</year>). &#x201C;Dual graph convolutional networks with transformer and curriculum learning for image captioning,&#x201D; in <italic>Proceedings of the 29th ACM International Conference on Multimedia</italic>, 2615&#x2013;2624.</mixed-citation></ref>
<ref id="ref11"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Fedus</surname><given-names>W.</given-names></name> <name><surname>Zoph</surname><given-names>B.</given-names></name> <name><surname>Shazeer</surname><given-names>N.</given-names></name></person-group> (<year>2022</year>). <article-title>Switch transformers: scaling to trillion parameter models with simple and efficient sparsity</article-title>. <source>J. Mach. Learn. Res.</source> <volume>23</volume>, <fpage>1</fpage>&#x2013;<lpage>39</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2101.03961</pub-id></mixed-citation></ref>
<ref id="ref12"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Goyal</surname><given-names>Y.</given-names></name> <name><surname>Khot</surname><given-names>T.</given-names></name> <name><surname>Summers-Stay</surname><given-names>D.</given-names></name> <name><surname>Batra</surname><given-names>D.</given-names></name> <name><surname>Parikh</surname><given-names>D.</given-names></name></person-group> (<year>2017</year>). &#x201C;Making the v in vqa matter: Elevating the role of image understanding in visual question answering,&#x201D; in <italic>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</italic> (6904&#x2013;6913).</mixed-citation></ref>
<ref id="ref13"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Guan</surname><given-names>Q. L.</given-names></name> <name><surname>Zheng</surname><given-names>Y.</given-names></name> <name><surname>Meng</surname><given-names>L.</given-names></name> <name><surname>Dong</surname><given-names>L. Q.</given-names></name> <name><surname>Hao</surname><given-names>Q.</given-names></name></person-group> (<year>2023</year>). <article-title>Improving the generalization of visual classification models across IoT cameras via cross-modal inference and fusion</article-title>. <source>IEEE Internet Things J.</source> <volume>10</volume>, <fpage>15835</fpage>&#x2013;<lpage>15846</lpage>. doi: <pub-id pub-id-type="doi">10.1109/JIOT.2023.3265645</pub-id></mixed-citation></ref>
<ref id="ref14"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Guo</surname><given-names>D.</given-names></name> <name><surname>Zhu</surname><given-names>Q.</given-names></name> <name><surname>Yang</surname><given-names>D.</given-names></name> <name><surname>Xie</surname><given-names>Z.</given-names></name> <name><surname>Dong</surname><given-names>K.</given-names></name> <name><surname>Zhang</surname><given-names>W.</given-names></name></person-group> (<year>2024</year>). <source>DeepSeek-coder: When the large language model meets programming&#x2014;the rise of code intelligence</source>.</mixed-citation></ref>
<ref id="ref15"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Hinck</surname><given-names>M.</given-names></name> <name><surname>Olson</surname><given-names>M. L.</given-names></name> <name><surname>Cobbley</surname><given-names>D.</given-names></name> <name><surname>Tseng</surname><given-names>S. Y.</given-names></name> <name><surname>Lal</surname><given-names>V.</given-names></name></person-group> (<year>2024</year>). <source>Llava-gemma: accelerating multimodal foundation models with a compact language model</source>.</mixed-citation></ref>
<ref id="ref16"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Huang</surname><given-names>J.</given-names></name> <name><surname>Zhang</surname><given-names>J.</given-names></name> <name><surname>Jiang</surname><given-names>K.</given-names></name> <name><surname>Qiu</surname><given-names>H.</given-names></name> <name><surname>Lu</surname><given-names>S.</given-names></name></person-group> (<year>2023</year>). <source>Visual instruction tuning towards general-purpose multimodal model: A survey</source>.</mixed-citation></ref>
<ref id="ref17"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Hwang</surname><given-names>C.</given-names></name> <name><surname>Cui</surname><given-names>W.</given-names></name> <name><surname>Xiong</surname><given-names>Y.</given-names></name> <name><surname>Yang</surname><given-names>Z.</given-names></name> <name><surname>Liu</surname><given-names>Z.</given-names></name> <name><surname>Hu</surname><given-names>H.</given-names></name> <etal/></person-group>. (<year>2023</year>). <article-title>Tutel: adaptive mixture-of-experts at scale</article-title>. <source>Proc. Mach. Learning Syst.</source> <volume>5</volume>, <fpage>269</fpage>&#x2013;<lpage>287</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2206.03382</pub-id></mixed-citation></ref>
<ref id="ref18"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Jiao</surname><given-names>Q.</given-names></name> <name><surname>Chen</surname><given-names>D.</given-names></name> <name><surname>Huang</surname><given-names>Y.</given-names></name> <name><surname>Li</surname><given-names>Y.</given-names></name> <name><surname>Shen</surname><given-names>Y.</given-names></name></person-group> (<year>2024</year>). <source>Enhancing multimodal large language models with vision detection models: An empirical study</source>.</mixed-citation></ref>
<ref id="ref19"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Jin</surname><given-names>Y.</given-names></name> <name><surname>Li</surname><given-names>J.</given-names></name> <name><surname>Liu</surname><given-names>Y.</given-names></name> <name><surname>Gu</surname><given-names>T.</given-names></name> <name><surname>Wu</surname><given-names>K.</given-names></name> <name><surname>Jiang</surname><given-names>Z.</given-names></name></person-group> (<year>2024</year>). <source>Efficient multimodal large language models: A survey</source>.</mixed-citation></ref>
<ref id="ref20"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Khan</surname><given-names>S.</given-names></name> <name><surname>Naseer</surname><given-names>M.</given-names></name> <name><surname>Hayat</surname><given-names>M.</given-names></name> <name><surname>Zamir</surname><given-names>S. W.</given-names></name> <name><surname>Khan</surname><given-names>F. S.</given-names></name> <name><surname>Shah</surname><given-names>M.</given-names></name></person-group> (<year>2022</year>). <article-title>Transformers in vision: a survey</article-title>. <source>ACM Computing Surveys (CSUR)</source> <volume>54</volume>, <fpage>1</fpage>&#x2013;<lpage>41</lpage>. doi: <pub-id pub-id-type="doi">10.1145/3505244</pub-id></mixed-citation></ref>
<ref id="ref21"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Kolomiyets</surname><given-names>O.</given-names></name> <name><surname>Moens</surname><given-names>M. F.</given-names></name></person-group> (<year>2011</year>). <article-title>A survey on question answering technology from an information retrieval perspective</article-title>. <source>Inf. Sci.</source> <volume>181</volume>, <fpage>5412</fpage>&#x2013;<lpage>5434</lpage>. doi: <pub-id pub-id-type="doi">10.1016/j.ins.2011.07.047</pub-id></mixed-citation></ref>
<ref id="ref22"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lan</surname><given-names>Y.</given-names></name> <name><surname>Guo</surname><given-names>Y.</given-names></name> <name><surname>Chen</surname><given-names>Q.</given-names></name> <name><surname>Lin</surname><given-names>S.</given-names></name> <name><surname>Chen</surname><given-names>Y.</given-names></name> <name><surname>Deng</surname><given-names>X.</given-names></name></person-group> (<year>2023</year>). <article-title>Visual question answering model for fruit tree disease decision-making based on multimodal deep learning</article-title>. <source>Front. Plant Sci.</source> <volume>13</volume>:<fpage>1064399</fpage>. doi: <pub-id pub-id-type="doi">10.3389/fpls.2022.1064399</pub-id>, PMID: <pub-id pub-id-type="pmid">36684756</pub-id></mixed-citation></ref>
<ref id="ref23"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Li</surname><given-names>J.</given-names></name> <name><surname>Li</surname><given-names>D.</given-names></name> <name><surname>Savarese</surname><given-names>S.</given-names></name> <name><surname>Hoi</surname><given-names>S.</given-names></name></person-group> (<year>2023</year>). &#x201C;Blip-2: bootstrapping language-image pre-training with frozen image encoders and large language models,&#x201D; in <italic>International Conference on Machine Learning</italic> (19730&#x2013;19742). PMLR.</mixed-citation></ref>
<ref id="ref24"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>J.</given-names></name> <name><surname>Li</surname><given-names>D.</given-names></name> <name><surname>Xiong</surname><given-names>C.</given-names></name> <name><surname>Hoi</surname><given-names>S.</given-names></name></person-group> (<year>2022</year>). <article-title>Blip: bootstrapping language-image pre-training for unified vision-language understanding and generation</article-title>. <source>Int. Conf. on Mach. Learning</source> <volume>2</volume>, <fpage>12888</fpage>&#x2013;<lpage>12900</lpage>.</mixed-citation></ref>
<ref id="ref25"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Li</surname><given-names>B.</given-names></name> <name><surname>Zhang</surname><given-names>Y.</given-names></name> <name><surname>Guo</surname><given-names>D.</given-names></name> <name><surname>Zhang</surname><given-names>R.</given-names></name> <name><surname>Li</surname><given-names>F.</given-names></name> <name><surname>Zhang</surname><given-names>H.</given-names></name></person-group> (<year>2024</year>). <source>Llava-onevision: Easy visual task transfer</source>.</mixed-citation></ref>
<ref id="ref26"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Liang</surname><given-names>Z.</given-names></name> <name><surname>Xu</surname><given-names>Y.</given-names></name> <name><surname>Hong</surname><given-names>Y.</given-names></name> <name><surname>Shang</surname><given-names>P.</given-names></name> <name><surname>Wang</surname><given-names>Q.</given-names></name> <name><surname>Fu</surname><given-names>Q.</given-names></name></person-group> (<year>2024</year>). &#x201C;A survey of multimodel large language models,&#x201D; in <italic>Proceedings of the 3rd International Conference on Computer, Artificial Intelligence and Control Engineering</italic> (405&#x2013;409).</mixed-citation></ref>
<ref id="ref27"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Lin</surname><given-names>B.</given-names></name> <name><surname>Tang</surname><given-names>Z.</given-names></name> <name><surname>Ye</surname><given-names>Y.</given-names></name> <name><surname>Cui</surname><given-names>J.</given-names></name> <name><surname>Zhu</surname><given-names>B.</given-names></name> <name><surname>Jin</surname><given-names>P.</given-names></name></person-group> (<year>2024</year>). <source>Moe-llava: Mixture of experts for large vision-language models</source>.</mixed-citation></ref>
<ref id="ref28"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>Y.</given-names></name> <name><surname>Duan</surname><given-names>H.</given-names></name> <name><surname>Zhang</surname><given-names>Y.</given-names></name> <name><surname>Li</surname><given-names>B.</given-names></name> <name><surname>Zhang</surname><given-names>S.</given-names></name> <name><surname>Zhao</surname><given-names>W.</given-names></name></person-group> (<year>2024</year>). &#x201C;Mmbench: is your multi-modal model an all-around player?,&#x201D; in <italic>European Conference on Computer Vision</italic> (216&#x2013;233). Cham: Springer Nature Switzerland.</mixed-citation></ref>
<ref id="ref29"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>H.</given-names></name> <name><surname>Li</surname><given-names>C.</given-names></name> <name><surname>Li</surname><given-names>Y.</given-names></name> <name><surname>Lee</surname><given-names>Y. J.</given-names></name></person-group> (<year>2024</year>). &#x201C;Improved baselines with visual instruction tuning,&#x201D; in <italic>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</italic> (26296&#x2013;26306).</mixed-citation></ref>
<ref id="ref30"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Lu</surname><given-names>P.</given-names></name> <name><surname>Mishra</surname><given-names>S.</given-names></name> <name><surname>Xia</surname><given-names>T.</given-names></name> <name><surname>Qiu</surname><given-names>L.</given-names></name> <name><surname>Chang</surname><given-names>K. W.</given-names></name> <name><surname>Zhu</surname><given-names>S. C.</given-names></name></person-group> (<year>2022</year>). <article-title>Learn to explain: multimodal reasoning via thought chains for science question answering</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>35</volume>, <fpage>2507</fpage>&#x2013;<lpage>2521</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2209.09513</pub-id></mixed-citation></ref>
<ref id="ref31"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Menger</surname><given-names>F. M.</given-names></name> <name><surname>Keiper</surname><given-names>J. S.</given-names></name></person-group> (<year>2000</year>). <article-title>Gemini surfactants</article-title>. <source>Angew. Chem. Int. Ed.</source> <volume>39</volume>, <fpage>1906</fpage>&#x2013;<lpage>1920</lpage>. doi: <pub-id pub-id-type="doi">10.1002/1521-3773(20000602)39:11&#x003C;1906::AID-ANIE1906&#x003E;3.0.CO;2-Q</pub-id>, PMID: <pub-id pub-id-type="pmid">10940980</pub-id></mixed-citation></ref>
<ref id="ref32"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Radford</surname><given-names>A.</given-names></name> <name><surname>Kim</surname><given-names>J. W.</given-names></name> <name><surname>Hallacy</surname><given-names>C.</given-names></name> <name><surname>Ramesh</surname><given-names>A.</given-names></name> <name><surname>Goh</surname><given-names>G.</given-names></name> <name><surname>Agarwal</surname><given-names>S.</given-names></name></person-group> (<year>2021</year>). &#x201C;Learning transferable visual models from natural language supervision,&#x201D; in <italic>International Conference on Machine Learning</italic> (8748&#x2013;8763). PMLR.</mixed-citation></ref>
<ref id="ref33"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Roumeliotis</surname><given-names>K. I.</given-names></name> <name><surname>Tselikas</surname><given-names>N. D.</given-names></name></person-group> (<year>2023</year>). <article-title>Chatgpt and open-ai models: a preliminary review</article-title>. <source>Future Internet</source> <volume>15</volume>:<fpage>192</fpage>. doi: <pub-id pub-id-type="doi">10.3390/fi15060192</pub-id></mixed-citation></ref>
<ref id="ref34"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Shu</surname><given-names>F.</given-names></name> <name><surname>Liao</surname><given-names>Y.</given-names></name> <name><surname>Zhuo</surname><given-names>L.</given-names></name> <name><surname>Xu</surname><given-names>C.</given-names></name> <name><surname>Zhang</surname><given-names>L.</given-names></name> <name><surname>Zhang</surname><given-names>G.</given-names></name></person-group> (<year>2024</year>). <source>Llava-mod: Making llava tiny via Moe knowledge distillation</source>.</mixed-citation></ref>
<ref id="ref35"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Singh</surname><given-names>A.</given-names></name> <name><surname>Hu</surname><given-names>R.</given-names></name> <name><surname>Goswami</surname><given-names>V.</given-names></name> <name><surname>Couairon</surname><given-names>G.</given-names></name> <name><surname>Galuba</surname><given-names>W.</given-names></name> <name><surname>Rohrbach</surname><given-names>M.</given-names></name></person-group> (<year>2022</year>). &#x201C;Flava: A foundational language and vision alignment model,&#x201D; in <italic>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</italic> (15638&#x2013;15650).</mixed-citation></ref>
<ref id="ref36"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Stiennon</surname><given-names>N.</given-names></name> <name><surname>Ouyang</surname><given-names>L.</given-names></name> <name><surname>Wu</surname><given-names>J.</given-names></name> <name><surname>Ziegler</surname><given-names>D.</given-names></name> <name><surname>Lowe</surname><given-names>R.</given-names></name> <name><surname>Voss</surname><given-names>C.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Learning to summarize with human feedback</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>33</volume>, <fpage>3008</fpage>&#x2013;<lpage>3021</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2009.01325</pub-id></mixed-citation></ref>
<ref id="ref37"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Touvron</surname><given-names>H.</given-names></name> <name><surname>Lavril</surname><given-names>T.</given-names></name> <name><surname>Izacard</surname><given-names>G.</given-names></name> <name><surname>Martinet</surname><given-names>X.</given-names></name> <name><surname>Lachaux</surname><given-names>M. A.</given-names></name> <name><surname>Lacroix</surname><given-names>T.</given-names></name></person-group> (<year>2023</year>). <source>Llama: Open and efficient foundation language models</source>.</mixed-citation></ref>
<ref id="ref38"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Yang</surname><given-names>H.</given-names></name> <name><surname>Xu</surname><given-names>M.</given-names></name> <name><surname>Sun</surname><given-names>Z.</given-names></name> <name><surname>Song</surname><given-names>B.</given-names></name> <name><surname>Cheng</surname><given-names>E.</given-names></name></person-group> (<year>2024</year>). &#x201C;CLIP-ViT detector: side adapter with prompt for vision transformer object detection,&#x201D; in <italic>2024 7th International Conference on Algorithms, Computing and Artificial Intelligence (ACAI)</italic> (pp. 1&#x2013;8). IEEE.</mixed-citation></ref>
<ref id="ref04"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yang</surname><given-names>Z.</given-names></name> <name><surname>Li</surname><given-names>L.</given-names></name> <name><surname>Lin</surname><given-names>K.</given-names></name> <name><surname>Wang</surname><given-names>J.</given-names></name> <name><surname>Lin</surname><given-names>C.</given-names></name> <name><surname>Liu</surname><given-names>L.</given-names></name> <etal/></person-group>. (<year>2023</year>). <source>The dawn of LMMs: preliminary explorations with GPT-4V(ision)</source>. arXiv:2309.17421.</mixed-citation></ref>
<ref id="ref39"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Yin</surname><given-names>Z.</given-names></name> <name><surname>Wang</surname><given-names>J.</given-names></name> <name><surname>Cao</surname><given-names>J.</given-names></name> <name><surname>Shi</surname><given-names>Z.</given-names></name> <name><surname>Liu</surname><given-names>D.</given-names></name> <name><surname>Li</surname><given-names>M.</given-names></name></person-group> (<year>2023</year>). <article-title>Lamm: language-assisted multi-modal instruction-tuning dataset, framework, and benchmark</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>36</volume>, <fpage>26650</fpage>&#x2013;<lpage>26685</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2306.06687</pub-id></mixed-citation></ref>
<ref id="ref40"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhao</surname><given-names>Y.</given-names></name> <name><surname>Zhang</surname><given-names>J.</given-names></name> <name><surname>Zong</surname><given-names>C.</given-names></name></person-group> (<year>2023</year>). <article-title>Transformer: a general framework from machine translation to others</article-title>. <source>Mach. Int. Res.</source> <volume>20</volume>, <fpage>514</fpage>&#x2013;<lpage>538</lpage>. doi: <pub-id pub-id-type="doi">10.1007/s11633-022-1393-5</pub-id></mixed-citation></ref>
<ref id="ref41"><mixed-citation publication-type="other"><person-group person-group-type="author"><name><surname>Zhou</surname><given-names>B.</given-names></name> <name><surname>Hu</surname><given-names>Y.</given-names></name> <name><surname>Weng</surname><given-names>X.</given-names></name> <name><surname>Jia</surname><given-names>J.</given-names></name> <name><surname>Luo</surname><given-names>J.</given-names></name> <name><surname>Liu</surname><given-names>X.</given-names></name></person-group> (<year>2024</year>). <source>Tinyllava: A framework of small-scale large multimodal models</source>.</mixed-citation></ref>
<ref id="ref42"><mixed-citation publication-type="journal"><person-group person-group-type="author"><name><surname>Zhou</surname><given-names>Y.</given-names></name> <name><surname>Lei</surname><given-names>T.</given-names></name> <name><surname>Liu</surname><given-names>H.</given-names></name> <name><surname>Du</surname><given-names>N.</given-names></name> <name><surname>Huang</surname><given-names>Y.</given-names></name> <name><surname>Zhao</surname><given-names>V.</given-names></name></person-group> (<year>2022</year>). <article-title>Mixture-of-experts with expert choice routing</article-title>. <source>Adv. Neural Inf. Proces. Syst.</source> <volume>35</volume>, <fpage>7103</fpage>&#x2013;<lpage>7114</lpage>. doi: <pub-id pub-id-type="doi">10.48550/arXiv.2202.09368</pub-id></mixed-citation></ref>
</ref-list>
<fn-group><fn id="fn0001" fn-type="custom" custom-type="edited-by"><p>Edited by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3005013/overview">J. de Curt&#x00F2;</ext-link>, Barcelona Supercomputing Center, Spain</p></fn>
<fn id="fn0002" fn-type="custom" custom-type="reviewed-by"><p>Reviewed by: <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1839716/overview">Lei Chen</ext-link>, Nanjing Forestry University, China; <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/460238/overview">Rohit Shukla</ext-link>, University of Wisconsin-Madison, United States</p></fn>
<fn fn-type="abbr"><label>Abbreviations</label>
<p>LLaVA, Large Language and Vision Assistant; VQA, Visual Question Answering; MoE, Mixture of Experts; LVLM, Large Visual Language Model; MHSA, Multi-Head Self-Attention; FFNN, Feed-Forward Neural Network.</p>
</fn>
</fn-group></back>
</article>