<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="EN">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Digit. Health</journal-id>
<journal-title>Frontiers in Digital Health</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Digit. Health</abbrev-journal-title>
<issn pub-type="epub">2673-253X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fdgth.2025.1478688</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Digital Health</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Skin disease diagnosis using decision and feature level fusion of deep features</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name><surname>Zasim Uddin</surname><given-names>Md.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="corresp" rid="cor1">&#x002A;</xref>
<xref ref-type="author-notes" rid="an1"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2812237/overview" />
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/><role content-type="https://credit.niso.org/contributor-roles/data-curation/"/><role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/><role content-type="https://credit.niso.org/contributor-roles/methodology/"/><role content-type="https://credit.niso.org/contributor-roles/resources/"/><role content-type="https://credit.niso.org/contributor-roles/software/"/><role content-type="https://credit.niso.org/contributor-roles/visualization/"/><role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/><role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/></contrib>
<contrib contrib-type="author" equal-contrib="yes"><name><surname>Arif Shahriar</surname><given-names>Md.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="author-notes" rid="an1"><sup>&#x2020;</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2814594/overview" />
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/><role content-type="https://credit.niso.org/contributor-roles/software/"/><role content-type="https://credit.niso.org/contributor-roles/visualization/"/><role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/><role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/></contrib>
<contrib contrib-type="author"><name><surname>Schuller</surname><given-names>Bj&#x00F6;rn W.</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref><role content-type="https://credit.niso.org/contributor-roles/data-curation/"/><role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/><role content-type="https://credit.niso.org/contributor-roles/methodology/"/><role content-type="https://credit.niso.org/contributor-roles/validation/"/><role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/><role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/></contrib>
<contrib contrib-type="author"><name><surname>Nadim Mahamood</surname><given-names>Md.</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="https://loop.frontiersin.org/people/2814129/overview" />
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/><role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/><role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/></contrib>
<contrib contrib-type="author"><name><surname>Atiqur Rahman Ahad</surname><given-names>Md.</given-names></name>
<xref ref-type="aff" rid="aff4"><sup>4</sup></xref><role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/><role content-type="https://credit.niso.org/contributor-roles/data-curation/"/><role content-type="https://credit.niso.org/contributor-roles/funding-acquisition/"/><role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/><role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/></contrib>
</contrib-group>
<aff id="aff1"><label><sup>1</sup></label><institution>Department of Computer Science and Engineering, Begum Rokeya University</institution>, <addr-line>Rangpur</addr-line>, <country>Bangladesh</country></aff>
<aff id="aff2"><label><sup>2</sup></label><institution>MRI, CHI&#x2014;Chair of Health Informatics, Technische Universit&#x00E4;t M&#x00FC;nchen</institution>, <addr-line>Munich</addr-line>, <country>Germany</country></aff>
<aff id="aff3"><label><sup>3</sup></label><institution>GLAM, Imperial College London</institution>, <addr-line>London</addr-line>, <country>United Kingdom</country></aff>
<aff id="aff4"><label><sup>4</sup></label><institution>Department of Computer Science and Digital Technology, University of East London</institution>, <addr-line>London</addr-line>, <country>United Kingdom</country></aff>
<author-notes>
<fn fn-type="edited-by"><p><bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/992002/overview">Hazrat Ali</ext-link>, University of Stirling, United Kingdom</p></fn>
<fn fn-type="edited-by"><p><bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1714608/overview">Sandeep Poddar</ext-link>, Lincoln University College, Malaysia </p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3071098/overview">Mohamed Lachgar</ext-link>, Cadi Ayyad University, Morocco </p>
<p><ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/3191328/overview">Wei Dai</ext-link>, City University of Hong Kong, Hong Kong SAR, China</p></fn>
<corresp id="cor1"><label>&#x002A;</label><bold>Correspondence:</bold> Md. Zasim Uddin <email>zasim@brur.ac.bd</email></corresp>
<fn fn-type="equal" id="an1"><label><sup>&#x2020;</sup></label><p>These authors have contributed equally to this work</p></fn>
</author-notes>
<pub-date pub-type="epub"><day>17</day><month>10</month><year>2025</year></pub-date>
<pub-date pub-type="collection"><year>2025</year></pub-date>
<volume>7</volume><elocation-id>1478688</elocation-id>
<history>
<date date-type="received"><day>10</day><month>08</month><year>2024</year></date>
<date date-type="accepted"><day>25</day><month>09</month><year>2025</year></date>
</history>
<permissions>
<copyright-statement>&#x00A9; 2025 Zasim Uddin, Arif Shahriar, Schuller, Nadim Mahamood and Atiqur Rahman Ahad.</copyright-statement>
<copyright-year>2025</copyright-year><copyright-holder>Zasim Uddin, Arif Shahriar, Schuller, Nadim Mahamood and Atiqur Rahman Ahad</copyright-holder><license license-type="open-access" xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the <ext-link ext-link-type="uri" xlink:href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution License (CC BY)</ext-link>. The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract><sec><title>Introduction</title>
<p>Early skin disease diagnosis is essential and one of the challenging tasks for a dermatologist. Manual diagnosis by healthcare providers is subjective, costly, and may yield inconsistent results. In contrast, automated skin disease detection and classification using traditional machine learning and deep learning approaches have shown promise in addressing this problem.</p>
</sec><sec><title>Methods</title>
<p>In this study, we propose a hybrid ensemble framework that integrates both feature-level fusion (FLF) and decision-level fusion (DLF) to leverage complementary strengths for detecting and classifying skin diseases. We employ two convolutional neural network (CNN)-based models, i.e., a modified DenseNet201 and VGG19, along with an attention-based vision transformer (ViT) model to identify and classify skin diseases. In FLF, feature representations from these models are point-wise added and passed through a shared classification head to make the final prediction. In DLF, decisions from each base model are collected, and the majority voting scheme is used to make a final decision. Furthermore, we incorporate a generative adversarial network (GAN)-based approach for offline training data augmentation to reduce overfitting and improve performance.</p>
</sec><sec><title>Results</title>
<p>Based on different evaluation metrics (i.e., accuracy, precision, recall, and F1-score), our proposed framework demonstrates superior performance on four benchmark datasets: the PH2, HAM10000, ISIC 2018, and ISIC 2019 datasets, with an accuracy of 99.3&#x0025;/99.2&#x0025;, 92.7&#x0025;/96.1&#x0025;, 86.7&#x0025;/89.0&#x0025;, and 94.5&#x0025;/95.0&#x0025;, respectively, for FLF/DLF.</p>
</sec><sec><title>Discussion</title>
<p>These results demonstrate that while both fusion strategies are effective, DLF slightly outperforms FLF, emphasizing the value of ensemble decision aggregation for robust skin disease classification.</p>
</sec>
</abstract>
<kwd-group>
<kwd>skin disease diagnosis</kwd>
<kwd>deep learning</kwd>
<kwd>feature-level fusion</kwd>
<kwd>decision-level fusion</kwd>
<kwd>GAN</kwd>
<kwd>classification</kwd>
</kwd-group><contract-num rid="cn001">1280101-120008431-3631108</contract-num><contract-sponsor id="cn001">ICT division</contract-sponsor><contract-sponsor id="cn002">Center for Natural Science and Engineering Research (CNSER)</contract-sponsor><counts>
<fig-count count="10"/>
<table-count count="9"/><equation-count count="82"/><ref-count count="100"/><page-count count="19"/><word-count count="0"/></counts><custom-meta-wrap><custom-meta><meta-name>section-at-acceptance</meta-name><meta-value>Health Technology Implementation</meta-value></custom-meta></custom-meta-wrap>
</article-meta>
</front>
<body><sec id="s1" sec-type="intro"><label>1</label><title>Introduction</title>
<p>The human skin is the largest and most powerful organ in the body. It guards the body against outer temperature, ultraviolet rays, and harmful chemicals. Furthermore, the skin produces essential vitamin D in the human body. However, the human skin suffers from different causes, namely pollution, poor immunity, viruses, alcohol, unhealthy lifestyles, and ultraviolet light. Therefore, various diseases affect the human skin (<xref ref-type="bibr" rid="B1">1</xref>). Skin diseases are important public health problems that prevail in almost all age groups and are one of the most widespread kinds of illnesses worldwide (<xref ref-type="bibr" rid="B2">2</xref>). In the current context, diagnosing diseases still necessitates self-monitoring and regular medical examinations. In most cases, skin diseases can be tackled without any special treatment, whereas some of them lead to cancer and are life-threatening. The World Health Organization (WHO) reports that by the age of 70, one in five Americans will receive a diagnosis of skin cancer, with approximately 95,000 new cases being diagnosed daily in the US alone (<xref ref-type="bibr" rid="B3">3</xref>).</p>
<p>Early detection and treatment of skin disease are essential for reducing patient suffering and improving outcomes (<xref ref-type="bibr" rid="B4">4</xref>); otherwise, it may advance, possibly spread, and penetrate deeper layers of the skin, resulting in more severe stages of the condition (<xref ref-type="bibr" rid="B5">5</xref>). In extreme circumstances, skin diseases can lead to serious outcomes, including hindrance of daily functions, breakdown of relationships, and harm to internal organs, even death in cases like melanoma (a skin disease primarily characterized by the abnormal growth of melanocytes). Furthermore, they present a genuine risk of mental health issues such as isolation, depression, and potentially even suicide. However, if diagnosed early and properly treated, the survival rate can be as high as 97.0&#x0025; (<xref ref-type="bibr" rid="B6">6</xref>).</p>
<p>For early diagnosis of skin disease, self-examination is a crucial step (<xref ref-type="bibr" rid="B7">7</xref>). The American Center for the Study of Dermatology developed an ABCD guideline so that individuals can be vigilant in recognizing asymmetry, wavy borders, color changes, and diameter on their skin (<xref ref-type="bibr" rid="B8">8</xref>). Later, manual diagnoses are employed to detect skin diseases by dermatologists or other healthcare providers. Dermoscopy is one of the very popular techniques (<xref ref-type="bibr" rid="B9">9</xref>) to detect skin disease by magnifying and lighting the skin surface and underlying structures (<xref ref-type="bibr" rid="B10">10</xref>). For further investigations, dermatologists may perform a skin biopsy for pathological examination if it is required (<xref ref-type="bibr" rid="B11">11</xref>). However, these types of manual diagnosis heavily rely on visual interpretation and subjective judgment. Particularly, clinicians with varying levels of experience, knowledge, and diagnostic abilities may obtain inconsistent diagnoses of skin diseases. Furthermore, it is costly and necessitates the use of specialized medical diagnostic tools such as dedicated laser-based devices, micro-spectroscopy, and other dermoscopy tools to locate the lesion (<xref ref-type="bibr" rid="B12">12</xref>).</p>
<p>To tackle this challenge and alleviate the burden of clinicians, automated computer vision and machine learning techniques have been used to build computer-aided diagnosis (CAD) systems for skin disease detection and diagnosis (<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>). The use of CAD is convenient, less expensive, and faster (<xref ref-type="bibr" rid="B15">15</xref>), and CAD systems can be divided into two categories: traditional machine learning (ML) and modern deep learning (DL)-based methods. Traditional ML approaches rely on hand-crafted features, typically involving pre-processing and extracting features like texture, color, size, and shape, followed by classification using methods such as gradient boosting, support vector machines (SVM), or artificial neural networks (ANN). Different ML-based approaches were employed for skin disease diagnosis in the literature; for example, Ahammed et al. (<xref ref-type="bibr" rid="B16">16</xref>) utilized Decision Tree (DT), SVM, and K-Nearest Neighbor (KNN) models for skin disease detection and classification. Similarly, Jagdish et al. (<xref ref-type="bibr" rid="B17">17</xref>) employed KNN and SVM with wavelet analysis for skin disease detection and classification. However, applying these traditional ML methods to new, unfamiliar scenarios is often challenging.</p>
<p>In contrast, DL-based methods are convenient as they can automatically extract features and reduce errors, leading to better performance (<xref ref-type="bibr" rid="B18">18</xref>). They have produced promising results for the detection and classification of skin disease (<xref ref-type="bibr" rid="B19">19</xref>&#x2013;<xref ref-type="bibr" rid="B22">22</xref>). For example, Abd et al. (<xref ref-type="bibr" rid="B22">22</xref>) developed a robust DL-based model for the classification of skin disease that uses MobileNetV3 for feature extraction. Khan et al. (<xref ref-type="bibr" rid="B20">20</xref>) used deep convolutional neural network-based models such as VGG and AlexNet to classify skin disease. Similarly, Brinker et al. (<xref ref-type="bibr" rid="B19">19</xref>) used the residual network ResNet50 for skin disease classification. However, most studies rely on a single end-to-end model; such models are prone to overfitting, which limits their adaptability and generalizability to other unfamiliar datasets.</p>
<p>To overcome these limitations, we propose a DL-based ensemble framework that classifies skin disease using feature-level fusion (FLF) in an end-to-end manner and decision-level fusion (DLF) in a non-end-to-end manner. FLF merges feature representations before classification, allowing the model to learn richer, more fine-grained complementary information of lesions in a shared space, whereas DLF aggregates final predictions from multiple base models, reducing bias from a single base model. Using both allows the system to benefit from joint representation learning for FLF while still leveraging the robustness of majority voting for the DLF. More specifically, we demonstrate that DLF slightly outperforms FLF on most benchmarks, but the combination offers insights into which level of fusion is more beneficial for specific datasets. The contributions of this study are summarized as follows:</p>
<p>
<list list-type="simple">
<list-item><label>&#x2022;</label>
<p>We introduce a comprehensive end-to-end ensemble framework for diagnosing skin diseases, comprising two CNN-based and an attention-based vision transformer model. The features extracted from these base models are fused at a feature level to generate conclusive features in the final layer and employ Softmax for diagnosis. In addition, the individual classifiers&#x2019; decisions are merged using a majority voting technique to make the final decision for the skin disease diagnosis.</p></list-item>
<list-item><label>&#x2022;</label>
<p>We utilize data augmentation with a deep generative adversarial network (GAN) to produce additional training data. Through empirical investigations on the benchmark datasets, we observe a notable improvement in the performance using data augmentation.</p></list-item>
<list-item><label>&#x2022;</label>
<p>We evaluate the proposed framework on four publicly available skin disease datasets: PH2, HAM10000, ISIC 2018, and ISIC 2019. The results demonstrate that the proposed framework achieves superior performance across various metrics such as accuracy, precision, recall, and F1-score.</p></list-item>
</list></p>
</sec>
<sec id="s2"><label>2</label><title>Related work</title>
<sec id="s2a"><label>2.1</label><title>CNN-based approaches</title>
<p>Convolutional neural networks (CNNs) have been remarkably efficient methods for handling pre-processing, extracting features, and performing classification in various domains of computer vision, including biometrics (<xref ref-type="bibr" rid="B23">23</xref>, <xref ref-type="bibr" rid="B24">24</xref>), medical imaging, as well as diagnosis of skin diseases (<xref ref-type="bibr" rid="B25">25</xref>&#x2013;<xref ref-type="bibr" rid="B29">29</xref>). Some studies (<xref ref-type="bibr" rid="B25">25</xref>, <xref ref-type="bibr" rid="B26">26</xref>) propose dedicated CNN architectures for skin disease classification. For example, Shanthi et al. (<xref ref-type="bibr" rid="B25">25</xref>) implemented an architecture consisting of 11 layers, incorporating convolution, pooling, fully connected (FC) layers, and Softmax for classification. On the other hand, four convolutional layers, two max-pooling layers, one FC layer, and three dense layers are found in (<xref ref-type="bibr" rid="B26">26</xref>). By contrast, some studies (<xref ref-type="bibr" rid="B27">27</xref>, <xref ref-type="bibr" rid="B30">30</xref>, <xref ref-type="bibr" rid="B31">31</xref>) employed existing pre-trained models for the classification of skin disease. For example, Muhaba et al. (<xref ref-type="bibr" rid="B27">27</xref>) utilized a pre-trained MobileNet CNN model and demonstrated it on a dataset collected from a clinic using different smartphone cameras. In contrast, the studies in (<xref ref-type="bibr" rid="B32">32</xref>) used four different CNN-based models: DenseNet121, ResNet50, VGG16, and ResNet18, and demonstrated on the HAM10000 dataset and found out that the ResNet50 obtained the best accuracy at 90.0&#x0025;. Furthermore, Kousis et al. (<xref ref-type="bibr" rid="B30">30</xref>) conducted a study on the identification of skin lesions using 11 different CNN architectures. 
They demonstrated the classification of seven different types of skin lesions on the HAM10000 dataset, where the DenseNet169 model achieved the best performance among the end-to-end CNN architectures, with accuracy, sensitivity, and F1-score of 92.2&#x0025;, 93.6&#x0025;, and 93.3&#x0025;, respectively. Similarly, Mondal et al. (<xref ref-type="bibr" rid="B31">31</xref>) utilized a modified DenseNet201 by replacing the last layers with a single global average pooling layer, five FC layers, dropout, and finally, one Softmax layer for classification and showed that it outperforms the existing DenseNet169 and DenseNet121 models, gaining 13.8&#x0025; more accuracy than the non-modified DenseNet201 on the HAM10000 dataset. Similarly, Karthik et al. (<xref ref-type="bibr" rid="B33">33</xref>) have proposed a modification to the EfficientNet V2 model for the classification of skin disease. Specifically, they replaced the standard Squeeze-and-Excite block with an Efficient Channel Attention block. Shan et al. (<xref ref-type="bibr" rid="B34">34</xref>) introduced a Convolutional Block Attention Module (CBAM) and used it in combination with DenseNet121 to enhance the feature representation capabilities. Additionally, they utilized an improved focal loss algorithm to deal with data imbalance effectively. These modifications have shown promising results in improving the performance of the model, achieving an AUC of 0.99 on the HAM10000 dataset. Similarly, Raghavendra et al. (<xref ref-type="bibr" rid="B35">35</xref>) used a model with CNN and a global average pooling layer to classify skin diseases. They also implemented the black hat filtering approach and the resampling technique to remove artifacts and increase data, which helped the model achieve an accuracy of 97.2&#x0025; on the HAM10000 dataset.</p>
<p>In addition, several studies explored the uses of CNN-based models for feature extraction. For example, the studies in (<xref ref-type="bibr" rid="B22">22</xref>, <xref ref-type="bibr" rid="B36">36</xref>) implement a lightweight MobileNet for feature extraction. Additionally, the authors used Long Short-term Memory (LSTM) in (<xref ref-type="bibr" rid="B36">36</xref>) and the Artificial Rabbits Optimizer in (<xref ref-type="bibr" rid="B22">22</xref>) and achieved an accuracy at 87.2&#x0025;, 96.8&#x0025;, and 88.7&#x0025; on the ISIC 2016, PH2, and HAM10000 datasets, respectively, while an accuracy of 85.3&#x0025; was reached in (<xref ref-type="bibr" rid="B36">36</xref>) on the HAM10000 dataset. Similarly, Yu et al. (<xref ref-type="bibr" rid="B37">37</xref>) employed ResNet50 to extract features and obtain the global feature descriptor using a fisher vector and finally classified skin diseases using SVM with a Chi-squared kernel. They validated their model on the ISBI 2016 challenge dataset, achieving accuracy and AUC at 86.8&#x0025; and 85.2&#x0025;, respectively. Similarly, Hameed et al. (<xref ref-type="bibr" rid="B29">29</xref>) utilized AlexNet for feature extraction and an SVM for classification. They evaluated a privately collected dataset and discovered that their approach achieved an accuracy of 86.2&#x0025;. On the other hand, Seeja et al. (<xref ref-type="bibr" rid="B38">38</xref>) used U-Net in conjunction with an SVM for classification, demonstrating its effectiveness on the ISBI 2016 dataset. Their approach achieved an accuracy of 85.2&#x0025;, precision of 42.6&#x0025;, recall of 50.0&#x0025;, and F1-score of 46.0&#x0025;. In contrast, Bandyopadhyay et al. (<xref ref-type="bibr" rid="B39">39</xref>) employed AlexNet, GoogLeNet, ResNet50, and VGG16 for feature extraction, and used SVM, AdaBoost, and Decision Tree classifiers for classification using the ISIC 2016 challenge dataset.</p>
<p>Some studies employed segmentation techniques to segment the area of the disease lesion, and subsequently, lesions were utilized to enhance the classification accuracy. Son et al. (<xref ref-type="bibr" rid="B40">40</xref>) proposed a two-stage approach to classify skin diseases. In the first stage, they implement a U-net architecture to decompose and normalize the input images, generating a segmentation map of the skin lesion. In the second stage, they introduce EfficientNets to classify the segmented images. This approach showed promising results in accurately identifying various skin diseases. Similarly, Adla et al. (<xref ref-type="bibr" rid="B41">41</xref>) utilized Tsallis entropy-based segmentation to detect the lesion area. Later, the classification of segmented lesions was done using a convolutional sparse Autoencoder. Furthermore, Kalpana et al. (<xref ref-type="bibr" rid="B28">28</xref>) segmented the malignant lesion using a threshold-based technique and classified it through an ensemble model with an SVM classifier and a random forest kernel. In addition, Zhu et al. (<xref ref-type="bibr" rid="B42">42</xref>) employed a CNN-based model for both binary classification (i.e., benign vs. malignant) and multiclass classification using high-frequency ultrasound images of skin lesions.</p>
<p>All of the single end-to-end or custom CNN-based models use a traditional convolutional approach and may therefore extract features in a similar way, leading to robustness on a single dataset but poorer generalization to other datasets (<xref ref-type="bibr" rid="B43">43</xref>). However, end-to-end methods are necessary for real-world applications because they can automatically extract relevant features directly from raw data, reduce multiple processing stages, and make decisions based on the features, which is particularly necessary where manual feature extraction is challenging, for example, skin disease detection and identification. In this study, we propose an ensemble framework and perform experiments end-to-end as a feature-level fusion and a decision-level fusion.</p>
</sec>
<sec id="s2b"><label>2.2</label><title>Vision transformer-based approaches</title>
<p>The Vision Transformer (ViT) (<xref ref-type="bibr" rid="B44">44</xref>)-based approach represents attention-based architectures showcasing the effectiveness of attention mechanisms in capturing extensive spatial relationships within images. These models partition an image into non-overlapping patches of fixed size, subsequently transforming them into a sequence of vectors through linear embedding. Similar to CNN-based approaches, ViT models are widely used for segmentation (<xref ref-type="bibr" rid="B45">45</xref>), detection and classification (<xref ref-type="bibr" rid="B46">46</xref>), as well as for skin disease diagnosis and classification (<xref ref-type="bibr" rid="B12">12</xref>, <xref ref-type="bibr" rid="B47">47</xref>, <xref ref-type="bibr" rid="B48">48</xref>). For example, Aladhadh et al. (<xref ref-type="bibr" rid="B12">12</xref>) employed a ViT model along with data augmentation for skin cancer diagnosis. They demonstrated on the HAM10000 dataset and found that the ViT-based model obtained better accuracy than CNN-based approaches for the classification of skin cancer with accuracy, precision, sensitivity, and F1-score at 96.1&#x0025;, 96.0&#x0025;, 96.5&#x0025;, and 97.0&#x0025;, respectively. Similarly, Xin et al. (<xref ref-type="bibr" rid="B47">47</xref>) introduced a framework including a multi-scale vision transformer and multi-scale patch embedding technique to improve the image features and finally apply contrastive learning for skin disease classification. Their proposed approach obtained accuracy, precision, and AUC at 94.3&#x0025;, 94.1&#x0025;, and 98.0&#x0025;, respectively, on the HAM10000 dataset. Further, Nie et al. 
(<xref ref-type="bibr" rid="B49">49</xref>) employed a two-stage model comprising a CNN-based module to extract local and low-level features and a ViT model to capture the high-level semantic information from these features; finally, a multi-layer perceptron (MLP) head was used for the classification of skin disease. Their model achieved accuracy, precision, recall, and F1-score at 89.5&#x0025;, 89.6&#x0025;, 89.5&#x0025;, and 89.1&#x0025;, respectively, on the HAM10000 dataset. In addition, Dai et al. (<xref ref-type="bibr" rid="B48">48</xref>) introduced the HierAttn model, which uses a multi-stage and multi-branch attention mechanism to simultaneously learn local and global contextual features while maintaining a lightweight architecture. This is particularly suitable for real-time and mobile-based applications in skin disease diagnosis and classification.</p>
</sec>
<sec id="s2c"><label>2.3</label><title>Fusion-based approaches</title>
<p>Feature-level fusion (FLF) and decision-level fusion (DLF) are the most commonly used techniques for ensemble learning for skin disease diagnosis. In FLF, concatenation or pointwise addition of the extracted features from the multiple base models takes place. In contrast, in DLF, the decisions of the base classifiers are averaged or combined by majority voting for the final decision. Regarding FLF, Wang et al. (<xref ref-type="bibr" rid="B50">50</xref>) introduced a multiscale feature fusion model for classifying skin disease using DenseNet121 and an improved VGG16. They demonstrated its performance on the HAM10000 dataset, achieving an accuracy of 91.2&#x0025;, while Gairola et al. (<xref ref-type="bibr" rid="B51">51</xref>) introduced a multi-feature fusion approach using different deep networks to improve accuracy. Similarly, Elashiri et al. (<xref ref-type="bibr" rid="B52">52</xref>) extracted features from ResNet50, VGG16, and Deeplabv3 and concatenated them at the feature level. These concatenated features were sent to the feature transformation stage for weighted feature extraction, and finally, LSTM was employed for classification. They evaluated their approach on the PH2 and HAM10000 datasets and obtained accuracies of 93.5&#x0025; and 93.8&#x0025;, respectively. Similarly, Afza et al. (<xref ref-type="bibr" rid="B53">53</xref>) introduced an approach comprising image acquisition and contrast enhancement, feature extraction using deep learning, selection of the best features using entropy-mutual information, and fusion using a modified canonical correlation. They evaluated their framework on the HAM10000 and ISIC 2018 datasets and found that it achieved an accuracy of 93.4&#x0025; on both datasets.</p>
<p>In contrast, Dang et al. (<xref ref-type="bibr" rid="B54">54</xref>) proposed an ensemble model comprising five CNN-based models: Inception-v3, DenseNet169, ResNet50, Inception-ResNet-v2, and Xception, along with Squeeze-and-Excitation Blocks to emphasize informative features. They employed majority voting for decision-level fusion. They obtained accuracy, precision, recall, F1-score, and AUC at 90.9&#x0025;, 85.9&#x0025;, 80.8&#x0025;, 82.8&#x0025;, and 91.1&#x0025;, respectively, on the ISIC 2017 dataset. Similarly, Harangi (<xref ref-type="bibr" rid="B55">55</xref>) proposed an ensemble model where they considered four CNN-based methods: VGG, ResNet, GoogLeNet, and AlexNet. They employed the weighted average technique for the final prediction of the skin disease. They achieved an AUC of 0.891 on the official test dataset of the IEEE International Symposium on Biomedical Imaging (ISBI) 2017 challenge on Skin Lesion Analysis Towards Melanoma Detection. We observe that most of the existing methods employ either FLF or DLF; in this study, however, we extensively study both FLF and DLF within our ensemble framework.</p>
</sec>
</sec>
<sec id="s3"><label>3</label><title>Methodology</title>
<sec id="s3a"><label>3.1</label><title>Overview</title>
<p>In this study, we propose a novel ensemble framework that leverages the complementary strengths of three modules to extract smart features: two are CNN-based, and the third is an attention-based Vision Transformer (ViT). An overview of the proposed framework is presented in <xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>. The CNN-based modules of our framework are a modified DenseNet201 (<xref ref-type="bibr" rid="B56">56</xref>) and VGG19 (<xref ref-type="bibr" rid="B57">57</xref>), while the attention-based module is a Vision Transformer (ViT) (<xref ref-type="bibr" rid="B44">44</xref>). The features extracted by the three modules are fused, and the fused features are subsequently fed into a fully connected embedding layer. Finally, a single-layer classification network with a Softmax activation function is employed. This network calculates the cross-entropy loss for end-to-end classification, realizing feature-level fusion (FLF). Additionally, for decision-level fusion (DLF), the decisions of the individual models are fused by majority voting to obtain the final decision.</p>
<fig id="F1" position="float"><label>Figure 1</label>
<caption><p>Overview of our proposed ensemble framework for the feature extraction (left side) and classification (right side) of skin disease. For feature extraction, the framework includes three modules: two CNN-based models and an attention-based model. For classification, the framework includes feature-level fusion (FLF) and decision-level fusion (DLF). The extracted features from base models are fused as pointwise addition for FLF, while majority voting techniques are employed for DLF.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g001.tif"><alt-text content-type="machine-generated">Diagram showing a framework for skin lesion classification. It includes stages like input image processing, feature extraction through convolutional and dense blocks, multi-head attention, and classification through decision-level and feature-level fusion, leading to final class determination.</alt-text>
</graphic>
</fig>
<sec id="s3a1"><label>3.1.1</label><title>CNN-based model</title>
<p>Our CNN-based approaches are based on the modified DenseNet201 and VGG19 architectures.</p>
<p><italic>DenseNet</italic> (<xref ref-type="bibr" rid="B56">56</xref>) is a highly parameter-efficient CNN-based model. It reuses the features from different layers, which increases the variety of input for subsequent layers. Additionally, it prevents vanishing gradients by dense connections between layers and also ensures no loss of information (<xref ref-type="bibr" rid="B58">58</xref>) and efficient memory consumption (<xref ref-type="bibr" rid="B59">59</xref>). DenseNet has different versions, which are categorized based on the number of layers. In our proposed framework, we exploit DenseNet201, which consists of 201 layers. The fundamental component of DenseNet is a defined number of dense blocks along with a transition block. First, an input image <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM1"><mml:mi>X</mml:mi></mml:math></inline-formula> with spatial resolution <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM2"><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi></mml:math></inline-formula>, where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM3"><mml:mi>H</mml:mi></mml:math></inline-formula> and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM4"><mml:mi>W</mml:mi></mml:math></inline-formula> stand for height and width, respectively, is passed through a <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM5"><mml:mn>7</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>7</mml:mn></mml:math></inline-formula> convolution and a <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM6"><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:math></inline-formula> max-pooling layer and produces an output feature map <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML"
id="IM7"><mml:msubsup><mml:mi>Z</mml:mi><mml:mn>0</mml:mn><mml:mn>0</mml:mn></mml:msubsup></mml:math></inline-formula> with dimension <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM8"><mml:msubsup><mml:mi>M</mml:mi><mml:mn>0</mml:mn><mml:mn>0</mml:mn></mml:msubsup><mml:mo>&#x00D7;</mml:mo><mml:msubsup><mml:mi>N</mml:mi><mml:mn>0</mml:mn><mml:mn>0</mml:mn></mml:msubsup></mml:math></inline-formula>, and can be expressed as:<disp-formula id="disp-formula1"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM1"><mml:msubsup><mml:mi>Z</mml:mi><mml:mn>0</mml:mn><mml:mn>0</mml:mn></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula>where, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM9"><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is convolution, while <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM10"><mml:mrow><mml:mi>M</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> stands for max-pooling. Then, the feature map passes through several dense blocks and transition layers. In a dense block, each layer takes input from all preceding layers. 
Each dense block begins with a bottleneck layer, a <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM11"><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> convolution layer, which decreases the number of channels in the input feature maps, followed by a <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM12"><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:math></inline-formula> convolution layer that is densely interconnected. For the <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM13"><mml:mi>k</mml:mi></mml:math></inline-formula>th block, it can be expressed as follows:<disp-formula id="disp-formula2"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM2"><mml:msubsup><mml:mi>Z</mml:mi><mml:mi>k</mml:mi><mml:mi>l</mml:mi></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mi>H</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>t</mml:mi></mml:mrow><mml:mo stretchy="false">[</mml:mo><mml:msubsup><mml:mi>Z</mml:mi><mml:mi>k</mml:mi><mml:mn>0</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mi>Z</mml:mi><mml:mi>k</mml:mi><mml:mn>1</mml:mn></mml:msubsup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msubsup><mml:mi>Z</mml:mi><mml:mi>k</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo stretchy="false">]</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula>where, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM14"><mml:msub><mml:mi>H</mml:mi><mml:mi>k</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is a non-linear transformation that comprises batch normalization, ReLU, and convolution, and generates 
a feature map <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM15"><mml:msubsup><mml:mi>Z</mml:mi><mml:mi>k</mml:mi><mml:mi>l</mml:mi></mml:msubsup></mml:math></inline-formula> with dimension <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM16"><mml:msubsup><mml:mi>M</mml:mi><mml:mi>k</mml:mi><mml:mi>l</mml:mi></mml:msubsup><mml:mo>&#x00D7;</mml:mo><mml:msubsup><mml:mi>N</mml:mi><mml:mi>k</mml:mi><mml:mi>l</mml:mi></mml:msubsup></mml:math></inline-formula> in the <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM17"><mml:mi>l</mml:mi></mml:math></inline-formula>th layer along with the <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM18"><mml:mi>k</mml:mi></mml:math></inline-formula>th dense block, while <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM19"><mml:mrow><mml:mi mathvariant="normal">cat</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> is the concatenation of all preceding layers&#x2019; feature map <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM20"><mml:msup><mml:mi>Z</mml:mi><mml:mn>0</mml:mn></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mi>Z</mml:mi><mml:mn>1</mml:mn></mml:msup><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:msup><mml:mi>Z</mml:mi><mml:mrow><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, respectively, for the layers <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM21"><mml:mn>0</mml:mn><mml:mo>,</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>,</mml:mo><mml:mi>l</mml:mi><mml:mo>&#x2212;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula>.</p>
<p>Furthermore, a transition block is introduced between dense blocks to reduce the size of the feature maps and the number of channels. The transition block includes a batch normalization (BN) layer, a <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM22"><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>1</mml:mn></mml:math></inline-formula> convolutional layer, and an average pooling layer with a stride of <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM23"><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula>. Later, a fully connected (FC) layer with dimension <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM24"><mml:mi>D</mml:mi><mml:msub><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula> is added to extract features that are fused with the other features [in <xref ref-type="disp-formula" rid="disp-formula6">Equation 1</xref>] for an end-to-end feature-level fusion (FLF). Regarding the decision-level fusion (DLF), a global average pooling (GAP) layer is exploited to aggregate the spatial information into a fixed-length feature vector, followed by a Softmax layer for classification.</p>
<p><italic>VGG19</italic> (<xref ref-type="bibr" rid="B57">57</xref>) is one of the most widely explored methods for image classification. A series of stacked convolutional layers is the foundation of the VGG19 structure, which is then followed by FC layers. The network is made up of 16 convolutional layers, divided into five blocks, followed by three FC layers with ReLU activation. Each convolution uses a <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM25"><mml:mn>3</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>3</mml:mn></mml:math></inline-formula> kernel, and each pooling layer uses a <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM26"><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>2</mml:mn></mml:math></inline-formula> window. First, the input image, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM27"><mml:mi>X</mml:mi></mml:math></inline-formula>, is passed through Block A, which consists of two consecutive convolutions with 64 channels followed by max-pooling, and generates a feature map, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM28"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, with dimension <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM29"><mml:msub><mml:mi>K</mml:mi><mml:mn>2</mml:mn></mml:msub><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mn>2</mml:mn></mml:msub></mml:math></inline-formula>, which is outlined as:<disp-formula id="disp-formula3"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM3"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mi>M</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mo
stretchy="false">(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula>where, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM30"><mml:mrow><mml:mi>C</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi><mml:mi>v</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula>, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM31"><mml:mrow><mml:mi>M</mml:mi><mml:mi mathvariant="normal">&#x005F;</mml:mi><mml:mi>P</mml:mi><mml:mi>o</mml:mi><mml:mi>o</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:math></inline-formula> represents convolution and max-pooling respectively. 
Afterward, this feature map is passed through Block B in the same way as Block A, generating a feature map <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM32"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, and sent to the Block C consisting of four consecutive convolutions followed by max-pooling along with 256 channels, generates a feature map <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM33"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> with a size of <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM34"><mml:msub><mml:mi>K</mml:mi><mml:mn>4</mml:mn></mml:msub><mml:mo>&#x00D7;</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mn>4</mml:mn></mml:msub></mml:math></inline-formula>. Similarly, the feature map <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM35"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>3</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula> is sent to Block D with four consecutive convolutions followed by max-pooling along with 512 channels, and generates a feature map <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM36"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>4</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, then this is sent to Block E and generates the final feature map <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM37"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mi>i</mml:mi><mml:mn>5</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>, using in the same way as Block D. For more information, please follow the original paper (<xref ref-type="bibr" rid="B57">57</xref>).</p>
<p>Finally, we added an FC layer with dimension <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM38"><mml:mi>V</mml:mi><mml:msub><mml:mi>g</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula> after Block E; its output encodes rich spatial information and is fused with the other features for the FLF in <xref ref-type="disp-formula" rid="disp-formula6">Equation 1</xref>. By contrast, for the DLF, we employed a global average pooling (GAP) layer to aggregate the spatial information into a fixed-length feature vector, followed by a Softmax layer for classification, as shown in <xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>.</p>
</sec>
<sec id="s3a2"><label>3.1.2</label><title>Attention-based model</title>
<p>Our framework employs a Vision Transformer-based model (<xref ref-type="bibr" rid="B44">44</xref>), which applies the standard multi-head self-attention (MHSA) mechanism originally introduced for natural language processing (<xref ref-type="bibr" rid="B60">60</xref>). The input image <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM39"><mml:mi>X</mml:mi><mml:mo>&#x2208;</mml:mo><mml:msup><mml:mrow><mml:mrow><mml:mi mathvariant="double-struck">R</mml:mi></mml:mrow></mml:mrow><mml:mrow><mml:mi>H</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>W</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>C</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula> is reshaped into <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM40"><mml:mi>N</mml:mi><mml:mo>=</mml:mo><mml:mi>H</mml:mi><mml:mi>W</mml:mi><mml:mo>/</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mn>2</mml:mn></mml:msup></mml:math></inline-formula> non-overlapping patches of size <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM41"><mml:mi>P</mml:mi><mml:mo>&#x00D7;</mml:mo><mml:mi>P</mml:mi></mml:math></inline-formula>, which are linearly projected to a latent dimension <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM42"><mml:mi>D</mml:mi></mml:math></inline-formula> and prepended with a learnable class token <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM43"><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">cls</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula>.
A positional embedding <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM44"><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">pos</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:math></inline-formula> is added to preserve spatial relationships; the resulting input sequence can be represented as:<disp-formula id="disp-formula4"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM4"><mml:msub><mml:mi>Z</mml:mi><mml:mn>0</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:msub><mml:mi>X</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">cls</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>;</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">pch</mml:mi></mml:mrow></mml:mrow><mml:mn>1</mml:mn></mml:msubsup><mml:mi>E</mml:mi><mml:mo>;</mml:mo><mml:mo>&#x2026;</mml:mo><mml:mo>;</mml:mo><mml:msubsup><mml:mi>X</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">pch</mml:mi></mml:mrow></mml:mrow><mml:mi>N</mml:mi></mml:msubsup><mml:mi>E</mml:mi><mml:mo>]</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>E</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">pos</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>.</mml:mo></mml:math></disp-formula>The resulting sequence is passed through several encoder layers, each containing a standard MHSA block followed by a two-layer feed-forward multi-layer perceptron (MLP) with GELU activation and residual connections.
The MHSA computes attention as:<disp-formula id="disp-formula5"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM5"><mml:mi>A</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mi>Q</mml:mi><mml:mo>,</mml:mo><mml:mi>K</mml:mi><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:mo stretchy="false">)</mml:mo><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">Softmax</mml:mi></mml:mrow><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mi>Q</mml:mi><mml:msup><mml:mi>K</mml:mi><mml:mi>T</mml:mi></mml:msup></mml:mrow><mml:msqrt><mml:mi>D</mml:mi></mml:msqrt></mml:mfrac></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mi>V</mml:mi><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM45"><mml:mi>Q</mml:mi></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM46"><mml:mi>K</mml:mi></mml:math></inline-formula>, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM47"><mml:mi>V</mml:mi></mml:math></inline-formula> are the learned query, key, and value projections. The outputs from all heads are concatenated and linearly projected to produce the final representation. For additional information on the MHSA formulation, see (<xref ref-type="bibr" rid="B44">44</xref>). Finally, the <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM48"><mml:mi>V</mml:mi><mml:msub><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula> features from the MLP head are used for FLF in <xref ref-type="disp-formula" rid="disp-formula6">Equation 1</xref>, while the classification decision is used for DLF.</p>
</sec>
</sec>
<sec id="s3b"><label>3.2</label><title>Feature-level fusion (FLF)</title>
<p>We employ the point-wise addition of the extracted features (e.g., <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM49"><mml:mi>D</mml:mi><mml:msub><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM50"><mml:mi>V</mml:mi><mml:msub><mml:mi>g</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula>, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM51"><mml:mi>V</mml:mi><mml:msub><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula>) from the previously mentioned base models in our proposed ensemble model for feature-level fusion, which can be performed as follows:<disp-formula id="disp-formula6"><label>(1)</label><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="DM1"><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">fused</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">add</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mo stretchy="false">[</mml:mo><mml:mi>D</mml:mi><mml:msub><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:msub><mml:mi>g</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:mi>V</mml:mi><mml:msub><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:msub><mml:mo stretchy="false">]</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:math></disp-formula>where <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM52"><mml:msub><mml:mi>P</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">add</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> denotes point-wise addition of the feature vectors. 
Prior to fusion, each feature vector is normalized to ensure comparable scale and distribution across the CNN and ViT models. After that, we employ a FC layer with 512 dimensions to enable the model to learn appropriate weighting and alignment of the fused features during training. Finally, a Softmax layer produces the output probabilities for skin disease classification in an end-to-end manner:<disp-formula id="disp-formula7"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM6"><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">class</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">Softmax</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi mathvariant="normal">FC</mml:mi></mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>F</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">fused</mml:mi></mml:mrow></mml:mrow></mml:msub><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">)</mml:mo><mml:mo>.</mml:mo></mml:math></disp-formula></p>
</sec><sec id="s3c"><label>3.3</label><title>Decision-level fusion (DLF)</title>
<p>Decision-level fusion (DLF) combines the decisions of all the classifiers instead of relying on only a single model. In our framework, we adopt the majority voting strategy, which counts the votes received from each classifier. The class with the most votes is chosen as the consensus decision, and the overall procedure can be outlined as follows:<disp-formula id="disp-formula8"><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM7"><mml:mtable columnalign="right left" rowspacing=".5em" columnspacing="thickmathspace" displaystyle="true"><mml:mtr><mml:mtd><mml:msub><mml:mi>P</mml:mi><mml:mi>A</mml:mi></mml:msub></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mi>D</mml:mi><mml:msub><mml:mi>e</mml:mi><mml:mi>A</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>P</mml:mi><mml:mi>B</mml:mi></mml:msub></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mi>D</mml:mi><mml:msub><mml:mi>e</mml:mi><mml:mi>B</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>P</mml:mi><mml:mi>C</mml:mi></mml:msub></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mi>D</mml:mi><mml:msub><mml:mi>e</mml:mi><mml:mi>C</mml:mi></mml:msub><mml:mo stretchy="false">(</mml:mo><mml:mi>X</mml:mi><mml:mo stretchy="false">)</mml:mo></mml:mtd></mml:mtr><mml:mtr><mml:mtd><mml:msub><mml:mi>Y</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">class</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:mtd><mml:mtd><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>V</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>A</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>B</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>P</mml:mi><mml:mi>C</mml:mi></mml:msub><mml:mo
stretchy="false">)</mml:mo><mml:mo>,</mml:mo></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>where, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM53"><mml:mi>X</mml:mi></mml:math></inline-formula> represents the input image, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM54"><mml:msub><mml:mi>P</mml:mi><mml:mi>A</mml:mi></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM55"><mml:msub><mml:mi>P</mml:mi><mml:mi>B</mml:mi></mml:msub></mml:math></inline-formula>, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM56"><mml:msub><mml:mi>P</mml:mi><mml:mi>C</mml:mi></mml:msub></mml:math></inline-formula> represent the prediction classes using the modules <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM57"><mml:mi>A</mml:mi></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM58"><mml:mi>B</mml:mi></mml:math></inline-formula>, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM59"><mml:mi>C</mml:mi></mml:math></inline-formula>, respectively, while <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM60"><mml:mi>M</mml:mi><mml:mi>V</mml:mi><mml:mo stretchy="false">(</mml:mo><mml:mo>&#x22C5;</mml:mo><mml:mo stretchy="false">)</mml:mo></mml:math></inline-formula> denotes majority voting. The overview for the DLF portion is shown in <xref ref-type="fig" rid="F1">Figure&#x00A0;1</xref>.</p>
</sec>
</sec>
<sec id="s4"><label>4</label><title>Datasets and evaluation metrics</title>
<p>To demonstrate the proposed framework for diagnosing skin diseases, experiments were conducted on four publicly available benchmark datasets.</p>
<sec id="s4a"><label>4.1</label><title>Datasets</title>
<p><italic>PH2 dataset</italic> (<xref ref-type="bibr" rid="B61">61</xref>) is a dataset with three skin disease classes: Atypical Nevus (AN), Common Nevus (CN), and Melanoma (MEL) captured from the Dermatology Service of the Hospital Pedro Hispano, Matosinhos, Portugal. It comprises 200 images that were captured under identical conditions and instrumentation resolution. We followed the K-fold cross-validation technique to ensure a robust and unbiased evaluation of our proposed method. Specifically, we used K = 5, dividing the dataset into five equal parts. In each iteration, four folds were used for training and the remaining one for testing, with the test fold rotating across the five runs. Finally, the results were averaged across all five folds. The benchmark dataset used in this evaluation is denoted as PH2 in the experimental discussions.</p>
<p><italic>HAM10000</italic> (<xref ref-type="bibr" rid="B62">62</xref>) is a training subset of the ISIC 2018 challenge dataset, including 10,015 training dermatoscopic image samples. The dataset includes images of seven types of skin disease: Actinic keratosis (AKIEC), Basal cell carcinoma (BCC), Benign keratosis (BKL), Dermatofibroma (DF), Melanocytic nevi (NV), Melanoma (MEL), and Vascular lesions (VASC). The data were captured over 20 years from Australia and Austria from 54.0&#x0025; male and 45.0&#x0025; female participants. An example sample image for each class is shown in <xref ref-type="fig" rid="F2">Figure&#x00A0;2</xref>. Initially, the HAM10000 dataset was released only as a training set, with no corresponding official test labels provided. Consequently, numerous state-of-the-art studies adopted a common practice of splitting the HAM10000 dataset (i.e., 80:20 ratio) into training and testing subsets for performance evaluation. Following this widely used approach, we similarly conducted experiments for a fair comparison with prior works. The benchmark dataset is denoted by HAM10000 in the experiment discussions.</p>
<fig id="F2" position="float"><label>Figure 2</label>
<caption><p>Example images for each skin disease from the HAM10000 dataset, where AKIEC, actinic keratosis; BCC, basal cell carcinoma; BKL, benign keratosis; DF, dermatofibroma; NV, melanocytic nevi; MEL, melanoma; VASC, vascular lesions. <bold>(a)</bold> AKIEC. <bold>(b)</bold> BCC. <bold>(c)</bold> BKL. <bold>(d)</bold> DF. <bold>(e)</bold> NV. <bold>(f)</bold> MEL. <bold>(g)</bold> VASC.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g002.tif"><alt-text content-type="machine-generated">Seven images display various skin conditions labeled as follows: a) AKIEC with reddish-brown lesion, b) BCC with small pink area, c) BKL with light and dark brown patches, d) DF with a dark spot on pink skin, e) NV with reddish-brown mark, f) MEL with a dark brown irregular shape, g) VASC with a red spot surrounding a darker core.</alt-text>
</graphic>
</fig>
<p><italic>ISIC 2018</italic><xref ref-type="fn" rid="FN0001"><sup>1</sup></xref> is an official test dataset released by the ISIC 2018 challenge organizers, consisting of 1,512 dermatoscopic images covering the same seven classes: AKIEC, BCC, BKL, DF, NV, MEL, and VASC. Unlike HAM10000, which was originally provided solely as a training set, the ISIC 2018 dataset includes ground-truth labels for the test samples, enabling independent evaluation of model performance. Following the official ISIC 2018 challenge protocol, the HAM10000 dataset is used for training, and the ISIC 2018 dataset serves as the independent test set. This setup provides a robust assessment of the model&#x2019;s generalizability to unseen data beyond the HAM10000 distribution. The benchmark dataset is denoted by ISIC 2018 in the experiment discussions.</p>
<p><italic>ISIC 2019</italic><xref ref-type="fn" rid="FN0002"><sup>2</sup></xref> is a further challenge training dataset that comprises two datasets, namely HAM10000 and BCN&#x005F;20000. It includes a total of 25,331 images. The dataset covers eight different skin disease categories, which are AKIEC, BCC, BKL, DF, MEL, NV, Squamous cell carcinoma (SCC), and VASC. For a fair comparison with the existing approaches, we followed the same protocol, in which the dataset was randomly divided into 90&#x0025; for training and the remaining 10&#x0025; for testing.</p>
</sec>
<sec id="s4b"><label>4.2</label><title>Data augmentation</title>
<p>The deep learning-based approaches require large-scale training data to enhance performance and mitigate the risk of overfitting. A common strategy to address this challenge involves artificially augmenting the training samples to allow the models to gain a deeper understanding and insight. Typically, there are two types of data augmentation&#x2014;offline and online data augmentation&#x2014;for computer vision (<xref ref-type="bibr" rid="B63">63</xref>, <xref ref-type="bibr" rid="B64">64</xref>). Offline (pre-training) data augmentation involves the a priori application of image transformations to the training set. This process generates augmented images, which are then stored alongside their original counterparts within the dataset. During model training, both the original and augmented data are utilized. In contrast, online (real-time) data augmentation entails the application of image transformations on a per-batch basis during the training process. These transformations effectively generate variations of the original training images, which are subsequently fed into the model for training. Common online augmentation techniques encompass rotation, resizing, horizontal and vertical flipping, and cropping.</p>
<p>To augment the training set, we employ an offline (pre-training) strategy that leverages a generative adversarial network (GAN) (<xref ref-type="bibr" rid="B65">65</xref>) for data augmentation. Additionally, we incorporate images from the ISIC archive<xref ref-type="fn" rid="FN0003"><sup>3</sup></xref> to increase the training data volume. Furthermore, we employ rotation, resizing, and cropping as online data augmentation. The class-wise distribution of the sample skin disease images before and after augmentation is presented in <xref ref-type="fig" rid="F3">Figure&#x00A0;3</xref>.</p>
<fig id="F3" position="float"><label>Figure 3</label>
<caption><p>The distribution of the training samples for each class in the different datasets before and after data augmentation, using a generative adversarial network (GAN) and with ISIC archives. <bold>(a)</bold> PH2. <bold>(b)</bold> HAM10000. <bold>(c)</bold> ISIC 2018. <bold>(d)</bold> ISIC 2019.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g003.tif"><alt-text content-type="machine-generated">Four bar charts labeled a through d compare the number of images across different datasets and conditions. Each chart represents various skin conditions with colors indicating data sources: PH2, HAM10000, ISIC 2018, ISIC 2019, GAN, Augmentation, ISIC Archive, and Total. Charts display data for conditions like AN, CN, MEL, AKIEC, BCC, BKL, DF, NV, SCC, and VASC, with a range from a few hundred to over ten thousand images. Each dataset shows varying contributions across conditions, with total numbers depicted prominently in red.</alt-text>
</graphic>
</fig>
<p>To assess the quality of the GAN-generated sample images, we computed the Fr&#x00E9;chet Inception Distance (FID) (<xref ref-type="bibr" rid="B66">66</xref>) scores across datasets and classes. The generated images achieved average FID scores of 82.9 for PH2, 37.8 for HAM10000, and 36.5 for ISIC 2019. Lower FID values indicate a higher similarity between the generated and real images, suggesting that the generated samples are visually realistic and diverse overall. The detailed class-wise FID scores are summarized in <xref ref-type="table" rid="T1">Table&#x00A0;1</xref>, and representative examples of the generated images are shown in <xref ref-type="fig" rid="F4">Figures&#x00A0;4</xref>, <xref ref-type="fig" rid="F5">5</xref> for the PH2 and HAM10000 datasets, respectively.</p>
<table-wrap id="T1" position="float"><label>Table 1</label>
<caption><p>Fr&#x00E9;chet Inception Distance (FID) scores for sample images generated by the generative adversarial network (GAN) across datasets and disease classes.</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Disease name</th>
<th valign="top" align="center">PH2</th>
<th valign="top" align="center">HAM10000</th>
<th valign="top" align="center">ISIC 2018</th>
<th valign="top" align="center">ISIC 2019</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">AKIEC</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">40.5</td>
<td valign="top" align="center">40.5</td>
<td valign="top" align="center">38.8</td>
</tr>
<tr>
<td valign="top" align="left">BCC</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">38.3</td>
<td valign="top" align="center">38.3</td>
<td valign="top" align="center">36.9</td>
</tr>
<tr>
<td valign="top" align="left">BKL</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">29.0</td>
<td valign="top" align="center">29.0</td>
<td valign="top" align="center">27.8</td>
</tr>
<tr>
<td valign="top" align="left">DF</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">40.7</td>
<td valign="top" align="center">40.7</td>
<td valign="top" align="center">39.1</td>
</tr>
<tr>
<td valign="top" align="left">NV</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">29.4</td>
<td valign="top" align="center">29.4</td>
<td valign="top" align="center">27.2</td>
</tr>
<tr>
<td valign="top" align="left">VASC</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">50.5</td>
<td valign="top" align="center">50.5</td>
<td valign="top" align="center">44.1</td>
</tr>
<tr>
<td valign="top" align="left">SCC</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">42.4</td>
</tr>
<tr>
<td valign="top" align="left">AN</td>
<td valign="top" align="center">80.6</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">CN</td>
<td valign="top" align="center">83.2</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">MEL</td>
<td valign="top" align="center">84.8</td>
<td valign="top" align="center">35.0</td>
<td valign="top" align="center">35.0</td>
<td valign="top" align="center">35.4</td>
</tr>
<tr>
<td valign="top" align="left"><bold>Average</bold></td>
<td valign="top" align="center"><bold>82.9</bold></td>
<td valign="top" align="center"><bold>37.8</bold></td>
<td valign="top" align="center"><bold>37.8</bold></td>
<td valign="top" align="center"><bold>36.5</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-fn1"><p>A lower FID score indicates higher similarity to real images. A &#x201C;&#x2013;&#x201D; denotes the absence of a particular disease class in the respective dataset.</p></fn>
</table-wrap-foot>
</table-wrap>
<fig id="F4" position="float"><label>Figure 4</label>
<caption><p>Example of images generated by the generative adversarial network (GAN) for the PH2 dataset. <bold>(a)</bold> Atypical Nevus (AN). <bold>(b)</bold> Common Nevus (CN). <bold>(c)</bold> Melanoma (MEL).</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g004.tif"><alt-text content-type="machine-generated">Three close-up images of skin conditions: a) Atypical Nevus - an irregular, darker brown lesion; b) Common Nevus - a smaller, evenly colored brown mole; c) Melanoma - a dark, irregularly shaped lesion.</alt-text>
</graphic>
</fig>
<fig id="F5" position="float"><label>Figure 5</label>
<caption><p>Example of images generated by the generative adversarial network (GAN) for the HAM10000 dataset, where AKIEC, actinic keratosis; BCC, basal cell carcinoma; BKL, benign keratosis; DF, dermatofibroma; NV, melanocytic nevi; MEL, melanoma; VASC, vascular lesions. <bold>(a)</bold> AKIEC. <bold>(b)</bold> BCC. <bold>(c)</bold> BKL. <bold>(d)</bold> DF. <bold>(e)</bold> NV. <bold>(f)</bold> MEL. <bold>(g)</bold> VASC.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g005.tif"><alt-text content-type="machine-generated">Seven close-up images of various skin lesions, labeled as follows: 5a depicts AKIEC, showing a reddish, irregular area; 5b shows BCC with a translucent, raised lesion; 5c presents BKL, a brown, pigmented patch; 5d displays DF, a pinkish, slightly raised spot; 5e shows NV with a brown, symmetrical mole; 5f depicts MEL, a dark, irregular lesion; 5g features VASC with a red, round mark.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s4c"><label>4.3</label><title>Evaluation metrics</title>
<p>We evaluate the effectiveness of our proposed framework using different evaluation criteria: Accuracy, Precision, Recall, F1-score, Balanced accuracy, ROC (Receiver Operating Characteristic), and AUC (Area Under the Curve) (<xref ref-type="bibr" rid="B67">67</xref>). These evaluation metrics are calculated from the four key parameters of the confusion matrix, i.e., True Positives (TP), True Negatives (TN), False Negatives (FN), and False Positives (FP). TP refers to the number of instances correctly predicted as a positive class, and TN refers to the number of instances correctly predicted by the model as belonging to the negative class. On the other hand, FP is the number of instances where the model incorrectly predicts the positive class, and FN is the number of instances where the model incorrectly predicts the negative class.</p>
<p>In addition, we consider ROC, which visualizes the trade-off between True Positive Rate (TPR) and False Positive Rate (FPR) across classification thresholds, while AUC quantifies the model&#x2019;s overall performance, with higher values indicating better discrimination between classes, where 1.0 represents perfect classification and 0.5 indicates random guessing. <italic>Accuracy</italic> is the ratio of correct predictions made by the model out of the total number of predictions, and can be calculated as follows:<disp-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM8"><mml:mrow><mml:mi>A</mml:mi><mml:mi>c</mml:mi><mml:mi>c</mml:mi><mml:mi>u</mml:mi><mml:mi>r</mml:mi><mml:mi>a</mml:mi><mml:mi>c</mml:mi><mml:mi>y</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo><mml:mrow><mml:mi mathvariant="normal">TP</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">TN</mml:mi></mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo><mml:mrow><mml:mi mathvariant="normal">TP</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">TN</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">FP</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">FN</mml:mi></mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><italic>Precision</italic> measures the proportion of true positive predictions out of all positive predictions made by the model and is calculated as:<disp-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" 
id="UDM9"><mml:mrow><mml:mi>P</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>i</mml:mi><mml:mi>s</mml:mi><mml:mi>i</mml:mi><mml:mi>o</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo><mml:mrow><mml:mi mathvariant="normal">TP</mml:mi></mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo><mml:mrow><mml:mi mathvariant="normal">TP</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">FP</mml:mi></mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><italic>Recall</italic>, also known as the True Positive Rate (TPR), measures the proportion of true positive predictions out of all actual positive instances in the experiment, which can be calculated as:<disp-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM10"><mml:mrow><mml:mi>R</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi><mml:mi>l</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mi mathvariant="normal">TPR</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo><mml:mrow><mml:mi mathvariant="normal">TP</mml:mi></mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo><mml:mrow><mml:mi mathvariant="normal">TP</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">FN</mml:mi></mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><italic>False Positive Rate</italic> (FPR) measures the proportion of false positive predictions out of all actual negative instances, 
calculated as:<disp-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM11"><mml:mrow><mml:mi>F</mml:mi><mml:mi>P</mml:mi><mml:mi>R</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo><mml:mrow><mml:mi mathvariant="normal">FP</mml:mi></mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo></mml:mrow><mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo><mml:mrow><mml:mi mathvariant="normal">FP</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">TN</mml:mi></mml:mrow><mml:mo fence="false" stretchy="false">|</mml:mo></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><italic>F1-score</italic> is the harmonic mean of precision and recall; it strikes a balance between precision and recall, making it an effective metric for assessing both false positives and false negatives. The F1-score is calculated as follows:<disp-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM12"><mml:mrow><mml:mi>F</mml:mi><mml:mn>1</mml:mn><mml:mo>&#x2212;</mml:mo><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mfrac><mml:mrow><mml:mn>2</mml:mn><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi mathvariant="normal">Precision</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:mi mathvariant="normal">Recall</mml:mi></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mrow><mml:mrow><mml:mi mathvariant="normal">Precision</mml:mi></mml:mrow><mml:mo>+</mml:mo><mml:mrow><mml:mi mathvariant="normal">Recall</mml:mi></mml:mrow></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><italic>Balanced accuracy (BACC)</italic> is an evaluation metric used to evaluate the accuracy of a 
classification model when dealing with imbalanced datasets. It is defined as the average recall for each class.<disp-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="UDM13"><mml:mrow><mml:mi>B</mml:mi><mml:mi>A</mml:mi><mml:mi>C</mml:mi><mml:mi>C</mml:mi></mml:mrow><mml:mo>=</mml:mo><mml:mrow><mml:mfrac><mml:mn>1</mml:mn><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">classes</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:mfrac></mml:mrow><mml:munderover><mml:mo>&#x2211;</mml:mo><mml:mi>i</mml:mi><mml:mrow><mml:msub><mml:mi>N</mml:mi><mml:mrow><mml:mrow><mml:mi mathvariant="normal">classes</mml:mi></mml:mrow></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:msub><mml:mrow><mml:mi mathvariant="normal">Recall</mml:mi></mml:mrow><mml:mi>i</mml:mi></mml:msub></mml:math></disp-formula></p>
</sec>
</sec>
<sec id="s5"><label>5</label><title>Experiments and results</title>
<p>In this section, we will introduce the system implementation and performance of the proposed framework.</p>
<sec id="s5a"><label>5.1</label><title>Implementation details</title>
<p>The proposed framework was implemented by leveraging the TensorFlow library on an NVIDIA GeForce RTX 3090 GPU. The AdamW optimizer with a learning rate of <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM61"><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>4</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula>, a weight decay of <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM62"><mml:mn>4</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>3</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> and an epsilon of <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM63"><mml:mn>1</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:msup><mml:mn>10</mml:mn><mml:mrow><mml:mo>&#x2212;</mml:mo><mml:mn>7</mml:mn></mml:mrow></mml:msup></mml:math></inline-formula> were used to optimize our proposed framework. Additionally, categorical cross-entropy loss is used as a loss function. We employed 150 epochs with a mini-batch size of 8 to train our end-to-end FLF and DLF framework. The learning rate (LR) was reset to 1e-5 after 50 epochs and again reset to 1e-6 after 100 epochs. 
Moreover, the dimension <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM64"><mml:mi>d</mml:mi></mml:math></inline-formula> was set to 768 for the <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM65"><mml:mi>D</mml:mi><mml:msub><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula>, <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM66"><mml:mi>V</mml:mi><mml:msub><mml:mi>g</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula>, and <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM67"><mml:mi>V</mml:mi><mml:msub><mml:mi>i</mml:mi><mml:mi>d</mml:mi></mml:msub></mml:math></inline-formula> in <xref ref-type="disp-formula" rid="disp-formula6">Equation 1</xref>.</p>
</sec>
<sec id="s5b"><label>5.2</label><title>Comparison with SOTA methods</title>
<sec id="s5b1"><label>5.2.1</label><title>Evaluation of PH2 dataset</title>
<p>The accuracy, precision, recall, and F1-score on the PH2 dataset are presented in <xref ref-type="table" rid="T2">Table&#x00A0;2</xref> and <xref ref-type="sec" rid="s13">Supplementary Figure S1</xref>. Compared with the CNN-based method for feature extraction and then feeding these features into the ML-based classifier (<xref ref-type="bibr" rid="B68">68</xref>, <xref ref-type="bibr" rid="B70">70</xref>, <xref ref-type="bibr" rid="B72">72</xref>), the custom CNN-based model (<xref ref-type="bibr" rid="B71">71</xref>), and CNN-based End-to-End models (<xref ref-type="bibr" rid="B68">68</xref>, <xref ref-type="bibr" rid="B69">69</xref>), our proposed feature-level fusion (FLF) and decision-level fusion (DLF) ensemble framework achieved accuracies of 99.3&#x0025; and 99.2&#x0025; for FLF and DLF, respectively. Moreover, we observe that the proposed FLF approach achieves performance comparable to the best existing method reported by Maniraj et al. (<xref ref-type="bibr" rid="B69">69</xref>), while also providing consistently high precision, recall, and F1-scores.</p>
<table-wrap id="T2" position="float"><label>Table 2</label>
<caption><p>Comparison of the proposed framework with existing methods applied to the PH2 dataset.</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Reference</th>
<th valign="top" align="center">Method</th>
<th valign="top" align="center">Accuracy</th>
<th valign="top" align="center">Precision</th>
<th valign="top" align="center">Recall</th>
<th valign="top" align="center">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Benyahia et al. (<xref ref-type="bibr" rid="B68">68</xref>)</td>
<td valign="top" align="left">DenseNet+SVM</td>
<td valign="top" align="center">99.0</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Maniraj et al. (<xref ref-type="bibr" rid="B69">69</xref>)</td>
<td valign="top" align="left">VGG</td>
<td valign="top" align="center"><bold>99.3</bold></td>
<td valign="top" align="center"><italic>99.2</italic></td>
<td valign="top" align="center"><bold>99.4</bold></td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Elashiri et al. (<xref ref-type="bibr" rid="B52">52</xref>)</td>
<td valign="top" align="left">ResNet50+VGG16+DeepLabv3</td>
<td valign="top" align="center">93.5</td>
<td valign="top" align="center">90.4</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Afza et al. (<xref ref-type="bibr" rid="B70">70</xref>)</td>
<td valign="top" align="left">ResNet50+NB</td>
<td valign="top" align="center">95.4</td>
<td valign="top" align="center">95.3</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">95.2</td>
</tr>
<tr>
<td valign="top" align="left">Reddy et al. (<xref ref-type="bibr" rid="B71">71</xref>)</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">96.2</td>
<td valign="top" align="center">91.8</td>
<td valign="top" align="center">93.9</td>
</tr>
<tr>
<td valign="top" align="left">Maqsood et al. (<xref ref-type="bibr" rid="B72">72</xref>)</td>
<td valign="top" align="left">Xception+ResNet50+ResNet101+VGG16+SVM</td>
<td valign="top" align="center">98.9</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Mustafa et al. (<xref ref-type="bibr" rid="B73">73</xref>)</td>
<td valign="top" align="left">ResUNet+AlexNet</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Our</td>
<td valign="top" align="left">FLF (DenseNet201+VGG19+ViT)</td>
<td valign="top" align="center"><bold>99.3</bold></td>
<td valign="top" align="center"><bold>99.3</bold></td>
<td valign="top" align="center"><italic>99.3</italic></td>
<td valign="top" align="center"><bold>99.3</bold></td>
</tr>
<tr>
<td valign="top" align="left">Our</td>
<td valign="top" align="left">DLF (DenseNet201+VGG19+ViT)</td>
<td valign="top" align="center"><italic>99.2</italic></td>
<td valign="top" align="center"><italic>99.2</italic></td>
<td valign="top" align="center">99.2</td>
<td valign="top" align="center">99.2</td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-fn2"><p>Bold and italic values indicate the best and second-best benchmarks, respectively.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
<sec id="s5b2"><label>5.2.2</label><title>Evaluation of the HAM10000 dataset</title>
<p>The accuracy, precision, recall, and F1-score on the HAM10000 dataset are presented in <xref ref-type="table" rid="T3">Table&#x00A0;3</xref> and <xref ref-type="sec" rid="s13">Supplementary Figure S1</xref>, along with the ROC curves and corresponding AUC values in <xref ref-type="fig" rid="F7">Figure&#x00A0;7</xref>. The confusion matrices of our proposed approaches are shown in <xref ref-type="fig" rid="F6">Figure&#x00A0;6</xref>. Compared with existing well-established models including the pre-trained CNN-based models ResNet50 (<xref ref-type="bibr" rid="B75">75</xref>), EfficientNetB4 (<xref ref-type="bibr" rid="B77">77</xref>), EfficientNetB1 (<xref ref-type="bibr" rid="B80">80</xref>) and Xception (<xref ref-type="bibr" rid="B97">97</xref>), custom CNN (<xref ref-type="bibr" rid="B78">78</xref>, <xref ref-type="bibr" rid="B79">79</xref>, <xref ref-type="bibr" rid="B82">82</xref>), and studies with attention-based or combined with a CNN-based approach (<xref ref-type="bibr" rid="B49">49</xref>, <xref ref-type="bibr" rid="B76">76</xref>, <xref ref-type="bibr" rid="B81">81</xref>), our proposed end-to-end FLF achieves 92.7&#x0025;, 93.5&#x0025;, 92.6&#x0025;, and 92.8&#x0025; accuracy, precision, recall, and F1-score, respectively. On the other hand, our DLF achieves 96.1&#x0025;, 96.2&#x0025;, 96.1&#x0025;, and 96.1&#x0025; accuracy, precision, recall, and F1-score, respectively. Notably, we also compared our approach with the recent CNN along with ViT-based hybrid method proposed in (<xref ref-type="bibr" rid="B83">83</xref>), which achieved 95.0&#x0025; accuracy, 94.7&#x0025; precision, 92.1&#x0025; recall, and 93.3&#x0025; F1-score.</p>
<table-wrap id="T3" position="float"><label>Table 3</label>
<caption><p>Comparison of the proposed framework with existing methods applied to the HAM10000 dataset.</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Reference</th>
<th valign="top" align="center">Method</th>
<th valign="top" align="center">Accuracy</th>
<th valign="top" align="center">Precision</th>
<th valign="top" align="center">Recall</th>
<th valign="top" align="center">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Liu et al. (<xref ref-type="bibr" rid="B74">74</xref>)</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">92.5</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">71.5</td>
<td valign="top" align="center">60.7</td>
</tr>
<tr>
<td valign="top" align="left">Al et al. (<xref ref-type="bibr" rid="B75">75</xref>)</td>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center">89.3</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">81.0</td>
<td valign="top" align="center">81.3</td>
</tr>
<tr>
<td valign="top" align="left">Nie et al. (<xref ref-type="bibr" rid="B49">49</xref>)</td>
<td valign="top" align="left">CNN+Attention</td>
<td valign="top" align="center">89.5</td>
<td valign="top" align="center">89.6</td>
<td valign="top" align="center">89.5</td>
<td valign="top" align="center">89.1</td>
</tr>
<tr>
<td valign="top" align="left">Cai et al. (<xref ref-type="bibr" rid="B76">76</xref>)</td>
<td valign="top" align="left">Attention</td>
<td valign="top" align="center">93.9</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">90.1</td>
<td valign="top" align="center">90.1</td>
</tr>
<tr>
<td valign="top" align="left">Ali et al. (<xref ref-type="bibr" rid="B77">77</xref>)</td>
<td valign="top" align="left">EfficientNetB4</td>
<td valign="top" align="center">87.9</td>
<td valign="top" align="center">88.0</td>
<td valign="top" align="center">88.0</td>
<td valign="top" align="center">87.0</td>
</tr>
<tr>
<td valign="top" align="left">Shetty et al. (<xref ref-type="bibr" rid="B78">78</xref>)</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">95.2</td>
<td valign="top" align="center">88.0</td>
<td valign="top" align="center">85.0</td>
<td valign="top" align="center">86.0</td>
</tr>
<tr>
<td valign="top" align="left">Wu et al. (<xref ref-type="bibr" rid="B79">79</xref>)</td>
<td valign="top" align="left">ResNet50</td>
<td valign="top" align="center"><italic>95.8</italic></td>
<td valign="top" align="center"><italic>96.0</italic></td>
<td valign="top" align="center"><italic>96.0</italic></td>
<td valign="top" align="center"><italic>96.0</italic></td>
</tr>
<tr>
<td valign="top" align="left">Tajerian et al. (<xref ref-type="bibr" rid="B80">80</xref>)</td>
<td valign="top" align="left">EfficientNetB1</td>
<td valign="top" align="center">84.3</td>
<td valign="top" align="center">73.4</td>
<td valign="top" align="center">67.4</td>
<td valign="top" align="center">70.0</td>
</tr>
<tr>
<td valign="top" align="left">You et al. (<xref ref-type="bibr" rid="B81">81</xref>)</td>
<td valign="top" align="left">Attention+CNN</td>
<td valign="top" align="center">80.4</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Wei et al. (<xref ref-type="bibr" rid="B82">82</xref>)</td>
<td valign="top" align="left">DenseNet+ConvNeXt</td>
<td valign="top" align="center">90.9</td>
<td valign="top" align="center">83.8</td>
<td valign="top" align="center">83.8</td>
<td valign="top" align="center">83.5</td>
</tr>
<tr>
<td valign="top" align="left">Mustafa et al. (<xref ref-type="bibr" rid="B73">73</xref>)</td>
<td valign="top" align="left">ResUNet+AlexNet</td>
<td valign="top" align="center">92.0</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Pacal et al. (<xref ref-type="bibr" rid="B83">83</xref>)</td>
<td valign="top" align="left">CNN + ViT</td>
<td valign="top" align="center">95.0</td>
<td valign="top" align="center">94.7</td>
<td valign="top" align="center">92.1</td>
<td valign="top" align="center">93.3</td>
</tr>
<tr>
<td valign="top" align="left">Our</td>
<td valign="top" align="left">FLF (DenseNet201+VGG19+ViT)</td>
<td valign="top" align="center">92.7</td>
<td valign="top" align="center">93.5</td>
<td valign="top" align="center">92.6</td>
<td valign="top" align="center">92.8</td>
</tr>
<tr>
<td valign="top" align="left">Our</td>
<td valign="top" align="left">DLF (DenseNet201+VGG19+ViT)</td>
<td valign="top" align="center"><bold>96.1</bold></td>
<td valign="top" align="center"><bold>96.2</bold></td>
<td valign="top" align="center"><bold>96.1</bold></td>
<td valign="top" align="center"><bold>96.1</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-fn3"><p>Bold and italic values indicate the best and second-best benchmarks, respectively.</p></fn>
</table-wrap-foot>
</table-wrap>
<fig id="F6" position="float"><label>Figure 6</label>
<caption><p>Normalized confusion matrices of FLF and DLF approaches on the HAM10000 dataset. AKIEC, actinic keratosis; BCC, basal cell carcinoma; BKL, benign keratosis; DF, dermatofibroma; NV, melanocytic nevi; MEL, melanoma; VASC, vascular lesions. <bold>(a)</bold> Feature-level fusion. <bold>(b)</bold> Decision-level fusion.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g006.tif"><alt-text content-type="machine-generated">Two confusion matrices compare classification performance. The left matrix, labeled "Feature-level fusion," shows accuracy for classes AKIEC to VASC. Notable values include AKIEC at 0.68 and DF at 1.00. The right matrix, labeled "Decision-level fusion," shows improved accuracy for AKIEC at 0.86, BCC at 1.00, and DF at 1.00. A color scale from light to dark blue represents accuracy from 0.0 to 1.0.</alt-text>
</graphic>
</fig>
<fig id="F7" position="float"><label>Figure 7</label>
<caption><p>ROC curves for the FLF and DLF approaches on the HAM10000 dataset, with corresponding AUC values included in the legend. <bold>(a)</bold> Feature-level fusion. <bold>(b)</bold> Decision-level fusion.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g007.tif"><alt-text content-type="machine-generated">Two side-by-side Receiver Operating Characteristic (ROC) curves compare feature-level and decision-level fusion. The left graph, titled "Feature-Level Fusion," shows high true positive rates with AUC values: AKIEC 0.99, BCC 1.00, BKL 0.99, DF 1.00, MEL 0.99, NV 0.99, VASC 1.00. The right graph, titled "Decision-Level Fusion," presents slightly varying AUC values: AKIEC 0.98, BCC 1.00, BKL 0.99, DF 1.00, MEL 0.99, NV 0.99, VASC 1.00. Both charts demonstrate models with excellent performance, as indicated by curves close to the top-left corner. The black dashed line represents random performance.</alt-text>
</graphic>
</fig>
<p>Furthermore, our proposed DLF framework surpasses the best-performing existing benchmarks by 0.3&#x0025;, 0.2&#x0025;, 0.1&#x0025;, and 0.1&#x0025; in terms of accuracy, precision, recall, and F1-score, respectively. To further validate the robustness of our method, we performed a bootstrap analysis with 1,000 iterations to compute 95&#x0025; confidence intervals (CIs) for the key performance metrics. The results are summarized in <xref ref-type="table" rid="T4">Table&#x00A0;4</xref>. Our DLF achieved an accuracy of 96.1&#x0025; [95&#x0025; CI: 95.2&#x0025;, 96.9&#x0025;], while the second-best method achieved an accuracy of 95.8&#x0025;. Notably, the lower bound of our method&#x2019;s CI (95.2&#x0025;) is close to the mean accuracy of the second-best method, indicating a consistent&#x2014;though modest&#x2014;improvement. Similarly, the precision, recall, and F1-score exhibit tight confidence intervals, reflecting stable and reliable performance across multiple resamples. These findings statistically reinforce that our method offers a robust and consistent improvement over the existing benchmarks, with reduced variability in performance.</p>
<table-wrap id="T4" position="float"><label>Table 4</label>
<caption><p>Bootstrap results with 95&#x0025; confidence intervals (CI) for the decision-level fusion (DLF) method on the HAM10000 dataset for 1,000 iterations.</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Metric</th>
<th valign="top" align="center">Mean</th>
<th valign="top" align="center">CI lower</th>
<th valign="top" align="center">CI upper</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Accuracy [&#x0025;]</td>
<td valign="top" align="center">96.1</td>
<td valign="top" align="center">95.2</td>
<td valign="top" align="center">96.9</td>
</tr>
<tr>
<td valign="top" align="left">Precision [&#x0025;]</td>
<td valign="top" align="center">96.2</td>
<td valign="top" align="center">95.4</td>
<td valign="top" align="center">97.0</td>
</tr>
<tr>
<td valign="top" align="left">Recall [&#x0025;]</td>
<td valign="top" align="center">96.1</td>
<td valign="top" align="center">95.2</td>
<td valign="top" align="center">96.9</td>
</tr>
<tr>
<td valign="top" align="left">F1-score [&#x0025;]</td>
<td valign="top" align="center">96.1</td>
<td valign="top" align="center">95.2</td>
<td valign="top" align="center">96.9</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s5b3"><label>5.2.3</label><title>Evaluation on the ISIC 2018 dataset</title>
<p>The balanced accuracy, specificity, recall, and F1-score on the ISIC 2018 dataset are presented in <xref ref-type="table" rid="T5">Table&#x00A0;5</xref> and <xref ref-type="sec" rid="s13">Supplementary Figure S1</xref>, along with the confusion matrix of our approaches, which is shown in <xref ref-type="fig" rid="F8">Figure&#x00A0;8</xref>. Compared with CNN-based models (<xref ref-type="bibr" rid="B84">84</xref>, <xref ref-type="bibr" rid="B87">87</xref>&#x2013;<xref ref-type="bibr" rid="B90">90</xref>), our proposed FLF approach achieves 86.7&#x0025;, 97.0&#x0025;, 84.6&#x0025;, and 85.2&#x0025; for balanced accuracy, specificity, recall, and F1-score, respectively, while DLF achieves 89.0&#x0025;, 97.3&#x0025;, 86.1&#x0025;, and 86.4&#x0025;, respectively. This implies the superiority of our proposed approaches, where the DLF approach achieves 0.5&#x0025; higher balanced accuracy than the best-performing existing benchmarks. Our observations demonstrate that decision-level fusion (DLF) achieves superior benchmark performance among the evaluated methods.</p>
<table-wrap id="T5" position="float"><label>Table 5</label>
<caption><p>Comparison of the proposed framework with existing methods applied to the ISIC 2018 dataset.</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Reference</th>
<th valign="top" align="center">Method</th>
<th valign="top" align="center">B. Acc.</th>
<th valign="top" align="center">Specificity</th>
<th valign="top" align="center">Recall</th>
<th valign="top" align="center">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Nozdryn et al. (<xref ref-type="bibr" rid="B84">84</xref>)</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center"><italic>88.5</italic></td>
<td valign="top" align="center"><bold>98.6</bold></td>
<td valign="top" align="center">83.3</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Gessert et al. (<xref ref-type="bibr" rid="B85">85</xref>)</td>
<td valign="top" align="left">DenseNet+ResNeXt+SENets</td>
<td valign="top" align="center">85.6</td>
<td valign="top" align="center"><italic>98.4</italic></td>
<td valign="top" align="center">80.9</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Zhuang et al. (<xref ref-type="bibr" rid="B86">86</xref>)</td>
<td valign="top" align="left">SENet+PNASNet</td>
<td valign="top" align="center">84.5</td>
<td valign="top" align="center">98.0</td>
<td valign="top" align="center">80.4</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Mahbod et al. (<xref ref-type="bibr" rid="B87">87</xref>)</td>
<td valign="top" align="left">EfficientNetB0+EfficientNetB1+SeReNeXt50</td>
<td valign="top" align="center">86.2</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Shen et al. (<xref ref-type="bibr" rid="B88">88</xref>)</td>
<td valign="top" align="left">EfficientNetB0</td>
<td valign="top" align="center">85.3</td>
<td valign="top" align="center">97.3</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Barata et al. (<xref ref-type="bibr" rid="B89">89</xref>)</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">79.1</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Tsai et al. (<xref ref-type="bibr" rid="B90">90</xref>)</td>
<td valign="top" align="left">CNN</td>
<td valign="top" align="center">82.1</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Our</td>
<td valign="top" align="left">FLF(DenseNet201+VGG19+ViT)</td>
<td valign="top" align="center">86.7</td>
<td valign="top" align="center">97.0</td>
<td valign="top" align="center"><italic>84.6</italic></td>
<td valign="top" align="center"><italic>85.2</italic></td>
</tr>
<tr>
<td valign="top" align="left">Our</td>
<td valign="top" align="left">DLF(DenseNet201+VGG19+ViT)</td>
<td valign="top" align="center"><bold>89.0</bold></td>
<td valign="top" align="center">97.3</td>
<td valign="top" align="center"><bold>86.1</bold></td>
<td valign="top" align="center"><bold>86.4</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-fn4"><p>Bold and italic values indicate the best and second-best benchmarks, respectively. Here, B. Acc. indicates the balanced accuracy.</p></fn>
</table-wrap-foot>
</table-wrap>
<fig id="F8" position="float"><label>Figure 8</label>
<caption><p>Normalized confusion matrices of FLF and DLF on the ISIC 2018 challenge test dataset. AKIEC, actinic keratosis; BCC, basal cell carcinoma; BKL, benign keratosis; DF, dermatofibroma; NV, melanocytic nevi; MEL, melanoma; VASC, vascular lesions. <bold>(a)</bold> Feature-level fusion. <bold>(b)</bold> Decision-level fusion.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g008.tif"><alt-text content-type="machine-generated">Two confusion matrices comparing feature-level fusion and decision-level fusion for classification performance. The left matrix shows feature-level fusion, highlighting high accuracy for BCC, DF, NV, and VASC. The right matrix represents decision-level fusion, indicating perfect accuracy for BCC, DF, and VASC, with improved AKIEC classification compared to feature-level fusion. Color intensity reflects classification accuracy, ranging from light (low accuracy) to dark blue (high accuracy).</alt-text>
</graphic>
</fig>
</sec>
<sec id="s5b4"><label>5.2.4</label><title>Evaluation on the ISIC 2019 dataset</title>
<p>The accuracy, precision, recall, and F1-score on the ISIC 2019 dataset are presented in <xref ref-type="table" rid="T6">Table&#x00A0;6</xref> and <xref ref-type="sec" rid="s13">Supplementary Figure S1</xref>, along with the confusion matrix of our approaches, which is shown in <xref ref-type="fig" rid="F9">Figure&#x00A0;9</xref>. Compared with the CNN-based model with an SVM (<xref ref-type="bibr" rid="B91">91</xref>), GoogleNet and DarkNet (<xref ref-type="bibr" rid="B96">96</xref>), EfficientNets, SENet, and ResNeXt (<xref ref-type="bibr" rid="B92">92</xref>), and the single end-to-end CNN-based model MobileNetV2 (<xref ref-type="bibr" rid="B95">95</xref>), our proposed FLF framework achieves 94.5&#x0025;, 94.7&#x0025;, 94.4&#x0025;, and 94.4&#x0025; accuracy, precision, recall, and F1-score, respectively, while DLF achieves 95.0&#x0025;, 94.9&#x0025;, 94.8&#x0025;, and 94.8&#x0025;, respectively. This implies that our proposed end-to-end FLF ensemble framework achieves comparable performance while DLF slightly improves over the existing best-performing benchmark. For example, DLF surpasses the existing best-performing benchmark [i.e., the approach in (<xref ref-type="bibr" rid="B94">94</xref>)] by 2.9&#x0025; and 1.8&#x0025; in precision and recall, respectively, while FLF surpasses it by 2.7&#x0025; and 1.4&#x0025;.</p>
<table-wrap id="T6" position="float"><label>Table 6</label>
<caption><p>Comparison of the proposed framework with existing methods applied to the ISIC 2019 dataset.</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Reference</th>
<th valign="top" align="center">Method</th>
<th valign="top" align="center">Accuracy</th>
<th valign="top" align="center">Precision</th>
<th valign="top" align="center">Recall</th>
<th valign="top" align="center">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Kassem et al. (<xref ref-type="bibr" rid="B91">91</xref>)</td>
<td valign="top" align="left">GoogleNet+SVM</td>
<td valign="top" align="center"><italic>94.9</italic></td>
<td valign="top" align="center">80.4</td>
<td valign="top" align="center">79.8</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Gessert et al. (<xref ref-type="bibr" rid="B92">92</xref>)</td>
<td valign="top" align="left">EfficientNets+SENet+ResNeXt</td>
<td valign="top" align="center">63.0</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">73.0</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Bhardwa et al. (<xref ref-type="bibr" rid="B93">93</xref>)</td>
<td valign="top" align="left">CNN+SVM</td>
<td valign="top" align="center">86.0</td>
<td valign="top" align="center">80.0</td>
<td valign="top" align="center">60.0</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Jain et al. (<xref ref-type="bibr" rid="B94">94</xref>)</td>
<td valign="top" align="left">DNN</td>
<td valign="top" align="center"><bold>95.0</bold></td>
<td valign="top" align="center">92.0</td>
<td valign="top" align="center">93.0</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Wang et al. (<xref ref-type="bibr" rid="B95">95</xref>)</td>
<td valign="top" align="left">MobileNetV2</td>
<td valign="top" align="center">84.6</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Abdelhafeez et al. (<xref ref-type="bibr" rid="B96">96</xref>)</td>
<td valign="top" align="left">GoogleNet+DarkNet+SVM</td>
<td valign="top" align="center">85.7</td>
<td valign="top" align="center">84.0</td>
<td valign="top" align="center">76.1</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Mustafa et al. (<xref ref-type="bibr" rid="B73">73</xref>)</td>
<td valign="top" align="left">ResUNet+AlexNet</td>
<td valign="top" align="center">93.4</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
<td valign="top" align="center">&#x2013;</td>
</tr>
<tr>
<td valign="top" align="left">Pacal et al. (<xref ref-type="bibr" rid="B83">83</xref>)</td>
<td valign="top" align="left">CNN + ViT</td>
<td valign="top" align="center">92.5</td>
<td valign="top" align="center">90.4</td>
<td valign="top" align="center">87.7</td>
<td valign="top" align="center">88.9</td>
</tr>
<tr>
<td valign="top" align="left">Our</td>
<td valign="top" align="left">FLF(DenseNet201+VGG19+ViT)</td>
<td valign="top" align="center">94.5</td>
<td valign="top" align="center"><italic>94.7</italic></td>
<td valign="top" align="center"><italic>94.4</italic></td>
<td valign="top" align="center"><italic>94.4</italic></td>
</tr>
<tr>
<td valign="top" align="left">Our</td>
<td valign="top" align="left">DLF(DenseNet201+VGG19+ViT)</td>
<td valign="top" align="center"><bold>95.0</bold></td>
<td valign="top" align="center"><bold>94.9</bold></td>
<td valign="top" align="center"><bold>94.8</bold></td>
<td valign="top" align="center"><bold>94.8</bold></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-fn5"><p>Bold and italic values indicate the best and second-best benchmarks, respectively.</p></fn>
</table-wrap-foot>
</table-wrap>
<fig id="F9" position="float"><label>Figure 9</label>
<caption><p>Normalized confusion matrices of FLF and DLF on the ISIC 2019 dataset. AKIEC, actinic keratosis; BCC, basal cell carcinoma; BKL, benign keratosis; DF, dermatofibroma; NV, melanocytic nevi; MEL, melanoma; SCC, squamous cell carcinoma; VASC, vascular lesions. <bold>(a)</bold> Feature-level fusion. <bold>(b)</bold> Decision-level fusion.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g009.tif"><alt-text content-type="machine-generated">Two confusion matrices comparing classification results. The first matrix, labeled "Feature-level fusion", shows higher accuracy in "NV" and "DF" classifications. The second matrix, labeled "Decision-level fusion", exhibits similar trends, with "NV" and "VASC" classifications showing high accuracy. Both matrices display scores ranging from 0.0 to 1.0, with darker shades indicating higher values.</alt-text>
</graphic>
</fig>
</sec>
</sec>
</sec>
<sec id="s6" sec-type="discussion"><label>6</label><title>Discussion</title>
<p>Our proposed framework leverages an ensemble approach that integrates two convolutional neural network (CNN)-based architectures: a modified DenseNet201 and a VGG19. Additionally, it incorporates an attention-based vision transformer model, ViT. To address data scarcity, we employed a pre-training strategy utilizing a generative adversarial network (GAN) (<xref ref-type="bibr" rid="B65">65</xref>) for generating image samples artificially. We also added samples from other ISIC archives. Moreover, we added other online data augmentation techniques during training. Here, we conduct a comparative analysis of the performance achieved by our proposed framework against various baselines (i.e., base models without data augmentation (DA), with DA using GAN (DA&#x005F;GAN), and with DA using ISIC archives (DA&#x005F;Archive)). Additionally, we compare each of the base models. Similarly, we provide an in-depth performance analysis of the FLF and DLF for the final classification. In this section, for ablation studies of our proposed framework, we selected the small-scale and large-scale datasets PH2 and ISIC 2019 and adhered to the identical protocol outlined in <xref ref-type="sec" rid="s4a">Section 4.1</xref> for these analyses.</p>
<sec id="s6a"><label>6.1</label><title>Impact of data augmentation</title>
<p>We illustrate the impact of data augmentation for each of the base models (the modified VGG19, DenseNet201, and ViT) along with the proposed frameworks in <xref ref-type="table" rid="T7">Table&#x00A0;7</xref>. We can observe that the accuracy is improved by a large margin for a small-scale dataset (i.e., PH2 dataset) when employing the augmentation using GAN as well as ISIC archives. As shown in <xref ref-type="table" rid="T7">Table&#x00A0;7</xref>, the accuracy is improved by 11.6&#x0025; to 22.8&#x0025; when we increase the training sample size using the deep generative approach. Moreover, improvement continues when the training data volume is again increased by adding samples from ISIC archives. Overall, we can see that the accuracy is improved by 15.2&#x0025; to 25.4&#x0025; when we augment the training dataset using a generative approach and add samples from ISIC archives. We think that this large-margin accuracy improvement for the small-scale dataset PH2 when augmenting the training dataset arises because large-scale training datasets are essential for the DL-based approach for effective training and generalization.</p>
<table-wrap id="T7" position="float"><label>Table 7</label>
<caption><p>Result of each base model before and after data augmentation.</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Dataset</th>
<th valign="top" align="left">Method</th>
<th valign="top" align="center">DA&#x005F;GAN</th>
<th valign="top" align="center">DA&#x005F;Archive</th>
<th valign="top" align="center">Accuracy</th>
<th valign="top" align="center">Precision</th>
<th valign="top" align="center">Recall</th>
<th valign="top" align="center">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="15">PH2</td>
<td valign="top" align="left" rowspan="3">VGG19</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">70.0</td>
<td valign="top" align="center">71.6</td>
<td valign="top" align="center">70.0</td>
<td valign="top" align="center">68.9</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">88.9</td>
<td valign="top" align="center">89.0</td>
<td valign="top" align="center">88.9</td>
<td valign="top" align="center">88.9</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">93.4</td>
<td valign="top" align="center">93.5</td>
<td valign="top" align="center">93.4</td>
<td valign="top" align="center">93.4</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">DenseNet201</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">72.5</td>
<td valign="top" align="center">74.0</td>
<td valign="top" align="center">72.5</td>
<td valign="top" align="center">72.6</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">95.3</td>
<td valign="top" align="center">95.3</td>
<td valign="top" align="center">95.3</td>
<td valign="top" align="center">95.3</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">97.9</td>
<td valign="top" align="center">97.9</td>
<td valign="top" align="center">97.9</td>
<td valign="top" align="center">97.9</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">ViT</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">80.0</td>
<td valign="top" align="center">83.6</td>
<td valign="top" align="center">80.0</td>
<td valign="top" align="center">79.8</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">95.2</td>
<td valign="top" align="center">95.4</td>
<td valign="top" align="center">95.2</td>
<td valign="top" align="center">95.2</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">98.8</td>
<td valign="top" align="center">98.8</td>
<td valign="top" align="center">98.8</td>
<td valign="top" align="center">98.8</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">Our FLF</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">82.5</td>
<td valign="top" align="center">84.1</td>
<td valign="top" align="center">82.5</td>
<td valign="top" align="center">82.6</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">96.9</td>
<td valign="top" align="center">95.7</td>
<td valign="top" align="center">95.5</td>
<td valign="top" align="center">95.5</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">99.3</td>
<td valign="top" align="center">99.3</td>
<td valign="top" align="center">99.3</td>
<td valign="top" align="center">99.3</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">Our DLF</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">75.0</td>
<td valign="top" align="center">75.2</td>
<td valign="top" align="center">75.0</td>
<td valign="top" align="center">74.9</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">95.4</td>
<td valign="top" align="center">95.4</td>
<td valign="top" align="center">95.4</td>
<td valign="top" align="center">95.4</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">98.2</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="15">ISIC 2019</td>
<td valign="top" align="left" rowspan="3">VGG19</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">82.0</td>
<td valign="top" align="center">82.0</td>
<td valign="top" align="center">82.0</td>
<td valign="top" align="center">82.0</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">85.1</td>
<td valign="top" align="center">85.0</td>
<td valign="top" align="center">85.1</td>
<td valign="top" align="center">85.0</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">87.4</td>
<td valign="top" align="center">86.5</td>
<td valign="top" align="center">86.7</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">DenseNet201</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">90.0</td>
<td valign="top" align="center">90.0</td>
<td valign="top" align="center">90.0</td>
<td valign="top" align="center">90.0</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">90.7</td>
<td valign="top" align="center">90.8</td>
<td valign="top" align="center">90.7</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">93.0</td>
<td valign="top" align="center">93.3</td>
<td valign="top" align="center">93.0</td>
<td valign="top" align="center">93.1</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">ViT</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">91.6</td>
<td valign="top" align="center">91.5</td>
<td valign="top" align="center">91.6</td>
<td valign="top" align="center">91.5</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">92.4</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">92.4</td>
<td valign="top" align="center">92.3</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">94.3</td>
<td valign="top" align="center">94.2</td>
<td valign="top" align="center">94.1</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">Our FLF</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">91.7</td>
<td valign="top" align="center">91.6</td>
<td valign="top" align="center">91.6</td>
<td valign="top" align="center">91.5</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">92.4</td>
<td valign="top" align="center">92.3</td>
<td valign="top" align="center">92.4</td>
<td valign="top" align="center">92.3</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">94.5</td>
<td valign="top" align="center">94.7</td>
<td valign="top" align="center">94.4</td>
<td valign="top" align="center">94.4</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">Our DLF</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">91.9</td>
<td valign="top" align="center">91.8</td>
<td valign="top" align="center">91.8</td>
<td valign="top" align="center">91.7</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2717;</td>
<td valign="top" align="center">92.6</td>
<td valign="top" align="center">92.5</td>
<td valign="top" align="center">92.6</td>
<td valign="top" align="center">92.5</td>
</tr>
<tr>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">&#x2713;</td>
<td valign="top" align="center">95.0</td>
<td valign="top" align="center">94.9</td>
<td valign="top" align="center">94.8</td>
<td valign="top" align="center">94.8</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In contrast, for the large-scale dataset ISIC 2019, we observed marginally improved accuracy when employing data augmentation techniques. For instance, the accuracy is improved by 0.7&#x0025; to 0.8&#x0025; when the training sample is augmented by the generative approach (GAN). We observed a similar tendency when we added samples from ISIC archives. In general, the accuracy is improved by around 3.0&#x0025; when the training data is augmented using GAN and ISIC archives. This modest improvement can be attributed to the inherent characteristics of the ISIC 2019 dataset. As a large-scale dataset encompassing 22,797 samples, it already possesses a high degree of diversity and quantity, providing a sufficient foundation for robust model training.</p>
</sec>
<sec id="s6b"><label>6.2</label><title>Impact of individual modules</title>
<p>We evaluated each of the base models considered in our framework separately: the modified VGG19, DenseNet201, and ViT. We can observe that the vision transformer-based ViT model works better than the CNN-based models. For example, the accuracy of the ViT model with data augmentation is 98.8&#x0025; on the small-scale dataset PH2 while 97.9&#x0025;/93.4&#x0025; for the DenseNet201/VGG19. Regarding the large-scale dataset ISIC 2019, we observe a similar tendency that the ViT model works better than the CNN-based approaches DenseNet201 and VGG19. We think that ViT-based models work better because they capture global and local contexts more effectively and learn complex relationships without relying on fixed receptive fields. Furthermore, unlike the CNN-based approach, ViT leverages self-attention mechanisms to consider interactions between image patches, enabling it to better understand long-range dependencies crucial for tasks like detection and classification.</p>
<p>Regarding the CNN-based approaches of the modified DenseNet201 and VGG19, we can observe that DenseNet201 works better than VGG19. For example, DenseNet201 obtained an accuracy of 97.9&#x0025; on the small-scale dataset PH2 and 93.4&#x0025; on the large-scale dataset ISIC 2019. This indicates that it surpassed VGG19 by 4.5&#x0025; and 6.5&#x0025; on the PH2 and ISIC 2019 datasets, respectively. We think the reason is that VGG19 is a relatively straightforward network where each layer feeds into the next, whereas DenseNet201 incorporates dense connections, where each layer receives additional inputs from all preceding layers and passes its feature maps to all subsequent layers. This characteristic allows for feature reuse throughout the network, consequently enhancing model performance and mitigating the risks of overfitting and vanishing gradients.</p>
<p>To further assess the interpretability and clinical relevance of the models, we generated Grad-CAM (<xref ref-type="bibr" rid="B98">98</xref>) visualizations using the DenseNet201 architecture. These heatmaps highlight the image regions that most strongly contributed to each prediction, showing that the model predominantly focuses on lesion areas rather than irrelevant background. An example Grad-CAM activation map is presented in <xref ref-type="fig" rid="F10">Figure&#x00A0;10</xref>, demonstrating the alignment between the model&#x2019;s attention and dermatological diagnostic regions.</p>
<fig id="F10" position="float"><label>Figure 10</label>
<caption><p>Grad-CAM visualizations of different skin lesion classes from the HAM10000 dataset, where AKIEC, actinic keratosis; BCC, basal cell carcinoma; BKL, benign keratosis; DF, dermatofibroma; NV, melanocytic nevi; MEL, Melanoma; VASC, vascular lesions. <bold>(a)</bold> AKIEC. <bold>(b)</bold> BCC. <bold>(c)</bold> BKL. <bold>(d)</bold> DF. <bold>(e)</bold> NV. <bold>(f)</bold> MEL. <bold>(g)</bold> VASC.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="fdgth-07-1478688-g0010.tif"><alt-text content-type="machine-generated">Seven images show skin lesions with varying colors and patterns, labeled as Figures 10a to 10g. Each lesion is depicted with heatmap colors, indicating different skin conditions: AKIEC, BCC, BKL, DF, NV, MEL, and VASC.</alt-text>
</graphic>
</fig>
</sec>
<sec id="s6c"><label>6.3</label><title>Impact of the attention mechanism</title>
<p>Our framework employs the MHSA mechanism of ViT (<xref ref-type="bibr" rid="B44">44</xref>). To assess the impact of using an attention-based model, we compare the performance of ViT against CNN-based models (i.e., DenseNet201 and VGG19). ViT consistently outperforms CNN models, achieving an accuracy of 98.8&#x0025; on PH2 compared to 97.9&#x0025; and 93.4&#x0025; for DenseNet201 and VGG19, respectively. A similar trend is observed on ISIC 2019. These results confirm that the inclusion of MHSA into the ViT improves accuracy and the ViT&#x2019;s ability to capture long-range dependencies and global contextual features.</p>
</sec>
<sec id="s6d"><label>6.4</label><title>Comparison with feature and decision-level fusion</title>
<p>For the final classification stage of our proposed ensemble model, we employed a fusion strategy that leverages both feature-level fusion (FLF) and decision-level fusion (DLF). The performance achieved by this framework is presented in <xref ref-type="table" rid="T2">Tables&#x00A0;2</xref>, <xref ref-type="table" rid="T3">3</xref>, <xref ref-type="table" rid="T5">5</xref>, <xref ref-type="table" rid="T6">6</xref>. <xref ref-type="sec" rid="s13">Supplementary Figure S2</xref> includes the classification report figures for FLF and DLF models, based on the HAM10000, ISIC 2018, and ISIC 2019 datasets. We can observe that DLF exhibits marginally superior performance compared to the end-to-end FLF model. For example, DLF surpasses FLF in accuracy by 2.3&#x0025; on the ISIC 2018 dataset and by 0.5&#x0025; on the ISIC 2019 dataset. This may be because each individual model is trained robustly on its own and the individual decisions of the respective classifiers are then merged. However, decision-level fusion (DLF) necessitates a longer processing time to arrive at the final classification result, and it is not an end-to-end process.</p>
</sec>
<sec id="s6e"><label>6.5</label><title>Comparison of different decision-level fusion techniques</title>
<p>We performed various fusion strategies for decision-level fusion, specifically employing averaging voting, weighted averaging voting, and majority voting (<xref ref-type="bibr" rid="B99">99</xref>). Averaging voting (AVG) refers to taking the mean of the prediction scores from base classifiers to make the final decision, while weighted averaging voting (WAVG) applies different weights to these scores. For the weighted average case, we empirically assigned weights of 0.4, 0.3, and 0.3 to the prediction scores of ViT, DenseNet201, and VGG19, respectively. These weightings were determined through a sensitivity analysis, which revealed that the selected values provide the best balance between model performance across the HAM10000, ISIC 2018, and ISIC 2019 datasets. The majority voting (MJ) technique, as described in <xref ref-type="sec" rid="s3c">Section 3.3</xref>, involves selecting the class that appears most frequently among the predictions of the base classifiers. The results are presented in <xref ref-type="table" rid="T8">Table&#x00A0;8</xref>. Our observations show that the MJ technique achieves superior accuracy, while AVG and WAVG perform almost equally. This superiority of MJ can be attributed to its core principle of aggregating predictions and selecting the most frequent class, which reduces the impact of outliers or misclassifications from individual base models.</p>
<table-wrap id="T8" position="float"><label>Table 8</label>
<caption><p>Performance evaluation of our proposed ensemble framework for decision-level fusion using different fusion techniques [average (AVG), weighted average (WAVG), and majority voting (MJ)].</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Dataset</th>
<th valign="top" align="left">Method</th>
<th valign="top" align="center">Accuracy</th>
<th valign="top" align="center">Precision</th>
<th valign="top" align="center">Recall</th>
<th valign="top" align="center">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left" rowspan="3">PH2</td>
<td valign="top" align="left">AVG</td>
<td valign="top" align="center">97.9</td>
<td valign="top" align="center">98.0</td>
<td valign="top" align="center">97.9</td>
<td valign="top" align="center">97.9</td>
</tr>
<tr>
<td valign="top" align="left">WAVG</td>
<td valign="top" align="center">98.1</td>
<td valign="top" align="center">98.1</td>
<td valign="top" align="center">98.1</td>
<td valign="top" align="center">98.1</td>
</tr>
<tr>
<td valign="top" align="left">MJ</td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">98.2</td>
<td valign="top" align="center">98.1</td>
</tr>
<tr>
<td valign="top" align="left" rowspan="3">ISIC 2019</td>
<td valign="top" align="left">AVG</td>
<td valign="top" align="center">93.8</td>
<td valign="top" align="center">93.9</td>
<td valign="top" align="center">93.8</td>
<td valign="top" align="center">93.8</td>
</tr>
<tr>
<td valign="top" align="left">WAVG</td>
<td valign="top" align="center">93.9</td>
<td valign="top" align="center">94.0</td>
<td valign="top" align="center">93.9</td>
<td valign="top" align="center">93.8</td>
</tr>
<tr>
<td valign="top" align="left">MJ</td>
<td valign="top" align="center">95.0</td>
<td valign="top" align="center">94.9</td>
<td valign="top" align="center">94.8</td>
<td valign="top" align="center">94.8</td>
</tr>
</tbody>
</table>
</table-wrap>
</sec>
<sec id="s6f"><label>6.6</label><title>Cross-dataset evaluation</title>
<p>To assess the generalizability and robustness of the proposed approach, we conducted a cross-dataset evaluation by training the models on the PH2 dataset and testing them on the Derm7pt test dataset (<xref ref-type="bibr" rid="B100">100</xref>). This setup simulates a real-world scenario in which a model is trained on a small-scale dataset and applied to an independent large-scale dataset with potentially different data distributions. The Derm7pt dataset includes five general disease classes: melanoma, nevus, seborrheic keratosis, basal cell carcinoma, and miscellaneous. In our experimental setting, we focused on the two disease classes common to the PH2 dataset (i.e., nevus and melanoma). For this purpose, we merged common nevus (CN) and atypical nevus (AN) into a single nevus class. The experimental results are shown in <xref ref-type="table" rid="T9">Table&#x00A0;9</xref> for each of the base models: ViT, DenseNet201, and VGG19, as well as our proposed FLF and DLF approaches. We can observe that DenseNet201 achieved the highest accuracy (80.6&#x0025;) and F1-score (79.6&#x0025;) among the individual models. Compared to all base models, the FLF ensemble yielded the best overall performance, with an accuracy of 82.1&#x0025;, precision of 82.4&#x0025;, recall of 82.1&#x0025;, and F1-score of 80.8&#x0025;. The DLF approach also outperformed the individual models in accuracy (81.3&#x0025;), precision (81.9&#x0025;), and recall (81.2&#x0025;), although its F1-score of 79.4&#x0025; is slightly below that of DenseNet201 (79.6&#x0025;). These results demonstrate that the proposed fusion frameworks improve generalization and robustness for cross-dataset evaluation.</p>
<table-wrap id="T9" position="float"><label>Table 9</label>
<caption><p>Cross-dataset evaluation results: Models trained on the PH2 dataset and tested on the Derm7pt dataset.</p></caption>
<table frame="hsides" rules="groups">
<colgroup>
<col align="left"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
<col align="center"/>
</colgroup>
<thead>
<tr>
<th valign="top" align="left">Method</th>
<th valign="top" align="center">Accuracy</th>
<th valign="top" align="center">Precision</th>
<th valign="top" align="center">Recall</th>
<th valign="top" align="center">F1-score</th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">VGG19</td>
<td valign="top" align="center">79.6</td>
<td valign="top" align="center">80.5</td>
<td valign="top" align="center">79.6</td>
<td valign="top" align="center">77.3</td>
</tr>
<tr>
<td valign="top" align="left">DenseNet201</td>
<td valign="top" align="center">80.6</td>
<td valign="top" align="center">80.0</td>
<td valign="top" align="center">80.6</td>
<td valign="top" align="center">79.6</td>
</tr>
<tr>
<td valign="top" align="left">ViT</td>
<td valign="top" align="center">78.4</td>
<td valign="top" align="center">79.0</td>
<td valign="top" align="center">78.4</td>
<td valign="top" align="center">75.8</td>
</tr>
<tr>
<td valign="top" align="left">Our FLF</td>
<td valign="top" align="center"><bold>82.1</bold></td>
<td valign="top" align="center"><bold>82.4</bold></td>
<td valign="top" align="center"><bold>82.1</bold></td>
<td valign="top" align="center"><bold>80.8</bold></td>
</tr>
<tr>
<td valign="top" align="left">Our DLF</td>
<td valign="top" align="center"><italic>81.3</italic></td>
<td valign="top" align="center"><italic>81.9</italic></td>
<td valign="top" align="center"><italic>81.2</italic></td>
<td valign="top" align="center"><italic>79.4</italic></td>
</tr>
</tbody>
</table>
<table-wrap-foot>
<fn id="table-fn9a"><p>Bold and italic values indicate the best and second-best benchmarks, respectively.</p></fn>
</table-wrap-foot>
</table-wrap>
</sec>
</sec>
<sec id="s7" sec-type="conclusions"><label>7</label><title>Conclusion</title>
<p>Skin disease is one of the most prevalent and potentially life-threatening diseases that has affected people all over the world. Early detection and treatment are crucial for improving patient outcomes. However, the subjective nature of the healthcare providers&#x2019; approach to early diagnosis can be both costly and unpredictable, potentially leading to variable results in patient care. In this paper, we proposed a deep learning-based ensemble model, including CNN-based base models and an attention-based vision transformer network for diagnosing skin diseases. The proposed framework considers feature-level fusion (FLF), in which the features extracted from each of the base models are merged through pointwise addition in a separate layer, followed by a final classification layer with Softmax. We also employed decision-level fusion (DLF) by applying majority voting to the classification results of the base models.</p>
<p>To evaluate the proposed framework, we employed four publicly available datasets encompassing ten distinct skin diseases: Actinic keratosis, Basal cell carcinoma, Benign keratosis, Dermatofibroma, Melanocytic nevi, Melanoma, Squamous cell carcinoma, Common nevi, Atypical nevi, and Vascular lesions. We assessed performance using standard metrics: accuracy, precision, recall, and F1-score. Our results demonstrate that the proposed FLF and DLF outperform existing methods. The experimental evaluation shows the effectiveness of the majority voting technique over other ensemble techniques such as averaging and weighted averaging. Furthermore, we conducted a comprehensive analysis of each base model within the proposed framework, revealing a significant accuracy improvement attributable to the framework itself. Additionally, we employed a variety of online and offline data augmentation methods to expand the training dataset, mitigate overfitting, and enhance model generalizability. It is evident from our findings that data augmentation significantly enhances accuracy. Despite these promising results, the proposed approach has certain limitations. Particularly, the architecture of the proposed ensemble model requires the concurrent training and inference of two CNN-based models and an attention-based Vision Transformer (ViT) model, leading to increased training time and demanding significant computational resources. Therefore, this may pose practical challenges in resource-constrained real-time clinical settings. Future work could explore a more lightweight ensemble model to mitigate these constraints. Additionally, future architectures could investigate a potential closed loop between data generation and data analysis to avoid the explicit generation and training of data.</p>
<p>Since our model uses the canonical MHSA without modification, an additional ablation comparing alternative attention mechanisms is outside the scope of this study but is suggested as future work.</p>
</sec>
</body>
<back>
<sec id="s8" sec-type="data-availability"><title>Data availability statement</title>
<p>The raw data supporting the conclusions of this article will be made available by the authors, without undue reservation.</p>
</sec>
<sec id="s9" sec-type="ethics-statement"><title>Ethics statement</title>
<p>Ethical approval was not required for the study involving humans in accordance with the local legislation and institutional requirements. Written informed consent to participate in this study was not required from the participants or the participants&#x2019; legal guardians/next of kin in accordance with the national legislation and the institutional requirements.</p>
</sec>
<sec id="s10" sec-type="author-contributions"><title>Author contributions</title>
<p>MZU: Conceptualization, Data curation, Funding acquisition, Methodology, Resources, Software, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. MAS: Methodology, Software, Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. BS: Data curation, Formal analysis, Methodology, Validation, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. MNM: Visualization, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing. MARA: Conceptualization, Data curation, Funding acquisition, Writing &#x2013; original draft, Writing &#x2013; review &#x0026; editing.</p>
</sec>
<sec id="s11" sec-type="funding-information"><title>Funding</title>
<p>The author(s) declare that financial support was received for the research and/or publication of this article. This work was partially supported by the ICT division, Government of the People&#x2019;s Republic of Bangladesh, No: (1280101-120008431-3631108). The authors are grateful for the partial grant from the Center for Natural Science and Engineering Research (CNSER).</p>
</sec>
<sec id="s12" sec-type="COI-statement"><title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec id="s14" sec-type="disclaimer"><title>Publisher&#x0027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s13" sec-type="supplementary-material"><title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fdgth.2025.1478688/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fdgth.2025.1478688/full&#x0023;supplementary-material</ext-link></p>
<supplementary-material id="SD1" content-type="local-data">
<media mimetype="application" mime-subtype="pdf" xlink:href="Datasheet1.pdf"/></supplementary-material>
</sec>
<fn-group>
<fn id="FN0001"><p><sup>1</sup><ext-link ext-link-type="uri" xlink:href="https://challenge.isic-archive.com/data/#2018">https://challenge.isic-archive.com/data/&#x0023;2018</ext-link></p></fn>
<fn id="FN0002"><p><sup>2</sup><ext-link ext-link-type="uri" xlink:href="https://challenge.isic-archive.com/data/#2019">https://challenge.isic-archive.com/data/&#x0023;2019</ext-link></p></fn>
<fn id="FN0003"><p><sup>3</sup><ext-link ext-link-type="uri" xlink:href="https://www.isic-archive.com/">https://www.isic-archive.com/</ext-link></p></fn>
</fn-group>
<ref-list><title>References</title>
<ref id="B1"><label>1.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Li</surname><given-names>H</given-names></name><name><surname>Pan</surname><given-names>Y</given-names></name><name><surname>Zhao</surname><given-names>J</given-names></name><name><surname>Zhang</surname><given-names>L</given-names></name></person-group>. <article-title>Skin disease diagnosis with deep learning: a review</article-title>. <source>Neurocomputing</source>. (<year>2021</year>) <volume>464</volume>:<fpage>364</fpage>&#x2013;<lpage>93</lpage>. <pub-id pub-id-type="doi">10.1016/j.neucom.2021.08.096</pub-id></citation></ref>
<ref id="B2"><label>2.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Thieme</surname><given-names>AH</given-names></name><name><surname>Zheng</surname><given-names>Y</given-names></name><name><surname>Machiraju</surname><given-names>G</given-names></name><name><surname>Sadee</surname><given-names>C</given-names></name><name><surname>Mittermaier</surname><given-names>M</given-names></name><name><surname>Gertler</surname><given-names>M</given-names></name><etal/></person-group>. <article-title>A deep-learning algorithm to classify skin lesions from mpox virus infection</article-title>. <source>Nat Med</source>. (<year>2023</year>) <volume>29</volume>(<issue>3</issue>):<fpage>738</fpage>&#x2013;<lpage>47</lpage>. <pub-id pub-id-type="doi">10.1038/s41591-023-02225-7</pub-id><pub-id pub-id-type="pmid">36864252</pub-id></citation></ref>
<ref id="B3"><label>3.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khan</surname><given-names>MA</given-names></name><name><surname>Muhammad</surname><given-names>K</given-names></name><name><surname>Sharif</surname><given-names>M</given-names></name><name><surname>Akram</surname><given-names>T</given-names></name><name><surname>de Albuquerque</surname><given-names>VHC</given-names></name></person-group>. <article-title>Multi-class skin lesion detection and classification via teledermatology</article-title>. <source>IEEE J Biomed Health Inform</source>. (<year>2021</year>) <volume>25</volume>(<issue>12</issue>):<fpage>4267</fpage>&#x2013;<lpage>75</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2021.3067789</pub-id><pub-id pub-id-type="pmid">33750716</pub-id></citation></ref>
<ref id="B4"><label>4.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Hu</surname><given-names>M</given-names></name><name><surname>Li</surname><given-names>Y</given-names></name><name><surname>Yang</surname><given-names>X</given-names></name></person-group>. <article-title>Skinsam: Empowering skin cancer segmentation with segment anything model</article-title>. <comment><italic>arXiv</italic> [Preprint]. <italic>arXiv:2304.13973</italic> (2023)</comment>.</citation></ref>
<ref id="B5"><label>5.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Davis</surname><given-names>LE</given-names></name><name><surname>Shalin</surname><given-names>SC</given-names></name><name><surname>Tackett</surname><given-names>AJ</given-names></name></person-group>. <article-title>Current state of melanoma diagnosis and treatment</article-title>. <source>Cancer Biol Ther</source>. (<year>2019</year>) <volume>20</volume>(<issue>11</issue>):<fpage>1366</fpage>&#x2013;<lpage>79</lpage>. <pub-id pub-id-type="doi">10.1080/15384047.2019.1640032</pub-id><pub-id pub-id-type="pmid">31366280</pub-id></citation></ref>
<ref id="B6"><label>6.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Siegel</surname><given-names>RL</given-names></name><name><surname>Miller</surname><given-names>KD</given-names></name><name><surname>Jemal</surname><given-names>A</given-names></name></person-group>. <article-title>Cancer statistics, 2019</article-title>. <source>CA Cancer J Clin</source>. (<year>2019</year>) <volume>69</volume>(<issue>1</issue>):<fpage>7</fpage>&#x2013;<lpage>34</lpage>.<pub-id pub-id-type="pmid">30620402</pub-id></citation></ref>
<ref id="B7"><label>7.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pathan</surname><given-names>S</given-names></name><name><surname>Prabhu</surname><given-names>KG</given-names></name><name><surname>Siddalingaswamy</surname><given-names>PC</given-names></name></person-group>. <article-title>Techniques and algorithms for computer aided diagnosis of pigmented skin lesions&#x2014;a review</article-title>. <source>Biomed Signal Process Control</source>. (<year>2018</year>) <volume>39</volume>:<fpage>237</fpage>&#x2013;<lpage>62</lpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2017.07.010</pub-id></citation></ref>
<ref id="B8"><label>8.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chatterjee</surname><given-names>S</given-names></name><name><surname>Dey</surname><given-names>D</given-names></name><name><surname>Munshi</surname><given-names>S</given-names></name><name><surname>Gorai</surname><given-names>S</given-names></name></person-group>. <article-title>Extraction of features from cross correlation in space and frequency domains for classification of skin lesions</article-title>. <source>Biomed Signal Process Control</source>. (<year>2019</year>) <volume>53</volume>:<fpage>101581</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2019.101581</pub-id></citation></ref>
<ref id="B9"><label>9.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Celebi</surname><given-names>ME</given-names></name><name><surname>Kingravi</surname><given-names>HA</given-names></name><name><surname>Uddin</surname><given-names>B</given-names></name><name><surname>Iyatomi</surname><given-names>H</given-names></name><name><surname>Alp Aslandogan</surname><given-names>Y</given-names></name><name><surname>Stoecker</surname><given-names>WV</given-names></name><etal/></person-group>. <article-title>A methodological approach to the classification of dermoscopy images</article-title>. <source>Comput Med Imaging Graph</source>. (<year>2007</year>) <volume>31</volume>(<issue>6</issue>):<fpage>362</fpage>&#x2013;<lpage>73</lpage>. <pub-id pub-id-type="doi">10.1016/j.compmedimag.2007.01.003</pub-id><pub-id pub-id-type="pmid">17387001</pub-id></citation></ref>
<ref id="B10"><label>10.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Campos-do Carmo</surname><given-names>G</given-names></name><name><surname>Ramos-e Silva</surname><given-names>M</given-names></name></person-group>. <article-title>Dermoscopy: basic concepts</article-title>. <source>Int J Dermatol</source>. (<year>2008</year>) <volume>47</volume>(<issue>7</issue>):<fpage>712</fpage>&#x2013;<lpage>9</lpage>.<pub-id pub-id-type="pmid">18613881</pub-id></citation></ref>
<ref id="B11"><label>11.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Korfitis</surname><given-names>C</given-names></name><name><surname>Gregoriou</surname><given-names>S</given-names></name><name><surname>Antoniou</surname><given-names>C</given-names></name><name><surname>Katsambas</surname><given-names>AD</given-names></name><name><surname>Rigopoulos</surname><given-names>D</given-names></name></person-group>. <article-title>Skin biopsy in the context of dermatological diagnosis: a retrospective cohort study</article-title>. <source>Dermatol Res Pract</source>. (<year>2014</year>) <volume>2014</volume>:<fpage>734906</fpage>. <pub-id pub-id-type="doi">10.1155/2014/734906</pub-id><pub-id pub-id-type="pmid">24600476</pub-id></citation></ref>
<ref id="B12"><label>12.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Aladhadh</surname><given-names>S</given-names></name><name><surname>Alsanea</surname><given-names>M</given-names></name><name><surname>Aloraini</surname><given-names>M</given-names></name><name><surname>Khan</surname><given-names>T</given-names></name><name><surname>Habib</surname><given-names>S</given-names></name><name><surname>Islam</surname><given-names>M</given-names></name></person-group>. <article-title>An effective skin cancer classification mechanism via medical vision transformer</article-title>. <source>Sensors</source>. (<year>2022</year>) <volume>22</volume>(<issue>11</issue>):<fpage>4008</fpage>. <pub-id pub-id-type="doi">10.3390/s22114008</pub-id><pub-id pub-id-type="pmid">35684627</pub-id></citation></ref>
<ref id="B13"><label>13.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Zhao</surname><given-names>Z</given-names></name></person-group>. <article-title>Skin cancer classification based on convolutional neural networks and vision transformers</article-title>. <comment>In: <italic>Journal of Physics: Conference Series</italic>. IOP Publishing (2022). Vol. 2405. p. 012037</comment>.</citation></ref>
<ref id="B14"><label>14.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Attallah</surname><given-names>O</given-names></name></person-group>. <article-title>Skin-cad: explainable deep learning classification of skin cancer from dermoscopic images by feature selection of dual high-level cnns features and transfer learning</article-title>. <source>Comput Biol Med</source>. (<year>2024</year>) <volume>178</volume>:<fpage>108798</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108798</pub-id><pub-id pub-id-type="pmid">38925085</pub-id></citation></ref>
<ref id="B15"><label>15.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shehzad</surname><given-names>K</given-names></name><name><surname>Zhenhua</surname><given-names>T</given-names></name><name><surname>Shoukat</surname><given-names>S</given-names></name><name><surname>Saeed</surname><given-names>A</given-names></name><name><surname>Ahmad</surname><given-names>I</given-names></name><name><surname>Bhatti</surname><given-names>SS</given-names></name><etal/></person-group>. <article-title>A deep-ensemble-learning-based approach for skin cancer diagnosis</article-title>. <source>Electronics</source>. (<year>2023</year>) <volume>12</volume>(<issue>6</issue>):<fpage>1342</fpage>. <pub-id pub-id-type="doi">10.3390/electronics12061342</pub-id></citation></ref>
<ref id="B16"><label>16.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ahammed</surname><given-names>M</given-names></name><name><surname>Mamun</surname><given-names>MA</given-names></name><name><surname>Uddin</surname><given-names>MS</given-names></name></person-group>. <article-title>A machine learning approach for skin disease detection and classification using image segmentation</article-title>. <source>Healthc Anal</source>. (<year>2022</year>) <volume>2</volume>:<fpage>100122</fpage>. <pub-id pub-id-type="doi">10.1016/j.health.2022.100122</pub-id></citation></ref>
<ref id="B17"><label>17.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jagdish</surname><given-names>M</given-names></name><name><surname>Guamangate</surname><given-names>SPG</given-names></name><name><surname>L&#x00F3;pez</surname><given-names>MAG</given-names></name><name><surname>De La Cruz-Vargas</surname><given-names>JA</given-names></name><name><surname>Camacho</surname><given-names>MER</given-names></name></person-group>. <article-title>Advance study of skin diseases detection using image processing methods</article-title>. <source>Nat Volatiles Essent OILS J</source>. (<year>2022</year>) <volume>9</volume>:<fpage>997</fpage>&#x2013;<lpage>1007</lpage>.</citation></ref>
<ref id="B18"><label>18.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Niu</surname><given-names>K</given-names></name><name><surname>Guo</surname><given-names>J</given-names></name><name><surname>Pan</surname><given-names>Y</given-names></name><name><surname>Gao</surname><given-names>X</given-names></name><name><surname>Peng</surname><given-names>X</given-names></name><name><surname>Li</surname><given-names>N</given-names></name><etal/></person-group>. <article-title>Multichannel deep attention neural networks for the classification of autism spectrum disorder using neuroimaging and personal characteristic data</article-title>. <source>Complexity</source>. (<year>2020</year>) <volume>2020</volume>:<fpage>1</fpage>&#x2013;<lpage>9</lpage>.</citation></ref>
<ref id="B19"><label>19.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Brinker</surname><given-names>TJ</given-names></name><name><surname>Hekler</surname><given-names>A</given-names></name><name><surname>Enk</surname><given-names>AH</given-names></name><name><surname>Berking</surname><given-names>C</given-names></name><name><surname>Haferkamp</surname><given-names>S</given-names></name><name><surname>Hauschild</surname><given-names>A</given-names></name><etal/></person-group>. <article-title>Deep neural networks are superior to dermatologists in melanoma image classification</article-title>. <source>Eur J Cancer</source>. (<year>2019</year>) <volume>119</volume>:<fpage>11</fpage>&#x2013;<lpage>7</lpage>. <pub-id pub-id-type="doi">10.1016/j.ejca.2019.05.023</pub-id><pub-id pub-id-type="pmid">31401469</pub-id></citation></ref>
<ref id="B20"><label>20.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Khan</surname><given-names>MA</given-names></name><name><surname>Sharif</surname><given-names>MI</given-names></name><name><surname>Raza</surname><given-names>M</given-names></name><name><surname>Anjum</surname><given-names>A</given-names></name><name><surname>Saba</surname><given-names>T</given-names></name><name><surname>Ali Shad</surname><given-names>S</given-names></name></person-group>. <article-title>Skin lesion segmentation and classification: a unified framework of deep neural network features fusion and selection</article-title>. <source>Expert Syst</source>. (<year>2022</year>) <volume>39</volume>(<issue>7</issue>):<fpage>e12497</fpage>. <pub-id pub-id-type="doi">10.1111/exsy.12497</pub-id></citation></ref>
<ref id="B21"><label>21.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ganaie</surname><given-names>MA</given-names></name><name><surname>Hu</surname><given-names>M</given-names></name><name><surname>Malik</surname><given-names>AK</given-names></name><name><surname>Tanveer</surname><given-names>M</given-names></name><name><surname>Suganthan</surname><given-names>PN</given-names></name></person-group>. <article-title>Ensemble deep learning: a review</article-title>. <source>Eng Appl Artif Intell</source>. (<year>2022</year>) <volume>115</volume>:<fpage>105151</fpage>. <pub-id pub-id-type="doi">10.1016/j.engappai.2022.105151</pub-id></citation></ref>
<ref id="B22"><label>22.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Abd Elaziz</surname><given-names>M</given-names></name><name><surname>Dahou</surname><given-names>A</given-names></name><name><surname>Mabrouk</surname><given-names>A</given-names></name><name><surname>El-Sappagh</surname><given-names>S</given-names></name><name><surname>Aseeri</surname><given-names>AO</given-names></name></person-group>. <article-title>An efficient artificial rabbits optimization based on mutation strategy for skin cancer prediction</article-title>. <source>Comput Biol Med</source>. (<year>2023</year>) <volume>163</volume>:<fpage>107154</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107154</pub-id><pub-id pub-id-type="pmid">37364532</pub-id></citation></ref>
<ref id="B23"><label>23.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Uddin</surname><given-names>MZ</given-names></name><name><surname>Ngo</surname><given-names>TT</given-names></name><name><surname>Makihara</surname><given-names>Y</given-names></name><name><surname>Takemura</surname><given-names>N</given-names></name><name><surname>Li</surname><given-names>X</given-names></name><name><surname>Muramatsu</surname><given-names>D</given-names></name><etal/></person-group>. <article-title>The OU-ISIR large population gait database with real-life carried object and its performance evaluation</article-title>. <source>IPSJ Trans Comput Vis Appl</source>. (<year>2018</year>) <volume>10</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>11</lpage>.</citation></ref>
<ref id="B24"><label>24.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Uddin</surname><given-names>MZ</given-names></name><name><surname>Muramatsu</surname><given-names>D</given-names></name><name><surname>Takemura</surname><given-names>N</given-names></name><name><surname>Ahad</surname><given-names>MAR</given-names></name><name><surname>Yagi</surname><given-names>Y</given-names></name></person-group>. <article-title>Spatio-temporal silhouette sequence reconstruction for gait recognition against occlusion</article-title>. <source>IPSJ Trans Comput Vis Appl</source>. (<year>2019</year>) <volume>11</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>18</lpage>.</citation></ref>
<ref id="B25"><label>25.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shanthi</surname><given-names>T</given-names></name><name><surname>Sabeenian</surname><given-names>RS</given-names></name><name><surname>Anand</surname><given-names>R</given-names></name></person-group>. <article-title>Automatic diagnosis of skin diseases using convolution neural network</article-title>. <source>Microprocess Microsyst</source>. (<year>2020</year>) <volume>76</volume>:<fpage>103074</fpage>. <pub-id pub-id-type="doi">10.1016/j.micpro.2020.103074</pub-id></citation></ref>
<ref id="B26"><label>26.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Anand</surname><given-names>V</given-names></name><name><surname>Gupta</surname><given-names>S</given-names></name><name><surname>Koundal</surname><given-names>D</given-names></name><name><surname>Mahajan</surname><given-names>S</given-names></name><name><surname>Pandit</surname><given-names>AK</given-names></name><name><surname>Zaguia</surname><given-names>A</given-names></name></person-group>. <article-title>Deep learning based automated diagnosis of skin diseases using dermoscopy</article-title>. <source>Comput Mater Contin</source>. (<year>2022</year>) <volume>71</volume>(<issue>2</issue>):<fpage>3145</fpage>&#x2013;<lpage>60</lpage>.</citation></ref>
<ref id="B27"><label>27.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Muhaba</surname><given-names>KA</given-names></name><name><surname>Dese</surname><given-names>K</given-names></name><name><surname>Aga</surname><given-names>TM</given-names></name><name><surname>Zewdu</surname><given-names>FT</given-names></name><name><surname>Simegn</surname><given-names>GL</given-names></name></person-group>. <article-title>Automatic skin disease diagnosis using deep learning from clinical image and patient information</article-title>. <source>Skin Health Dis</source>. (<year>2022</year>) <volume>2</volume>(<issue>1</issue>):<fpage>e81</fpage>. <pub-id pub-id-type="doi">10.1002/ski2.81</pub-id><pub-id pub-id-type="pmid">35665205</pub-id></citation></ref>
<ref id="B28"><label>28.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kalpana</surname><given-names>B</given-names></name><name><surname>Reshmy</surname><given-names>AK</given-names></name><name><surname>Senthil Pandi</surname><given-names>S</given-names></name><name><surname>Dhanasekaran</surname><given-names>S</given-names></name></person-group>. <article-title>OESV-KRF: optimal ensemble support vector kernel random forest based early detection and classification of skin diseases</article-title>. <source>Biomed Signal Process Control</source>. (<year>2023</year>) <volume>85</volume>:<fpage>104779</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2023.104779</pub-id></citation></ref>
<ref id="B29"><label>29.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Hameed</surname><given-names>N</given-names></name><name><surname>Shabut</surname><given-names>AM</given-names></name><name><surname>Hossain</surname><given-names>MA</given-names></name></person-group>. <article-title>Multi-class skin diseases classification using deep convolutional neural network and support vector machine</article-title>. <comment>In: <italic>2018 12th International Conference on Software, Knowledge, Information Management &#x0026; Applications (SKIMA)</italic>. IEEE (2018). p. 1&#x2013;7</comment>.</citation></ref>
<ref id="B30"><label>30.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kousis</surname><given-names>I</given-names></name><name><surname>Perikos</surname><given-names>I</given-names></name><name><surname>Hatzilygeroudis</surname><given-names>I</given-names></name><name><surname>Virvou</surname><given-names>M</given-names></name></person-group>. <article-title>Deep learning methods for accurate skin cancer recognition and mobile application</article-title>. <source>Electronics</source>. (<year>2022</year>) <volume>11</volume>(<issue>9</issue>):<fpage>1294</fpage>. <pub-id pub-id-type="doi">10.3390/electronics11091294</pub-id></citation></ref>
<ref id="B31"><label>31.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mondal</surname><given-names>A</given-names></name><name><surname>Shrivastava</surname><given-names>VK</given-names></name></person-group>. <article-title>A fine tuning approach using modified DenseNet model for skin cancer classification</article-title>. <source>Int J Med Eng Inform</source>. (<year>2023</year>) <volume>15</volume>(<issue>4</issue>):<fpage>323</fpage>&#x2013;<lpage>35</lpage>.</citation></ref>
<ref id="B32"><label>32.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Anand</surname><given-names>V</given-names></name><name><surname>Gupta</surname><given-names>S</given-names></name><name><surname>Nayak</surname><given-names>SR</given-names></name><name><surname>Koundal</surname><given-names>D</given-names></name><name><surname>Prakash</surname><given-names>D</given-names></name><name><surname>Verma</surname><given-names>KD</given-names></name></person-group>. <article-title>An automated deep learning models for classification of skin disease using dermoscopy images: a comprehensive study</article-title>. <source>Multimed Tools Appl</source>. (<year>2022</year>) <volume>81</volume>(<issue>26</issue>):<fpage>37379</fpage>&#x2013;<lpage>401</lpage>. <pub-id pub-id-type="doi">10.1007/s11042-021-11628-y</pub-id></citation></ref>
<ref id="B33"><label>33.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vaichole</surname><given-names>TS</given-names></name><name><surname>Kulkarni</surname><given-names>SK</given-names></name><name><surname>Yadav</surname><given-names>O</given-names></name><name><surname>Khan</surname><given-names>F</given-names></name></person-group>. <article-title>Eff2Net: an efficient channel attention-based convolutional neural network for skin disease classification</article-title>. <source>Biomed Signal Process Control</source>. (<year>2022</year>) <volume>73</volume>:<fpage>103406</fpage>.</citation></ref>
<ref id="B34"><label>34.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shan</surname><given-names>P</given-names></name><name><surname>Chen</surname><given-names>J</given-names></name><name><surname>Fu</surname><given-names>C</given-names></name><name><surname>Cao</surname><given-names>L</given-names></name><name><surname>Tie</surname><given-names>M</given-names></name><name><surname>Sham</surname><given-names>C-W</given-names></name></person-group>. <article-title>Automatic skin lesion classification using a novel densely connected convolutional network integrated with an attention module</article-title>. <source>J Ambient Intell Humaniz Comput</source>. (<year>2023</year>) <volume>14</volume>(<issue>7</issue>):<fpage>8943</fpage>&#x2013;<lpage>56</lpage>. <pub-id pub-id-type="doi">10.1007/s12652-022-04400-z</pub-id></citation></ref>
<ref id="B35"><label>35.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Raghavendra</surname><given-names>PVSP</given-names></name><name><surname>Charitha</surname><given-names>C</given-names></name><name><surname>Begum</surname><given-names>KG</given-names></name><name><surname>Prasath</surname><given-names>VBS</given-names></name></person-group>. <article-title>Deep learning&#x2013;based skin lesion multi-class classification with global average pooling improvement</article-title>. <source>J Digit Imaging</source>. (<year>2023</year>) <volume>36</volume>:<fpage>1</fpage>&#x2013;<lpage>22</lpage>.<pub-id pub-id-type="pmid">36316619</pub-id></citation></ref>
<ref id="B36"><label>36.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Srinivasu</surname><given-names>PN</given-names></name><name><surname>SivaSai</surname><given-names>JG</given-names></name><name><surname>Ijaz</surname><given-names>MF</given-names></name><name><surname>Bhoi</surname><given-names>AK</given-names></name><name><surname>Kim</surname><given-names>W</given-names></name><name><surname>Kang</surname><given-names>JJ</given-names></name></person-group>. <article-title>Classification of skin disease using deep learning neural networks with MobileNet V2 and LSTM</article-title>. <source>Sensors</source>. (<year>2021</year>) <volume>21</volume>(<issue>8</issue>):<fpage>2852</fpage>. <pub-id pub-id-type="doi">10.3390/s21082852</pub-id><pub-id pub-id-type="pmid">33919583</pub-id></citation></ref>
<ref id="B37"><label>37.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yu</surname><given-names>Z</given-names></name><name><surname>Jiang</surname><given-names>X</given-names></name><name><surname>Zhou</surname><given-names>F</given-names></name><name><surname>Qin</surname><given-names>J</given-names></name><name><surname>Ni</surname><given-names>D</given-names></name><name><surname>Chen</surname><given-names>S</given-names></name><etal/></person-group>. <article-title>Melanoma recognition in dermoscopy images via aggregated deep convolutional features</article-title>. <source>IEEE Trans Biomed Eng</source>. (<year>2018</year>) <volume>66</volume>(<issue>4</issue>):<fpage>1006</fpage>&#x2013;<lpage>16</lpage>. <pub-id pub-id-type="doi">10.1109/TBME.2018.2866166</pub-id><pub-id pub-id-type="pmid">30130171</pub-id></citation></ref>
<ref id="B38"><label>38.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Seeja</surname><given-names>RD</given-names></name><name><surname>Suresh</surname><given-names>A</given-names></name></person-group>. <article-title>Deep learning based skin lesion segmentation and classification of melanoma using support vector machine (SVM)</article-title>. <source>Asian Pac J Cancer Prev</source>. (<year>2019</year>) <volume>20</volume>(<issue>5</issue>):<fpage>1555</fpage>.<pub-id pub-id-type="pmid">31128062</pub-id></citation></ref>
<ref id="B39"><label>39.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bandyopadhyay</surname><given-names>SK</given-names></name><name><surname>Bose</surname><given-names>P</given-names></name><name><surname>Bhaumik</surname><given-names>A</given-names></name><name><surname>Poddar</surname><given-names>S</given-names></name></person-group>. <article-title>Machine learning and deep learning integration for skin diseases prediction</article-title>. <source>Int J Eng Trends Technol</source>. (<year>2022</year>) <volume>70</volume>(<issue>2</issue>):<fpage>11</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.14445/22315381/IJETT-V70I2P202</pub-id></citation></ref>
<ref id="B40"><label>40.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Min Son</surname><given-names>H</given-names></name><name><surname>Jeon</surname><given-names>W</given-names></name><name><surname>Kim</surname><given-names>J</given-names></name><name><surname>Yeong Heo</surname><given-names>C</given-names></name><name><surname>Jin Yoon</surname><given-names>H</given-names></name><name><surname>Park</surname><given-names>J-U</given-names></name><etal/></person-group>. <article-title>AI-based localization and classification of skin disease with erythema</article-title>. <source>Sci Rep</source>. (<year>2021</year>) <volume>11</volume>(<issue>1</issue>):<fpage>5350</fpage>.<pub-id pub-id-type="pmid">33674636</pub-id></citation></ref>
<ref id="B41"><label>41.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Adla</surname><given-names>D</given-names></name><name><surname>Reddy</surname><given-names>GVR</given-names></name><name><surname>Nayak</surname><given-names>P</given-names></name><name><surname>Karuna</surname><given-names>G</given-names></name></person-group>. <article-title>Deep learning-based computer aided diagnosis model for skin cancer detection and classification</article-title>. <source>Distrib Parallel Database</source>. (<year>2022</year>) <volume>40</volume>(<issue>4</issue>):<fpage>717</fpage>&#x2013;<lpage>36</lpage>. <pub-id pub-id-type="doi">10.1007/s10619-021-07360-z</pub-id></citation></ref>
<ref id="B42"><label>42.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhu</surname><given-names>A-Q</given-names></name><name><surname>Wang</surname><given-names>Q</given-names></name><name><surname>Shi</surname><given-names>Y-L</given-names></name><name><surname>Ren</surname><given-names>W-W</given-names></name><name><surname>Cao</surname><given-names>X</given-names></name><name><surname>Ren</surname><given-names>T-T</given-names></name><etal/></person-group>. <article-title>A deep learning fusion network trained with clinical and high-frequency ultrasound images in the multi-classification of skin diseases in comparison with dermatologists: a prospective and multicenter study</article-title>. <source>eClinicalMedicine</source>. (<year>2024</year>) <volume>67</volume>:<fpage>102391</fpage>. <pub-id pub-id-type="doi">10.1016/j.eclinm.2023.102391</pub-id><pub-id pub-id-type="pmid">38274117</pub-id></citation></ref>
<ref id="B43"><label>43.</label><citation citation-type="book"><person-group person-group-type="author"><name><surname>Goodfellow</surname><given-names>I</given-names></name><name><surname>Bengio</surname><given-names>Y</given-names></name><name><surname>Courville</surname><given-names>A</given-names></name></person-group>. <source>Deep Learning</source>. <publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>MIT Press</publisher-name> (<year>2016</year>).</citation></ref>
<ref id="B44"><label>44.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Dosovitskiy</surname><given-names>A</given-names></name><name><surname>Beyer</surname><given-names>L</given-names></name><name><surname>Kolesnikov</surname><given-names>A</given-names></name><name><surname>Weissenborn</surname><given-names>D</given-names></name><name><surname>Zhai</surname><given-names>X</given-names></name><name><surname>Unterthiner</surname><given-names>T</given-names></name><etal/></person-group>. <article-title>An image is worth <inline-formula><mml:math xmlns:mml="http://www.w3.org/1998/Math/MathML" id="IM68"><mml:mn>16</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mn>16</mml:mn></mml:math></inline-formula> words: transformers for image recognition at scale</article-title>. <comment><italic>arXiv</italic> [Preprint]. <italic>arXiv:2010.11929</italic> (2020)</comment>.</citation></ref>
<ref id="B45"><label>45.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Dai</surname><given-names>W</given-names></name><name><surname>Wu</surname><given-names>Z</given-names></name><name><surname>Liu</surname><given-names>R</given-names></name><name><surname>Zhou</surname><given-names>J</given-names></name><name><surname>Wang</surname><given-names>M</given-names></name><name><surname>Wu</surname><given-names>T</given-names></name><etal/></person-group>. <article-title>Sosegformer: a cross-scale feature correlated network for small medical object segmentation</article-title>. <comment>In: <italic>2024 IEEE International Symposium on Biomedical Imaging (ISBI)</italic>. IEEE (2024). p. 1&#x2013;4</comment>.</citation></ref>
<ref id="B46"><label>46.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dai</surname><given-names>W</given-names></name><name><surname>Wu</surname><given-names>Z</given-names></name><name><surname>Liu</surname><given-names>R</given-names></name><name><surname>Wu</surname><given-names>T</given-names></name><name><surname>Wang</surname><given-names>M</given-names></name><name><surname>Zhou</surname><given-names>J</given-names></name><etal/></person-group>. <article-title>Automated non-invasive analysis of motile sperms using sperm feature-correlated network</article-title>. <source>IEEE Trans Autom Sci Eng</source>. (<year>2024</year>) <volume>22</volume>:<fpage>3960</fpage>&#x2013;<lpage>70</lpage>. <pub-id pub-id-type="doi">10.1109/TASE.2024.3404488</pub-id></citation></ref>
<ref id="B47"><label>47.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xin</surname><given-names>C</given-names></name><name><surname>Liu</surname><given-names>Z</given-names></name><name><surname>Zhao</surname><given-names>K</given-names></name><name><surname>Miao</surname><given-names>L</given-names></name><name><surname>Ma</surname><given-names>Y</given-names></name><name><surname>Zhu</surname><given-names>X</given-names></name><etal/></person-group>. <article-title>An improved transformer network for skin cancer classification</article-title>. <source>Comput Biol Med</source>. (<year>2022</year>) <volume>149</volume>:<fpage>105939</fpage>. <pub-id pub-id-type="doi">10.1016/j.compbiomed.2022.105939</pub-id><pub-id pub-id-type="pmid">36037629</pub-id></citation></ref>
<ref id="B48"><label>48.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Dai</surname><given-names>W</given-names></name><name><surname>Liu</surname><given-names>R</given-names></name><name><surname>Wu</surname><given-names>T</given-names></name><name><surname>Wang</surname><given-names>M</given-names></name><name><surname>Yin</surname><given-names>J</given-names></name><name><surname>Liu</surname><given-names>J</given-names></name></person-group>. <article-title>Deeply supervised skin lesions diagnosis with stage and branch attention</article-title>. <source>IEEE J Biomed Health Inform</source>. (<year>2023</year>) <volume>28</volume>(<issue>2</issue>):<fpage>719</fpage>&#x2013;<lpage>29</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2023.3308697</pub-id></citation></ref>
<ref id="B49"><label>49.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Nie</surname><given-names>Y</given-names></name><name><surname>Sommella</surname><given-names>P</given-names></name><name><surname>Carrat&#x00F9;</surname><given-names>M</given-names></name><name><surname>O&#x2019;Nils</surname><given-names>M</given-names></name><name><surname>Lundgren</surname><given-names>J</given-names></name></person-group>. <article-title>A deep CNN transformer hybrid model for skin lesion classification of dermoscopic images using focal loss</article-title>. <source>Diagnostics</source>. (<year>2023</year>) <volume>13</volume>(<issue>1</issue>):<fpage>72</fpage>. <pub-id pub-id-type="doi">10.3390/diagnostics13010072</pub-id></citation></ref>
<ref id="B50"><label>50.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>G</given-names></name><name><surname>Yan</surname><given-names>P</given-names></name><name><surname>Tang</surname><given-names>Q</given-names></name><name><surname>Yang</surname><given-names>L</given-names></name><name><surname>Chen</surname><given-names>J</given-names></name></person-group>. <article-title>Multiscale feature fusion for skin lesion classification</article-title>. <source>Biomed Res Int</source>. (<year>2023</year>) <volume>2023</volume>:<fpage>1</fpage>&#x2013;<lpage>15</lpage>.</citation></ref>
<ref id="B51"><label>51.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gairola</surname><given-names>AK</given-names></name><name><surname>Kumar</surname><given-names>V</given-names></name><name><surname>Kumar Sahoo</surname><given-names>A</given-names></name><name><surname>Diwakar</surname><given-names>M</given-names></name><name><surname>Singh</surname><given-names>P</given-names></name><name><surname>Garg</surname><given-names>D</given-names></name></person-group>. <article-title>Multi-feature fusion deep network for skin disease diagnosis</article-title>. <source>Multimed Tools Appl</source>. (<year>2025</year>) <volume>84</volume>(<issue>1</issue>):<fpage>419</fpage>&#x2013;<lpage>44</lpage>. <pub-id pub-id-type="doi">10.1007/s11042-024-18958-7</pub-id></citation></ref>
<ref id="B52"><label>52.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Elashiri</surname><given-names>MA</given-names></name><name><surname>Rajesh</surname><given-names>A</given-names></name><name><surname>Nath Pandey</surname><given-names>S</given-names></name><name><surname>Shukla</surname><given-names>SK</given-names></name><name><surname>Urooj</surname><given-names>S</given-names></name><name><surname>Lay-Ekuakille</surname><given-names>A</given-names></name></person-group>. <article-title>Ensemble of weighted deep concatenated features for the skin disease classification model using modified long short term memory</article-title>. <source>Biomed Signal Process Control</source>. (<year>2022</year>) <volume>76</volume>:<fpage>103729</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2022.103729</pub-id></citation></ref>
<ref id="B53"><label>53.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Afza</surname><given-names>F</given-names></name><name><surname>Sharif</surname><given-names>M</given-names></name><name><surname>Khan</surname><given-names>MA</given-names></name><name><surname>Tariq</surname><given-names>U</given-names></name><name><surname>Yong</surname><given-names>H-S</given-names></name><name><surname>Cha</surname><given-names>J</given-names></name></person-group>. <article-title>Multiclass skin lesion classification using hybrid deep features selection and extreme learning machine</article-title>. <source>Sensors</source>. (<year>2022</year>) <volume>22</volume>(<issue>3</issue>):<fpage>799</fpage>. <pub-id pub-id-type="doi">10.3390/s22030799</pub-id><pub-id pub-id-type="pmid">35161553</pub-id></citation></ref>
<ref id="B54"><label>54.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ding</surname><given-names>J</given-names></name><name><surname>Song</surname><given-names>J</given-names></name><name><surname>Li</surname><given-names>J</given-names></name><name><surname>Tang</surname><given-names>J</given-names></name><name><surname>Guo</surname><given-names>F</given-names></name></person-group>. <article-title>Two-stage deep neural network via ensemble learning for melanoma classification</article-title>. <source>Front Bioeng Biotechnol</source>. (<year>2022</year>) <volume>9</volume>:<fpage>758495</fpage>. <pub-id pub-id-type="doi">10.3389/fbioe.2021.758495</pub-id><pub-id pub-id-type="pmid">35118054</pub-id></citation></ref>
<ref id="B55"><label>55.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Harangi</surname><given-names>B</given-names></name></person-group>. <article-title>Skin lesion classification with ensembles of deep convolutional neural networks</article-title>. <source>J Biomed Inform</source>. (<year>2018</year>) <volume>86</volume>:<fpage>25</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1016/j.jbi.2018.08.006</pub-id><pub-id pub-id-type="pmid">30103029</pub-id></citation></ref>
<ref id="B56"><label>56.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Huang</surname><given-names>G</given-names></name><name><surname>Liu</surname><given-names>Z</given-names></name><name><surname>Van Der Maaten</surname><given-names>L</given-names></name><name><surname>Weinberger</surname><given-names>KQ</given-names></name></person-group>. <article-title>Densely connected convolutional networks</article-title>. <comment>In: <italic>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</italic>. (2017). p. 4700&#x2013;8</comment>.</citation></ref>
<ref id="B57"><label>57.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Simonyan</surname><given-names>K</given-names></name><name><surname>Zisserman</surname><given-names>A</given-names></name></person-group>. <article-title>Very deep convolutional networks for large-scale image recognition</article-title>. <comment><italic>arXiv</italic> [Preprint]. <italic>arXiv:1409.1556</italic> (2014)</comment>.</citation></ref>
<ref id="B58"><label>58.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>Y</given-names></name><name><surname>Hao</surname><given-names>P</given-names></name><name><surname>Zhang</surname><given-names>P</given-names></name><name><surname>Xu</surname><given-names>X</given-names></name><name><surname>Wu</surname><given-names>J</given-names></name><name><surname>Chen</surname><given-names>W</given-names></name></person-group>. <article-title>Dense convolutional binary-tree networks for lung nodule classification</article-title>. <source>IEEE Access</source>. (<year>2018</year>) <volume>6</volume>:<fpage>49080</fpage>&#x2013;<lpage>8</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2018.2865544</pub-id></citation></ref>
<ref id="B59"><label>59.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Sanghvi</surname><given-names>HA</given-names></name><name><surname>Patel</surname><given-names>RH</given-names></name><name><surname>Agarwal</surname><given-names>A</given-names></name><name><surname>Gupta</surname><given-names>S</given-names></name><name><surname>Sawhney</surname><given-names>V</given-names></name><name><surname>Pandya</surname><given-names>AS</given-names></name></person-group>. <article-title>A deep learning approach for classification of COVID and pneumonia using DenseNet-201</article-title>. <source>Int J Imaging Syst Technol</source>. (<year>2023</year>) <volume>33</volume>(<issue>1</issue>):<fpage>18</fpage>&#x2013;<lpage>38</lpage>. <pub-id pub-id-type="doi">10.1002/ima.22812</pub-id></citation></ref>
<ref id="B60"><label>60.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Vaswani</surname><given-names>A</given-names></name><name><surname>Shazeer</surname><given-names>N</given-names></name><name><surname>Parmar</surname><given-names>N</given-names></name><name><surname>Uszkoreit</surname><given-names>J</given-names></name><name><surname>Jones</surname><given-names>L</given-names></name><name><surname>Gomez</surname><given-names>AN</given-names></name><etal/></person-group>. <article-title>Attention is all you need</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2017</year>) <volume>30</volume>:<fpage>6000</fpage>&#x2013;<lpage>10</lpage>.</citation></ref>
<ref id="B61"><label>61.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Mendon&#x00E7;a</surname><given-names>T</given-names></name><name><surname>Ferreira</surname><given-names>PM</given-names></name><name><surname>Marques</surname><given-names>JS</given-names></name><name><surname>Marcal</surname><given-names>ARS</given-names></name><name><surname>Rozeira</surname><given-names>J</given-names></name></person-group>. <article-title>PH<sup>2</sup> - a dermoscopic image database for research and benchmarking</article-title>. <comment>In: <italic>2013 35th Annual International Conference of the IEEE Engineering in Medicine and Biology Society (EMBC)</italic>. IEEE (2013). p. 5437&#x2013;40</comment>.</citation></ref>
<ref id="B62"><label>62.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tschandl</surname><given-names>P</given-names></name><name><surname>Rosendahl</surname><given-names>C</given-names></name><name><surname>Kittler</surname><given-names>H</given-names></name></person-group>. <article-title>The HAM10000 dataset, a large collection of multi-source dermatoscopic images of common pigmented skin lesions</article-title>. <source>Sci Data</source>. (<year>2018</year>) <volume>5</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>9</lpage>. <pub-id pub-id-type="doi">10.1038/sdata.2018.161</pub-id><pub-id pub-id-type="pmid">30482902</pub-id></citation></ref>
<ref id="B63"><label>63.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Xu</surname><given-names>M</given-names></name><name><surname>Yoon</surname><given-names>S</given-names></name><name><surname>Fuentes</surname><given-names>A</given-names></name><name><surname>Park</surname><given-names>DS</given-names></name></person-group>. <article-title>A comprehensive survey of image augmentation techniques for deep learning</article-title>. <source>Pattern Recognit</source>. (<year>2023</year>) <volume>137</volume>:<fpage>109347</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2023.109347</pub-id></citation></ref>
<ref id="B64"><label>64.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shorten</surname><given-names>C</given-names></name><name><surname>Khoshgoftaar</surname><given-names>TM</given-names></name></person-group>. <article-title>A survey on image data augmentation for deep learning</article-title>. <source>J Big Data</source>. (<year>2019</year>) <volume>6</volume>(<issue>1</issue>):<fpage>1</fpage>&#x2013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1186/s40537-019-0197-0</pub-id></citation></ref>
<ref id="B65"><label>65.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Karras</surname><given-names>T</given-names></name><name><surname>Aittala</surname><given-names>M</given-names></name><name><surname>Hellsten</surname><given-names>J</given-names></name><name><surname>Laine</surname><given-names>S</given-names></name><name><surname>Lehtinen</surname><given-names>J</given-names></name><name><surname>Aila</surname><given-names>T</given-names></name></person-group>. <article-title>Training generative adversarial networks with limited data</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2020</year>) <volume>33</volume>:<fpage>12104</fpage>&#x2013;<lpage>14</lpage>.</citation></ref>
<ref id="B66"><label>66.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Heusel</surname><given-names>M</given-names></name><name><surname>Ramsauer</surname><given-names>H</given-names></name><name><surname>Unterthiner</surname><given-names>T</given-names></name><name><surname>Nessler</surname><given-names>B</given-names></name><name><surname>Hochreiter</surname><given-names>S</given-names></name></person-group>. <article-title>GANs trained by a two time-scale update rule converge to a local Nash equilibrium</article-title>. <source>Adv Neural Inf Process Syst</source>. (<year>2017</year>) <volume>30</volume>:<fpage>6629</fpage>&#x2013;<lpage>40</lpage>.</citation></ref>
<ref id="B67"><label>67.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Uddin</surname><given-names>MZ</given-names></name><name><surname>Shahriar</surname><given-names>MA</given-names></name><name><surname>Mahamood</surname><given-names>MN</given-names></name><name><surname>Alnajjar</surname><given-names>F</given-names></name><name><surname>Pramanik</surname><given-names>MI</given-names></name><name><surname>Ahad</surname><given-names>MAR</given-names></name></person-group>. <article-title>Deep learning with image-based autism spectrum disorder analysis: a systematic review</article-title>. <source>Eng Appl Artif Intell</source>. (<year>2024</year>) <volume>127</volume>:<fpage>107185</fpage>. <pub-id pub-id-type="doi">10.1016/j.engappai.2023.107185</pub-id></citation></ref>
<ref id="B68"><label>68.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Benyahia</surname><given-names>S</given-names></name><name><surname>Meftah</surname><given-names>B</given-names></name><name><surname>L&#x00E9;zoray</surname><given-names>O</given-names></name></person-group>. <article-title>Multi-features extraction based on deep learning for skin lesion classification</article-title>. <source>Tissue Cell</source>. (<year>2022</year>) <volume>74</volume>:<fpage>101701</fpage>. <pub-id pub-id-type="doi">10.1016/j.tice.2021.101701</pub-id><pub-id pub-id-type="pmid">34861582</pub-id></citation></ref>
<ref id="B69"><label>69.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Maniraj</surname><given-names>SP</given-names></name><name><surname>Sardar Maran</surname><given-names>P</given-names></name></person-group>. <article-title>A hybrid deep learning approach for skin cancer diagnosis using subband fusion of 3D wavelets</article-title>. <source>J Supercomput</source>. (<year>2022</year>) <volume>78</volume>(<issue>10</issue>):<fpage>12394</fpage>&#x2013;<lpage>409</lpage>. <pub-id pub-id-type="doi">10.1007/s11227-022-04371-0</pub-id></citation></ref>
<ref id="B70"><label>70.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Afza</surname><given-names>F</given-names></name><name><surname>Sharif</surname><given-names>M</given-names></name><name><surname>Mittal</surname><given-names>M</given-names></name><name><surname>Khan</surname><given-names>MA</given-names></name><name><surname>Hemanth</surname><given-names>DJ</given-names></name></person-group>. <article-title>A hierarchical three-step superpixels and deep learning framework for skin lesion classification</article-title>. <source>Methods</source>. (<year>2022</year>) <volume>202</volume>:<fpage>88</fpage>&#x2013;<lpage>102</lpage>. <pub-id pub-id-type="doi">10.1016/j.ymeth.2021.02.013</pub-id><pub-id pub-id-type="pmid">33610692</pub-id></citation></ref>
<ref id="B71"><label>71.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Reddy</surname><given-names>DA</given-names></name><name><surname>Roy</surname><given-names>S</given-names></name><name><surname>Kumar</surname><given-names>S</given-names></name><name><surname>Tripathi</surname><given-names>R</given-names></name></person-group>. <article-title>A scheme for effective skin disease detection using optimized region growing segmentation and autoencoder based classification</article-title>. <source>Procedia Comput Sci</source>. (<year>2023</year>) <volume>218</volume>:<fpage>274</fpage>&#x2013;<lpage>82</lpage>. <pub-id pub-id-type="doi">10.1016/j.procs.2023.01.009</pub-id></citation></ref>
<ref id="B72"><label>72.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Maqsood</surname><given-names>S</given-names></name><name><surname>Dama&#x0161;evi&#x010D;ius</surname><given-names>R</given-names></name></person-group>. <article-title>Multiclass skin lesion localization and classification using deep learning based features fusion and selection framework for smart healthcare</article-title>. <source>Neural Netw</source>. (<year>2023</year>) <volume>160</volume>:<fpage>238</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1016/j.neunet.2023.01.022</pub-id><pub-id pub-id-type="pmid">36701878</pub-id></citation></ref>
<ref id="B73"><label>73.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mustafa</surname><given-names>S</given-names></name><name><surname>Jaffar</surname><given-names>A</given-names></name><name><surname>Rashid</surname><given-names>M</given-names></name><name><surname>Akram</surname><given-names>S</given-names></name><name><surname>Masood Bhatti</surname><given-names>S</given-names></name></person-group>. <article-title>Deep learning-based skin lesion analysis using hybrid ResUNet++ and modified AlexNet-Random Forest for enhanced segmentation and classification</article-title>. <source>PLoS One</source>. (<year>2025</year>) <volume>20</volume>(<issue>1</issue>):<fpage>e0315120</fpage>. <pub-id pub-id-type="pmid">39820868</pub-id></citation></ref>
<ref id="B74"><label>74.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Liu</surname><given-names>Q</given-names></name><name><surname>Yu</surname><given-names>L</given-names></name><name><surname>Luo</surname><given-names>L</given-names></name><name><surname>Dou</surname><given-names>Q</given-names></name><name><surname>Heng</surname><given-names>PA</given-names></name></person-group>. <article-title>Semi-supervised medical image classification with relation-driven self-ensembling model</article-title>. <source>IEEE Trans Med Imaging</source>. (<year>2020</year>) <volume>39</volume>(<issue>11</issue>):<fpage>3429</fpage>&#x2013;<lpage>40</lpage>. <pub-id pub-id-type="doi">10.1109/TMI.2020.2995518</pub-id><pub-id pub-id-type="pmid">32746096</pub-id></citation></ref>
<ref id="B75"><label>75.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Al-Masni</surname><given-names>MA</given-names></name><name><surname>Kim</surname><given-names>D-H</given-names></name><name><surname>Kim</surname><given-names>T-S</given-names></name></person-group>. <article-title>Multiple skin lesions diagnostics via integrated deep convolutional networks for segmentation and classification</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2020</year>) <volume>190</volume>:<fpage>105351</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2020.105351</pub-id><pub-id pub-id-type="pmid">32028084</pub-id></citation></ref>
<ref id="B76"><label>76.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Cai</surname><given-names>G</given-names></name><name><surname>Zhu</surname><given-names>Y</given-names></name><name><surname>Wu</surname><given-names>Y</given-names></name><name><surname>Jiang</surname><given-names>X</given-names></name><name><surname>Ye</surname><given-names>J</given-names></name><name><surname>Yang</surname><given-names>D</given-names></name></person-group>. <article-title>A multimodal transformer to fuse images and metadata for skin disease classification</article-title>. <source>Vis Comput</source>. (<year>2022</year>) <volume>39</volume>:<fpage>1</fpage>&#x2013;<lpage>13</lpage>.</citation></ref>
<ref id="B77"><label>77.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ali</surname><given-names>K</given-names></name><name><surname>Shaikh</surname><given-names>ZA</given-names></name><name><surname>Khan</surname><given-names>AA</given-names></name><name><surname>Laghari</surname><given-names>AA</given-names></name></person-group>. <article-title>Multiclass skin cancer classification using EfficientNets &#x2013; a first step towards preventing skin cancer</article-title>. <source>Neurosci Inf</source>. (<year>2022</year>) <volume>2</volume>(<issue>4</issue>):<fpage>100034</fpage>.</citation></ref>
<ref id="B78"><label>78.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shetty</surname><given-names>B</given-names></name><name><surname>Fernandes</surname><given-names>R</given-names></name><name><surname>Rodrigues</surname><given-names>AP</given-names></name><name><surname>Chengoden</surname><given-names>R</given-names></name><name><surname>Bhattacharya</surname><given-names>S</given-names></name><name><surname>Lakshmanna</surname><given-names>K</given-names></name></person-group>. <article-title>Skin lesion classification of dermoscopic images using machine learning and convolutional neural network</article-title>. <source>Sci Rep</source>. (<year>2022</year>) <volume>12</volume>(<issue>1</issue>):<fpage>18134</fpage>. <pub-id pub-id-type="doi">10.1038/s41598-022-22644-9</pub-id><pub-id pub-id-type="pmid">36307467</pub-id></citation></ref>
<ref id="B79"><label>79.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wu</surname><given-names>Q</given-names></name><name><surname>Yu</surname><given-names>Y</given-names></name><name><surname>Zhang</surname><given-names>X</given-names></name></person-group>. <article-title>A skin cancer classification method based on discrete wavelet down-sampling feature reconstruction</article-title>. <source>Electronics</source>. (<year>2023</year>) <volume>12</volume>(<issue>9</issue>):<fpage>2103</fpage>. <pub-id pub-id-type="doi">10.3390/electronics12092103</pub-id></citation></ref>
<ref id="B80"><label>80.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tajerian</surname><given-names>A</given-names></name><name><surname>Kazemian</surname><given-names>M</given-names></name><name><surname>Tajerian</surname><given-names>M</given-names></name><name><surname>Akhavan Malayeri</surname><given-names>A</given-names></name></person-group>. <article-title>Design and validation of a new machine-learning-based diagnostic tool for the differentiation of dermatoscopic skin cancer images</article-title>. <source>PLoS One</source>. (<year>2023</year>) <volume>18</volume>(<issue>4</issue>):<fpage>e0284437</fpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0284437</pub-id><pub-id pub-id-type="pmid">37058446</pub-id></citation></ref>
<ref id="B81"><label>81.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>You</surname><given-names>H</given-names></name><name><surname>Yu</surname><given-names>L</given-names></name><name><surname>Tian</surname><given-names>S</given-names></name><name><surname>Cai</surname><given-names>W</given-names></name></person-group>. <article-title>A stereo spatial decoupling network for medical image classification</article-title>. <source>Complex Intell Syst</source>. (<year>2023</year>) <volume>9</volume>:<fpage>1</fpage>&#x2013;<lpage>10</lpage>.</citation></ref>
<ref id="B82"><label>82.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wei</surname><given-names>M</given-names></name><name><surname>Wu</surname><given-names>Q</given-names></name><name><surname>Ji</surname><given-names>H</given-names></name><name><surname>Wang</surname><given-names>J</given-names></name><name><surname>Lyu</surname><given-names>T</given-names></name><name><surname>Liu</surname><given-names>J</given-names></name><etal/></person-group>. <article-title>A skin disease classification model based on DenseNet and ConvNeXt fusion</article-title>. <source>Electronics</source>. (<year>2023</year>) <volume>12</volume>(<issue>2</issue>):<fpage>438</fpage>. <pub-id pub-id-type="doi">10.3390/electronics12020438</pub-id></citation></ref>
<ref id="B83"><label>83.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Pacal</surname><given-names>I</given-names></name><name><surname>Ozdemir</surname><given-names>B</given-names></name><name><surname>Zeynalov</surname><given-names>J</given-names></name><name><surname>Gasimov</surname><given-names>H</given-names></name><name><surname>Pacal</surname><given-names>N</given-names></name></person-group>. <article-title>A novel CNN-ViT-based deep learning model for early skin cancer diagnosis</article-title>. <source>Biomed Signal Process Control</source>. (<year>2025</year>) <volume>104</volume>:<fpage>107627</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2025.107627</pub-id></citation></ref>
<ref id="B84"><label>84.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Nozdryn-Plotnicki</surname><given-names>A</given-names></name><name><surname>Yap</surname><given-names>J</given-names></name><name><surname>Yolland</surname><given-names>W</given-names></name></person-group>. <article-title>Ensembling convolutional neural networks for skin cancer classification</article-title>. <comment>In: <italic>International Skin Imaging Collaboration (ISIC) Challenge on Skin Image Analysis for Melanoma Detection. MICCAI</italic>. (2018)</comment>.</citation></ref>
<ref id="B85"><label>85.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Gessert</surname><given-names>N</given-names></name><name><surname>Sentker</surname><given-names>T</given-names></name><name><surname>Madesta</surname><given-names>F</given-names></name><name><surname>Schmitz</surname><given-names>R</given-names></name><name><surname>Kniep</surname><given-names>H</given-names></name><name><surname>Baltruschat</surname><given-names>I</given-names></name><etal/></person-group>. <article-title>Skin lesion diagnosis using ensembles, unscaled multi-crop evaluation and loss weighting</article-title>. <comment><italic>arXiv</italic>. [Preprint]. <italic>arXiv:1808.01694</italic> (2018)</comment>.</citation></ref>
<ref id="B86"><label>86.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhuang</surname><given-names>J</given-names></name><name><surname>Li</surname><given-names>W</given-names></name><name><surname>Manivannan</surname><given-names>S</given-names></name><name><surname>Wang</surname><given-names>R</given-names></name><name><surname>Zhang</surname><given-names>JJG</given-names></name><name><surname>Pan</surname><given-names>J</given-names></name><etal/></person-group>. <article-title>Skin lesion analysis towards melanoma detection using deep neural network ensemble</article-title>. <source>ISIC Chall</source>. (<year>2018</year>) <volume>2018</volume>(<issue>2</issue>):<fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation></ref>
<ref id="B87"><label>87.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mahbod</surname><given-names>A</given-names></name><name><surname>Schaefer</surname><given-names>G</given-names></name><name><surname>Wang</surname><given-names>C</given-names></name><name><surname>Dorffner</surname><given-names>G</given-names></name><name><surname>Ecker</surname><given-names>R</given-names></name><name><surname>Ellinger</surname><given-names>I</given-names></name></person-group>. <article-title>Transfer learning using a multi-scale and multi-network ensemble for skin lesion classification</article-title>. <source>Comput Methods Programs Biomed</source>. (<year>2020</year>) <volume>193</volume>:<fpage>105475</fpage>. <pub-id pub-id-type="doi">10.1016/j.cmpb.2020.105475</pub-id><pub-id pub-id-type="pmid">32268255</pub-id></citation></ref>
<ref id="B88"><label>88.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Shen</surname><given-names>S</given-names></name><name><surname>Xu</surname><given-names>M</given-names></name><name><surname>Zhang</surname><given-names>F</given-names></name><name><surname>Shao</surname><given-names>P</given-names></name><name><surname>Liu</surname><given-names>H</given-names></name><name><surname>Xu</surname><given-names>L</given-names></name><etal/></person-group>. <article-title>A low-cost high-performance data augmentation for deep learning-based skin lesion classification</article-title>. <source>BME Front</source>. (<year>2022</year>) <volume>2022</volume>:<fpage>9765307</fpage>. <pub-id pub-id-type="doi">10.34133/2022/9765307</pub-id><pub-id pub-id-type="pmid">37850173</pub-id></citation></ref>
<ref id="B89"><label>89.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Barata</surname><given-names>C</given-names></name><name><surname>Rotemberg</surname><given-names>V</given-names></name><name><surname>Codella</surname><given-names>NCF</given-names></name><name><surname>Tschandl</surname><given-names>P</given-names></name><name><surname>Rinner</surname><given-names>C</given-names></name><name><surname>Nisa Akay</surname><given-names>B</given-names></name><etal/></person-group>. <article-title>A reinforcement learning model for AI-based decision support in skin cancer</article-title>. <source>Nat Med</source>. (<year>2023</year>) <volume>29</volume>:<fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="pmid">36694061</pub-id></citation></ref>
<ref id="B90"><label>90.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Tsai</surname><given-names>W-X</given-names></name><name><surname>Li</surname><given-names>Y-C</given-names></name><name><surname>Lin</surname><given-names>CH</given-names></name></person-group>. <article-title>Skin lesion classification based on multi-model ensemble with generated levels-of-detail images</article-title>. <source>Biomed Signal Process Control</source>. (<year>2023</year>) <volume>85</volume>:<fpage>105068</fpage>. <pub-id pub-id-type="doi">10.1016/j.bspc.2023.105068</pub-id></citation></ref>
<ref id="B91"><label>91.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kassem</surname><given-names>MA</given-names></name><name><surname>Hosny</surname><given-names>KM</given-names></name><name><surname>Fouad</surname><given-names>MM</given-names></name></person-group>. <article-title>Skin lesions classification into eight classes for ISIC 2019 using deep convolutional neural network and transfer learning</article-title>. <source>IEEE Access</source>. (<year>2020</year>) <volume>8</volume>:<fpage>114822</fpage>&#x2013;<lpage>32</lpage>. <pub-id pub-id-type="doi">10.1109/ACCESS.2020.3003890</pub-id></citation></ref>
<ref id="B92"><label>92.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Gessert</surname><given-names>N</given-names></name><name><surname>Nielsen</surname><given-names>M</given-names></name><name><surname>Shaikh</surname><given-names>M</given-names></name><name><surname>Werner</surname><given-names>R</given-names></name><name><surname>Schlaefer</surname><given-names>A</given-names></name></person-group>. <article-title>Skin lesion classification using ensembles of multi-resolution EfficientNets with meta data</article-title>. <source>MethodsX</source>. (<year>2020</year>) <volume>7</volume>:<fpage>100864</fpage>. <pub-id pub-id-type="doi">10.1016/j.mex.2020.100864</pub-id><pub-id pub-id-type="pmid">32292713</pub-id></citation></ref>
<ref id="B93"><label>93.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Bhardwaj</surname><given-names>A</given-names></name><name><surname>Rege</surname><given-names>PP</given-names></name></person-group>. <article-title>Skin lesion classification using deep learning</article-title>. <comment>In: <italic>Advances in Signal and Data Processing: Select Proceedings of ICSDP 2019</italic>. Springer (2021). p. 575&#x2013;89</comment>.</citation></ref>
<ref id="B94"><label>94.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Jain</surname><given-names>A</given-names></name><name><surname>Rao</surname><given-names>ACS</given-names></name><name><surname>Jain</surname><given-names>PK</given-names></name><name><surname>Abraham</surname><given-names>A</given-names></name></person-group>. <article-title>Multi-type skin diseases classification using OP-DNN based feature extraction approach</article-title>. <source>Multimed Tools Appl</source>. (<year>2022</year>) <volume>81</volume>:<fpage>1</fpage>&#x2013;<lpage>26</lpage>. <pub-id pub-id-type="pmid">35018131</pub-id></citation></ref>
<ref id="B95"><label>95.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname><given-names>Y</given-names></name><name><surname>Wang</surname><given-names>Y</given-names></name><name><surname>Cai</surname><given-names>J</given-names></name><name><surname>Lee</surname><given-names>TK</given-names></name><name><surname>Miao</surname><given-names>C</given-names></name><name><surname>Wang</surname><given-names>ZJ</given-names></name></person-group>. <article-title>SSD-KD: a self-supervised diverse knowledge distillation method for lightweight skin lesion classification using dermoscopic images</article-title>. <source>Med Image Anal</source>. (<year>2023</year>) <volume>84</volume>:<fpage>102693</fpage>. <pub-id pub-id-type="doi">10.1016/j.media.2022.102693</pub-id><pub-id pub-id-type="pmid">36462373</pub-id></citation></ref>
<ref id="B96"><label>96.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Abdelhafeez</surname><given-names>A</given-names></name><name><surname>Mohamed</surname><given-names>HK</given-names></name><name><surname>Maher</surname><given-names>A</given-names></name><name><surname>Khalil</surname><given-names>NA</given-names></name></person-group>. <article-title>A novel approach toward skin cancer classification through fused deep features and neutrosophic environment</article-title>. <source>Front Public Health</source>. (<year>2023</year>) <volume>11</volume>:<fpage>1123581</fpage>. <pub-id pub-id-type="doi">10.3389/fpubh.2023.1123581</pub-id><pub-id pub-id-type="pmid">37139387</pub-id></citation></ref>
<ref id="B97"><label>97.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mehmood</surname><given-names>A</given-names></name><name><surname>Gulzar</surname><given-names>Y</given-names></name><name><surname>Mudassar Ilyas</surname><given-names>Q</given-names></name><name><surname>Jabbari</surname><given-names>A</given-names></name><name><surname>Ahmad</surname><given-names>M</given-names></name><name><surname>Iqbal</surname><given-names>S</given-names></name></person-group>. <article-title>SBXception: a shallower and broader Xception architecture for efficient classification of skin lesions</article-title>. <source>Cancers</source>. (<year>2023</year>) <volume>15</volume>(<issue>14</issue>):<fpage>3604</fpage>. <pub-id pub-id-type="doi">10.3390/cancers15143604</pub-id><pub-id pub-id-type="pmid">37509267</pub-id></citation></ref>
<ref id="B98"><label>98.</label><citation citation-type="other"><person-group person-group-type="author"><name><surname>Selvaraju</surname><given-names>RR</given-names></name><name><surname>Cogswell</surname><given-names>M</given-names></name><name><surname>Das</surname><given-names>A</given-names></name><name><surname>Vedantam</surname><given-names>R</given-names></name><name><surname>Parikh</surname><given-names>D</given-names></name><name><surname>Batra</surname><given-names>D</given-names></name></person-group>. <article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title>. <comment>In: <italic>Proceedings of the IEEE International Conference on Computer Vision</italic>. (2017). p. 618&#x2013;26</comment>.</citation></ref>
<ref id="B99"><label>99.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Mohammed</surname><given-names>A</given-names></name><name><surname>Kora</surname><given-names>R</given-names></name></person-group>. <article-title>A comprehensive review on ensemble deep learning: opportunities and challenges</article-title>. <source>J King Saud Univ Comput Inf Sci</source>. (<year>2023</year>) <volume>35</volume>(<issue>2</issue>):<fpage>757</fpage>&#x2013;<lpage>74</lpage>. <pub-id pub-id-type="doi">10.1016/j.jksuci.2023.01.014</pub-id></citation></ref>
<ref id="B100"><label>100.</label><citation citation-type="journal"><person-group person-group-type="author"><name><surname>Kawahara</surname><given-names>J</given-names></name><name><surname>Daneshvar</surname><given-names>S</given-names></name><name><surname>Argenziano</surname><given-names>G</given-names></name><name><surname>Hamarneh</surname><given-names>G</given-names></name></person-group>. <article-title>Seven-point checklist and skin lesion classification using multitask multimodal neural nets</article-title>. <source>IEEE J Biomed Health Inf</source>. (<year>2018</year>) <volume>23</volume>(<issue>2</issue>):<fpage>538</fpage>&#x2013;<lpage>46</lpage>. <pub-id pub-id-type="doi">10.1109/JBHI.2018.2824327</pub-id></citation></ref></ref-list>
</back>
</article>