<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Phys.</journal-id>
<journal-title>Frontiers in Physics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Phys.</abbrev-journal-title>
<issn pub-type="epub">2296-424X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1193245</article-id>
<article-id pub-id-type="doi">10.3389/fphy.2023.1193245</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Physics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>An improved YOLOv5 for object detection in visible and thermal infrared images based on contrastive learning</article-title>
<alt-title alt-title-type="left-running-head">Tu et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fphy.2023.1193245">10.3389/fphy.2023.1193245</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Tu</surname>
<given-names>Xiaoguang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2255874/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Yuan</surname>
<given-names>Zihao</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2237793/overview"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Bokai</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Liu</surname>
<given-names>Jianhua</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hu</surname>
<given-names>Yan</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hua</surname>
<given-names>Houqiang</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Wei</surname>
<given-names>Lin</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Institute of Electronic and Electrical Engineering</institution>, <institution>Civil Aviation Flight University of China</institution>, <addr-line>Guanghan</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Computer Science</institution>, <institution>Sichuan University</institution>, <addr-line>Chengdu</addr-line>, <country>China</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>College of Aviation Engineering</institution>, <institution>Civil Aviation Flight University of China</institution>, <addr-line>Guanghan</addr-line>, <country>China</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>College of Flight Technology</institution>, <institution>Civil Aviation Flight University of China</institution>, <addr-line>Guanghan</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2068237/overview">Xiaoqiang Zhang</ext-link>, Beihang University, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2267667/overview">Yingtian Zou</ext-link>, National University of Singapore, Singapore</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2267626/overview">Zun Li</ext-link>, Beijing University of Technology, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Xiaoguang Tu, <email>xguangtu@outlook.com</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>11</day>
<month>05</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>11</volume>
<elocation-id>1193245</elocation-id>
<history>
<date date-type="received">
<day>24</day>
<month>03</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>21</day>
<month>04</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Tu, Yuan, Liu, Liu, Hu, Hua and Wei.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Tu, Yuan, Liu, Liu, Hu, Hua and Wei</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>An improved algorithm has been proposed to address the challenges encountered in object detection using visible and thermal infrared images. These challenges include the diversity of object detection perspectives, deformation of the object, occlusion, illumination, and detection of small objects. The proposed algorithm introduces the concept of contrastive learning into the YOLOv5 object detection network. To extract image features for contrastive loss calculation, object and background image regions are randomly cropped from image samples. The contrastive loss is then integrated into the YOLOv5 network, and the combined loss function of both object detection and contrastive learning is used to optimize the network parameters. By utilizing the strategy of contrastive learning, the distinction between the background and the object in the feature space is improved, leading to enhanced object detection performance of the YOLOv5 network. The proposed algorithm has shown pleasing detection results in both visible and thermal infrared images.</p>
</abstract>
<kwd-group>
<kwd>deep learning</kwd>
<kwd>YOLOv5</kwd>
<kwd>object detection</kwd>
<kwd>contrastive learning</kwd>
<kwd>infrared thermal image</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>subtitle</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Object detection is a crucial area of research in computer vision [<xref ref-type="bibr" rid="B1">1</xref>&#x2013;<xref ref-type="bibr" rid="B4">4</xref>] that aims to identify and localize objects in an image, including both detection and recognition [<xref ref-type="bibr" rid="B5">5</xref>&#x2013;<xref ref-type="bibr" rid="B7">7</xref>]. This technology has become increasingly important in various domains of our daily lives, such as autonomous driving, robotics, and video surveillance.</p>
<p>Currently, deep learning has made significant strides in scientific research, particularly in the field of object detection, where convolutional neural networks (CNNs) have been extensively used and have achieved remarkable results [<xref ref-type="bibr" rid="B8">8</xref>&#x2013;<xref ref-type="bibr" rid="B12">12</xref>]. Object detection techniques can be classified into two categories: one-stage object detection algorithms based on bounding box regression and two-stage object detection algorithms based on the candidate region. One-stage object detection algorithms typically use a bounding box to localize objects in an image and then implement classification regression, as exemplified by the YOLO series algorithm [<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>], SSD algorithm [<xref ref-type="bibr" rid="B15">15</xref>], RetinaNet algorithm [<xref ref-type="bibr" rid="B16">16</xref>], etc. The two-stage object detection is carried out based on the candidate regions of the image feature extraction and object classification regression, such as R-CNN [<xref ref-type="bibr" rid="B17">17</xref>], Fast R-CNN [<xref ref-type="bibr" rid="B18">18</xref>], and Faster R-CNN [<xref ref-type="bibr" rid="B19">19</xref>]. At present, these classical supervised learning object detection algorithms have achieved promising performance.</p>
<p>Despite the impressive progress made by the supervised object detection, there are still many challenges, including object perspective diversity, deformation, occlusion detection, illumination, and small object detection, which can make it challenging to extract useful image features [<xref ref-type="bibr" rid="B20">20</xref>]. To overcome these challenges, we propose an improved object detection algorithm based on YOLOv5 and contrastive learning [<xref ref-type="bibr" rid="B21">21</xref>&#x2013;<xref ref-type="bibr" rid="B24">24</xref>]. The basic idea of contrastive learning is to train a network by comparing the similarity between images based on the images themselves. This idea is consistent with the process of differentiating objects from the background during object detection. The YOLO-series algorithms are renowned for their high detection speed and excellent performance. Our proposed algorithm introduces the concept of contrastive learning into the YOLOv5 object detection network, using a supervised training strategy. The goal is to increase the distance between the object and background samples in the feature space, thereby enhancing the detection performance of the model, as illustrated in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Contrastive learning makes the same objects (positive samples) similar in the feature space, and makes the object and background (negative samples) dissimilar in the feature space.</p>
</caption>
<graphic xlink:href="fphy-11-1193245-g001.tif"/>
</fig>
<p>It is worth noting that the proposed improved YOLOv5 algorithm in this paper is specifically designed to enhance object detection in both visible and infrared thermal images [<xref ref-type="bibr" rid="B21">21</xref>&#x2013;<xref ref-type="bibr" rid="B25">25</xref>]. Some of the visible light and infrared thermal images are shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. This is particularly important in night monitoring scenarios where infrared thermal imaging plays a crucial role in pedestrian detection, forest fire detection, maritime rescue, public security reconnaissance, etc. [<xref ref-type="bibr" rid="B26">26</xref>]. The proposed method is expected to address challenges such as light source interference, air humidity, occlusion, and other factors that affect object detection accuracy in infrared thermal images [<xref ref-type="bibr" rid="B27">27</xref>]. By implementing this improved algorithm, a significant enhancement in the detection accuracy of infrared thermal images is expected.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Some of the visible light images and infrared thermal images. Images in the first row are the visible light images, and images in the second row are the infrared thermal images.</p>
</caption>
<graphic xlink:href="fphy-11-1193245-g002.tif"/>
</fig>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 Image preprocessing</title>
<p>We propose utilizing contrastive learning to improve the distinction between the background and object in the feature space. Given an image, we perform random cropping to obtain the object and background regions. The object region image serves as positive samples, while the background region image serves as negative samples. In this process, we first identify the object&#x2019;s center point coordinates and then capture a 64 &#xd7; 64 image block randomly using these coordinates as the standard. If the captured image block contains more than half of the object area, it is considered a positive sample and represented by a green box in <xref ref-type="fig" rid="F3">Figure 3</xref>. Conversely, if the captured image block contains less than half of the object area, it is considered a negative sample or background image and represented by a yellow box in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Image preprocessing and feature extraction for the positive and negative samples. Then, the generated positive and negative samples are input into the contrastive learning network to calculate the contrastive loss.</p>
</caption>
<graphic xlink:href="fphy-11-1193245-g003.tif"/>
</fig>
<p>Once we obtain the positive and negative samples through random cropping, we perform image enhancement. Essentially, we derive different images from the same original image while maintaining its content. However, the derived images have variations in size, scale, brightness, color, and other characteristics. In this paper, we use various image enhancement methods such as cropping, rotation, color adjustment, scale adjustment, and illumination adjustment. Each operation is randomly combined to generate diverse images.</p>
<p>In <xref ref-type="fig" rid="F3">Figure 3</xref>, after enhancing the object image, different views generate <italic>X</italic>
<sup>
<italic>query</italic>1</sup> and <inline-formula id="inf1">
<mml:math id="m1">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>, while different views of the background image generate <inline-formula id="inf2">
<mml:math id="m2">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> and <inline-formula id="inf3">
<mml:math id="m3">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>. Since <italic>X</italic>
<sup>
<italic>query</italic>1</sup> and <inline-formula id="inf4">
<mml:math id="m4">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> are enhanced from the object image, the former is used as the original reference image, and the latter is used as the positive sample image that is inputted into the contrastive learning network for training. Conversely, <inline-formula id="inf5">
<mml:math id="m5">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> and <inline-formula id="inf6">
<mml:math id="m6">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> images, which are enhanced from the background images, serve as negative sample images that are inputted into the contrastive learning network to participate in training.</p>
</sec>
<sec id="s2-2">
<title>2.2 Overall network structure</title>
<p>The overall network structure contains two types of network structures, i.e., the YOLOv5 network structure and the contrastive learning network structure, as shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. The YOLOv5 network structure and the contrastive learning network structure encoder are both composed of several convolution layers, pooling layers, and fully connected layers. The YOLOv5 network structure is mainly divided into four parts: input, backbone, neck, and prediction.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Overall network structure of the proposed method. The left part represents the improved YOLOv5 algorithm, and the right part represents the encoder network structure of contrastive learning.</p>
</caption>
<graphic xlink:href="fphy-11-1193245-g004.tif"/>
</fig>
<p>We use the MoCo contrastive learning network to participate in the improvement of the YOLOv5 network architecture. The MoCo network structure includes three modules: image enhancement, feature extraction, and loss calculation. In the image enhancement phase, the images are randomly enhanced, including cropping, rotation, color adjustment, scale adjustment, and illumination adjustment. The images in the datasets can be randomly combined with various enhancement methods, and then, the pictures <italic>X</italic>
<sup>
<italic>query</italic>
</sup> and <inline-formula id="inf7">
<mml:math id="m7">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> with different views can be obtained and participate in the network training. In the feature extraction phase, two identical ResNet residual networks are mainly used to extract image features as the query encoder (<italic>Encoder</italic>
<sub>
<italic>Q</italic>
</sub>) and the momentum encoder (<italic>Encoder</italic>
<sub>
<italic>K</italic>
</sub>), whose corresponding parameters are <italic>&#x3b8;</italic>
<sub>
<italic>Q</italic>
</sub> and <italic>&#x3b8;</italic>
<sub>
<italic>K</italic>
</sub>, respectively. After that, <italic>X</italic>
<sup>
<italic>query</italic>
</sup> will be fed into the query encoder as the object image to extract the feature and then <inline-formula id="inf8">
<mml:math id="m8">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>0</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula> as the positive sample; the image set {<inline-formula id="inf9">
<mml:math id="m9">
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi>X</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>y</mml:mi>
</mml:mrow>
</mml:msubsup>
</mml:math>
</inline-formula>&#x2026;&#x2026;} is fed into the momentum encoder as negative samples for feature extraction. By minimizing the contrastive loss, the characteristic distance between the same kinds of samples can be reduced continuously and the distance between different kinds of samples can be increased continuously. The calculated contrastive loss is directly fed back to the query encoder and momentum encoder to update their network parameters <italic>&#x3b8;</italic>
<sub>
<italic>Q</italic>
</sub> and <italic>&#x3b8;</italic>
<sub>
<italic>K</italic>
</sub>.</p>
<p>We use a ResNet-style module as the encoder to extract image features, as depicted in <xref ref-type="fig" rid="F4">Figure 4</xref>. The input is an image with a resolution of 1 &#xd7; 64 &#xd7; 64 pixels, followed by convolution and pooling operations. The size of the spatiotemporal kernel (depth, height, and width) of the convolution layer is [7, 7, 7], the stride is 2, and the padding is 3. The pooling layer utilizes a 3 &#xd7; 3 maximum pooling operation with a stride of 2 and padding of 1. The resulting feature map is then fed into a residual block with 64 channels. Subsequently, the feature map undergoes four consecutive convolution operations with a spatial kernel size of [3, 3, 3], a stride of 1, and padding of 1, resulting in a feature map size of 64 &#xd7; 16 &#xd7; 16. As the channel size is 64, the residuals are connected <italic>via</italic> solid lines at this stage. One side of the feature map is then up-sampled and down-sampled to obtain a size of 128 &#xd7; 8 &#xd7; 8, followed by four convolution operations on the other side. The first convolution layer has a spatial kernel size of [3, 3, 3], a stride of 2, and padding of 1, while the last three convolution layers have a spatial kernel size of [3, 3, 3], a stride of 1, and padding of 1. The feature map after the convolution operation is then added to the up-sampled feature map to obtain a feature map size of 128 &#xd7; 8 &#xd7; 8.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>P-R curve for the improved and original YOLOv5 networks on the experiment of COCO2017 datasets, respectively. <bold>(A)</bold> P-R curve for the improved YOLOv5 network. <bold>(B)</bold> P-R curve for the original YOLOv5 network.</p>
</caption>
<graphic xlink:href="fphy-11-1193245-g005.tif"/>
</fig>
</sec>
<sec id="s2-3">
<title>2.3 Loss function</title>
<p>The YOLOv5 loss function is composed of three parts, i.e., <italic>Loss</italic>
<sub>
<italic>box</italic>
</sub> (rectangular frame loss), <italic>Loss</italic>
<sub>
<italic>conf</italic>
</sub> (confidence loss), and <italic>Loss</italic>
<sub>
<italic>cls</italic>
</sub> (classification loss). The rectangular frame loss function calculates the discrepancy between the predicted frame and the object label frame, while the confidence loss function determines the level of certainty of a given predicted frame. Lastly, the classification loss function evaluates the model&#x2019;s ability to correctly identify the object category. The overall loss function of YOLOv5 is obtained by taking a weighted sum of these three individual losses as follows:<disp-formula id="e1">
<mml:math id="m10">
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>f</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>b</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>x</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>c</mml:mi>
<mml:mi>l</mml:mi>
<mml:mi>s</mml:mi>
</mml:mrow>
</mml:msub>
</mml:math>
<label>(1)</label>
</disp-formula>
<disp-formula id="e2">
<mml:math id="m11">
<mml:mn>1</mml:mn>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3b1;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b2;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3b3;</mml:mi>
<mml:mo>.</mml:mo>
</mml:math>
<label>(2)</label>
</disp-formula>
</p>
<p>The contrastive loss is defined by the following equation:<disp-formula id="e3">
<mml:math id="m12">
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2212;</mml:mo>
<mml:mi mathvariant="italic">log</mml:mi>
<mml:mfrac>
<mml:mrow>
<mml:mi mathvariant="italic">exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mo>&#x2b;</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mi>&#x3c4;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mi mathvariant="italic">exp</mml:mi>
<mml:mfenced open="(" close=")">
<mml:mrow>
<mml:mi>q</mml:mi>
<mml:mo>&#x22c5;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>/</mml:mo>
<mml:mi>&#x3c4;</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:math>
<label>(3)</label>
</disp-formula>where <italic>q</italic> represents the feature extracted by the query encoder from the object image, <italic>k</italic>
<sub>
<italic>i</italic>
</sub> represents the feature extracted by the momentum encoder, <italic>k</italic>
<sub>&#x2b;</sub> represents the feature of the positive sample (assuming there is only one), and <italic>&#x3c4;</italic> is used as a hyper-parameter to adjust the aforementioned contrastive loss.</p>
<p>After the contrastive loss of positive samples and negative samples is calculated, the next step is to calculate the cross entropy loss function. It is worth noting that the contrastive loss of positive samples and negative samples is taken as loss samples to calculate the cross entropy loss function, whose calculation formula is defined by the following equation:<disp-formula id="e4">
<mml:math id="m13">
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:munderover accentunder="false" accent="true">
<mml:mrow>
<mml:mo>&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:munderover>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2061;</mml:mo>
<mml:mi mathvariant="italic">log</mml:mi>
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
<mml:mo>,</mml:mo>
</mml:math>
<label>(4)</label>
</disp-formula>where <italic>n</italic> represents the total number of positive and negative samples, <italic>L</italic>
<sub>
<italic>i</italic>
</sub> represents the <italic>i</italic>th expected contrastive loss, and <inline-formula id="inf10">
<mml:math id="m14">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> represents the <italic>i</italic>th contrastive loss calculated by the network model. It should be emphasized that <inline-formula id="inf11">
<mml:math id="m15">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> is the contrastive loss between the positive sample and the sample image, while <inline-formula id="inf12">
<mml:math id="m16">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf13">
<mml:math id="m17">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>3</mml:mn>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula>, &#x2026; are the contrastive losses between the negative samples and the sample image. Then, through step-by-step iteration, <inline-formula id="inf14">
<mml:math id="m18">
<mml:mrow>
<mml:mover accent="true">
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>L</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mo>&#x302;</mml:mo>
</mml:mover>
</mml:mrow>
</mml:math>
</inline-formula> gradually approaches <italic>L</italic>
<sub>
<italic>i</italic>
</sub>. In each epoch, the loss function is calculated so that positive samples are pulled closer to the query and negative samples are pushed away from it in the feature space. The positive and negative region images are cropped from the original images and then enhanced. The resulting enhanced object and background images are fed into the encoder to extract their features. The resulting contrastive loss is used to update the network parameters of the contrastive learning encoder and is also added to the YOLOv5 loss for overall training. The final loss is defined by the following equation:<disp-formula id="e5">
<mml:math id="m19">
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:mi>s</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3be;</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>Y</mml:mi>
<mml:mi>O</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>O</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
<mml:mi>L</mml:mi>
<mml:mi>o</mml:mi>
<mml:mi>s</mml:mi>
<mml:msub>
<mml:mrow>
<mml:mi>s</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>C</mml:mi>
<mml:mi>L</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:math>
<label>(5)</label>
</disp-formula>
</p>
<p>The aforementioned equation is the overall optimization objective function for the proposed method, where <italic>Loss</italic>
<sub>
<italic>YOLO</italic>
</sub> represents the YOLOv5 object detection loss and <italic>Loss</italic>
<sub>
<italic>CL</italic>
</sub> represents the contrastive learning loss.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<p>The PyTorch deep learning framework is used in the experiment. The CUDA version used is 11.3. The YOLOv5 confidence loss weight <italic>&#x3b1;</italic> is set to 0.4, and the rectangular frame loss and classification loss weights <italic>&#x3b2;</italic> and <italic>&#x3b3;</italic> are both set to 0.3. The network is trained using asynchronous stochastic gradient descent with a momentum term of 0.973. The initial learning rate for the weights is set to 0.01, and the attenuation coefficient is set to 0.0005. A batch size of 128 is used, and a total of 200 batches are trained. In the global loss function, the weight <italic>&#x3be;</italic> of the YOLOv5 loss function is set to 1, and the weight <italic>&#x3bb;</italic> of the contrastive learning loss function is set to 0.001.</p>
<sec id="s3-1">
<title>3.1 Experiment on the COCO2017 dataset</title>
<p>The MS COCO dataset is used for the evaluation of our method. This dataset is funded and annotated by Microsoft; it is a large-scale dataset that can be utilized for image detection, semantic segmentation, and image captioning. It consists of over 330,000 images, out of which 220,000 are annotated, containing 1.5 million objects and 80 object categories, such as pedestrians, cars, and elephants. Additionally, it includes 91 material categories such as grass, walls, and sky. Each image in the dataset is accompanied by five descriptive sentences, and there are 250,000 pedestrians with key points available for analysis.</p>
<p>In the first experiment, we select the COCO2017 dataset as the experimental dataset. Specifically, 118,287 images are chosen as the training set, 40,670 images are chosen as the testing set, and 5,000 images are chosen as the validation set while keeping their original label files intact. The dataset is thus divided into training, testing, and validation sets in the ratio of 118,287:40,670:5,000. It is ensured that the training and test sets are independent of each other during the experiment. Finally, the dataset is fed into the improved YOLOv5 network for training.</p>
<p>The performance of the model is evaluated using precision (P), recall (R), average precision (AP), and mean average precision (mAP), i.e., the mean of the AP values over all categories. The calculation formula for each index is as follows:<disp-formula id="e6">
<mml:math id="m20">
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:math>
<label>(6)</label>
</disp-formula>
<disp-formula id="e7">
<mml:math id="m21">
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mi>T</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mrow>
<mml:mi>F</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:math>
<label>(7)</label>
</disp-formula>
<disp-formula id="e8">
<mml:math id="m22">
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msub>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>n</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>M</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mi>%</mml:mi>
<mml:mo>,</mml:mo>
</mml:math>
<label>(8)</label>
</disp-formula>
<disp-formula id="e9">
<mml:math id="m23">
<mml:mi>m</mml:mi>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:msubsup>
<mml:mrow>
<mml:mo movablelimits="false" form="prefix">&#x2211;</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:msubsup>
<mml:mi>A</mml:mi>
<mml:mi>P</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>N</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>100</mml:mn>
<mml:mi>%</mml:mi>
<mml:mo>,</mml:mo>
</mml:math>
<label>(9)</label>
</disp-formula>where <italic>T</italic>
<sub>
<italic>P</italic>
</sub> is the true case, <italic>F</italic>
<sub>
<italic>P</italic>
</sub> is the false case, <italic>F</italic>
<sub>
<italic>N</italic>
</sub> is the missed case, <italic>M</italic> is the total number of samples, and <italic>N</italic> is the number of categories. To create a P-R curve, we use the recall rate as the horizontal axis and the precision rate as the vertical axis. The area under this curve is known as the average precision (AP) value. This experiment is focused on 80 classes, so <italic>N</italic> &#x3d; 80, and the mAP value is equal to the average of the AP values of the 80 classes. The detection accuracy of the improved YOLOv5 network is compared with that of the original YOLOv5 network, and the detection performance of the network before and after the improvement is analyzed.</p>
<p>After feeding the COCO2017 dataset into both the YOLOv5 network and the improved YOLOv5 network for training 200 epochs, the loss function begins to converge. <xref ref-type="fig" rid="F5">Figure 5A</xref> displays the P-R curve for the improved YOLOv5 network. On the other hand, <xref ref-type="fig" rid="F5">Figure 5B</xref> shows the P-R curve for the original YOLOv5 network. In addition, the precision, recall, and mAP data pairs of the two models are shown in <xref ref-type="table" rid="T1">Table 1</xref>. <italic>mAP</italic>
<sub>50</sub> indicates the mAP when the IOU between the preselection box and the ground-truth box is greater than 0.5, and <italic>mAP</italic>
<sub>50&#x2212;95</sub> indicates the mAP when the IOU between the preselection box and the ground-truth box is between 0.5 and 0.95.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Comparison results between the improved YOLOv5 and the original YOLOv5 algorithms on the COCO2017 dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">
<italic>mAP</italic>
<sub>50</sub> (%)</th>
<th align="center">
<italic>mAP</italic>
<sub>50&#x2212;95</sub> (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">YOLOv5</td>
<td align="center">0.719</td>
<td align="center">0.526</td>
<td align="center">58.4</td>
<td align="center">36.9</td>
</tr>
<tr>
<td align="left">Our</td>
<td align="center">0.724</td>
<td align="center">0.527</td>
<td align="center">58.7</td>
<td align="center">37.2</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>From <xref ref-type="table" rid="T1">Table 1</xref>, we can see that the precision value of the original YOLOv5 model is 0.719, while the precision value of our model is 0.724, an increase of 0.005. The recall value of the original YOLOv5 model is 0.526, while the recall value of our model is 0.527, an increase of 0.001. The <italic>mAP</italic>
<sub>50</sub> value of the original YOLOv5 model is 58.4%, while that of our model is 58.7%, an increase of 0.3%. The <italic>mAP</italic>
<sub>50&#x2212;95</sub> value of the original YOLOv5 model is 36.9%, while that of our model is 37.2%, an increase of 0.3%. As observed from <xref ref-type="fig" rid="F5">Figure 5</xref> and <xref ref-type="table" rid="T1">Table 1</xref>, the improved algorithm in this study outperforms the original algorithm on the COCO datasets. <xref ref-type="fig" rid="F6">Figure 6</xref> displays the detection results of the improved YOLOv5 and the original YOLOv5 algorithms.</p>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Detection results by the improved YOLOv5 and original YOLOv5 algorithms on the COCO2017 dataset. The upper layer is the detection result of the original YOLOv5 algorithm, and the lower layer is the detection result of the improved YOLOv5 algorithm. It is worth noting that the fourth image is the detection result for small objects, which shows that the method used in this article is also applicable for detecting small objects.</p>
</caption>
<graphic xlink:href="fphy-11-1193245-g006.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Experiment on the LLVIP datasets</title>
<p>To validate the accuracy of the improved YOLOv5 object detection network for infrared thermal imaging, we apply the algorithm to infrared images using the LLVIP dataset. This dataset consists of 15,488 pairs of infrared images captured in 26 real-time scenes, with a majority of them taken in low-light conditions using a wavelength band of 8&#x2013;14&#xa0;<italic>&#x3bc;</italic>m. In the experiment, we select 100 images successively from 18 scenes in the original LLVIP dataset, resulting in a total of 1,800 images as the training sets. We also select 50 images successively from four scenes in the original LLVIP dataset, resulting in a total of 200 images as the testing and validation sets. The evaluation metrics, precision (P), recall (R), and mean average precision (mAP) over all categories of AP values, are used to evaluate the performance of the model.</p>
<p>After training the LLVIP datasets on both the YOLOv5 network and the improved YOLOv5 network for up to 100 epochs, the loss function starts to converge. <xref ref-type="fig" rid="F7">Figure 7A</xref> displays the P-R curve for the improved YOLOv5 network after the loss function has converged. Meanwhile, <xref ref-type="fig" rid="F7">Figure 7B</xref> shows the P-R curve for the original YOLOv5 network after the loss function has converged.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>P-R curve for the improved and original YOLOv5 networks on the experiment of LLVIP datasets. <bold>(A)</bold> P-R curve for the improved YOLOv5 network. <bold>(B)</bold> P-R curve for the original YOLOv5 network.</p>
</caption>
<graphic xlink:href="fphy-11-1193245-g007.tif"/>
</fig>
<p>From <xref ref-type="table" rid="T2">Table 2</xref>, we can see that the precision value of the original YOLOv5 model is 0.989, while the precision value of our model is 0.994, an increase of 0.005. The recall value of the original YOLOv5 model is 0.970, while the recall value of our model is 0.983, an increase of 0.013. The <italic>mAP</italic>
<sub>50</sub> value of the original YOLOv5 model is 99.0%, while that of our model is 99.6%, an increase of 0.6%. The <italic>mAP</italic>
<sub>50&#x2212;95</sub> value of the original YOLOv5 model is 74.7%, while that of our model is 76.0%, an increase of 1.3%. The detection results of the improved YOLOv5 and the original YOLOv5 algorithms can be viewed in <xref ref-type="fig" rid="F8">Figure 8A</xref> and <xref ref-type="fig" rid="F8">Figure 8B</xref>, respectively.</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Comparison results between the improved YOLOv5 and the original YOLOv5 algorithms on the LLVIP dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Model</th>
<th align="center">Precision</th>
<th align="center">Recall</th>
<th align="center">
<italic>mAP</italic>
<sub>50</sub> (%)</th>
<th align="center">
<italic>mAP</italic>
<sub>50&#x2212;95</sub> (%)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">YOLOv5</td>
<td align="center">0.989</td>
<td align="center">0.970</td>
<td align="center">99.0</td>
<td align="center">74.7</td>
</tr>
<tr>
<td align="left">Our</td>
<td align="center">0.994</td>
<td align="center">0.983</td>
<td align="center">99.6</td>
<td align="center">76.0</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Detection results by the improved YOLOv5 and original YOLOv5 algorithms on the LLVIP datasets. <bold>(A)</bold> Detection results by the improved YOLOv5 algorithm. <bold>(B)</bold> Detection results by the original YOLOv5 algorithm.</p>
</caption>
<graphic xlink:href="fphy-11-1193245-g008.tif"/>
</fig>
</sec>
<sec id="s3-3">
<title>3.3 Comparison with other algorithms</title>
<p>In order to further prove the effectiveness of our improved YOLOv5 algorithm, this paper conducted experimental comparisons with several current mainstream object detection algorithms, including YOLOv3, SSD, Faster R-CNN, Mask R-CNN, and R-FCN. We unified the configuration environment and initial training parameters in all experiments; the experimental data are the same as those of the first experiment, i.e., the MS COCO dataset. The dataset still includes 118,287 training images, 40,670 testing images, and 5,000 validation images. The experimental results are shown in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Comparison results between various classical object detection algorithms and our improved YOLOv5 algorithm on the MS COCO dataset.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="left">Algorithm</th>
<th align="center">Backbone</th>
<th align="center">AP</th>
<th align="center">
<italic>AP</italic>
<sub>50</sub>
</th>
<th align="center">
<italic>AP</italic>
<sub>75</sub>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="left">Faster R-CNN&#x2b;&#x2b;&#x2b;</td>
<td align="center">ResNet-101-C4</td>
<td align="center">34.9</td>
<td align="center">55.7</td>
<td align="center">37.4</td>
</tr>
<tr>
<td align="left">Mask R-CNN</td>
<td align="center">ResNet-101-FPN</td>
<td align="center">38.2</td>
<td align="center">60.3</td>
<td align="center">41.7</td>
</tr>
<tr>
<td align="left">Cascade R-CNN</td>
<td align="center">ResNet-101</td>
<td align="center">42.8</td>
<td align="center">62.1</td>
<td align="center">46.3</td>
</tr>
<tr>
<td align="left">R-FCN</td>
<td align="center">ResNet-101</td>
<td align="center">27.6</td>
<td align="center">48.9</td>
<td align="center">&#x2013;</td>
</tr>
<tr>
<td align="left">RetinaNet</td>
<td align="center">ResNet-101</td>
<td align="center">39.1</td>
<td align="center">59.1</td>
<td align="center">42.3</td>
</tr>
<tr>
<td align="left">FPN</td>
<td align="center">ResNet-50</td>
<td align="center">38.6</td>
<td align="center">60.4</td>
<td align="center">42.0</td>
</tr>
<tr>
<td align="left">SSD</td>
<td align="center">VGG16</td>
<td align="center">23.2</td>
<td align="center">41.2</td>
<td align="center">23.4</td>
</tr>
<tr>
<td align="left">YOLOv2</td>
<td align="center">DarkNet-19</td>
<td align="center">21.6</td>
<td align="center">44.0</td>
<td align="center">19.2</td>
</tr>
<tr>
<td align="left">YOLOv3</td>
<td align="center">DarkNet-53</td>
<td align="center">33.0</td>
<td align="center">57.9</td>
<td align="center">34.4</td>
</tr>
<tr>
<td align="left">Our</td>
<td align="center">CSPDarkNet-53</td>
<td align="center">37.2</td>
<td align="center">58.7</td>
<td align="center">39.3</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In <xref ref-type="table" rid="T3">Table 3</xref>, we assess the accuracy of the frame regression task. The accuracy of the frame is generally measured by the intersection over union (IOU). AP represents the average taken over the IOU interval from 0.5 to 0.95. <italic>AP</italic>
<sub>50</sub> represents that the IOU value is 0.5, then taking the average value. <italic>AP</italic>
<sub>75</sub> represents that the IOU value is 0.75, then taking the average value. <xref ref-type="table" rid="T3">Table 3</xref> indicates that when the same datasets are input into our improved YOLOv5 algorithm and the other mainstream object detection algorithms, the various AP values of our improved YOLOv5 algorithm are all improved. These findings verify the effectiveness of our improved YOLOv5 algorithm in object detection tasks.</p>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<sec id="s4-1">
<title>4.1 Previous research on object detection</title>
<p>Object detection is a fundamental task in computer vision that involves identifying the presence of objects and their location in images or videos. Over the years, there have been many advances in object detection algorithms, resulting in two main categories: two-stage algorithms, such as R-CNN [<xref ref-type="bibr" rid="B17">17</xref>], Fast R-CNN [<xref ref-type="bibr" rid="B18">18</xref>], and Faster R-CNN [<xref ref-type="bibr" rid="B19">19</xref>], and one-stage algorithms, such as SSD [<xref ref-type="bibr" rid="B15">15</xref>] and YOLO series [<xref ref-type="bibr" rid="B13">13</xref>, <xref ref-type="bibr" rid="B14">14</xref>]. Although R-CNN represented a significant improvement over traditional algorithms, its candidate area box calculation in the CNN led to increased computation, significantly affecting the test speed. Fast R-CNN reduced computation but still could not achieve true real-time performance or end-to-end training and testing. Therefore, Faster R-CNN is proposed to integrate feature extraction, candidate box selection, classification, and boundary box regression into a single framework, improving accuracy and speed, and achieving end-to-end object detection. However, there is still a gap between real-time object detection and Faster R-CNN, leading to the emergence of one-stage algorithms such as SSD and YOLO. Although the YOLO series solved object detection as a regression problem, it suffered from a positioning error compared to Faster R-CNN. YOLOv2 improved the original algorithm while maintaining its speed advantage, while YOLOv3 used a deep residual network to extract image features. 
The YOLOv5 algorithm is the latest version and has a streamlined architecture and improved performance on object detection tasks, achieving good detection speed and accuracy by adopting adaptive anchor box computing and the multi-semantic fusion detection mechanism to quickly and effectively integrate high-level semantic information and low-level location information.</p>
</sec>
<sec id="s4-2">
<title>4.2 Discussion on the proposed method</title>
<p>In this paper, we propose an improved object detection algorithm by integrating contrastive learning into the YOLOv5 network, to further improve the performance of current object detection methods. Although there are many ready-made contrastive learning methods such as SimCLR [<xref ref-type="bibr" rid="B28">28</xref>], MoCo [<xref ref-type="bibr" rid="B29">29</xref>], BYOL [<xref ref-type="bibr" rid="B30">30</xref>], SwAV [<xref ref-type="bibr" rid="B31">31</xref>], and SimSiam [<xref ref-type="bibr" rid="B32">32</xref>], we use MoCo as the contrastive learning structure to build our model since MoCo is one of the best contrastive learning networks at present and it is relatively simple to implement. By simultaneously constraining the object detection loss and contrastive loss, our method can compact the distribution of similar objects in the feature space and enlarge the distribution distance between the object and the background in the feature space, thereby enhancing the distinction between the object and the background. Experimental results on COCO and LLVIP datasets demonstrate that our proposed method outperforms the original YOLOv5 network in terms of object detection performance in both visible and thermal infrared images. Moreover, our proposed method is a general framework as the contrastive learning mechanism can be applied not only to the YOLOv5 object detection model but also to other deep learning-based object detection methods, such as the Faster R-CNN series, SSD, and SPP-Net.</p>
</sec>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s5">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. These data can be found at: MS COCO Datasets: <ext-link ext-link-type="uri" xlink:href="https://cocodataset.org/#download">https://cocodataset.org/&#x23;download</ext-link> and LLVIP Datasets: <ext-link ext-link-type="uri" xlink:href="https://bupt-ai-cz.github.io/LLVIP">https://bupt-ai-cz.github.io/LLVIP</ext-link>.</p>
</sec>
<sec id="s6">
<title>Author contributions</title>
<p>XT: conceptualization, methodology, software, investigation, formal analysis, and writing&#x2014;original draft; ZY: data curation and writing&#x2014;original draft; BL: visualization and investigation; JL: resources and supervision; YH: software and validation; HH: visualization. All authors contributed to the article and approved the submitted version.</p>
</sec>
<sec id="s7">
<title>Funding</title>
<p>This work was supported in part by the Science and Technology Department in the Sichuan Province of China under grant no. 2022JDRC0076, in part by the China Postdoctoral Science Foundation under grant no. 2022M722248, in part by the Project of Basic Scientific Research of Central Universities of China under grant nos. ZHMH2022-004 and J2022-025, in part by the Open Fund of Key Laboratory of Flight Techniques and Flight Safety, CAAC (grant no. FZ2022KF06), and in part by the Fund of Key Laboratory of Flight Techniques and Flight Safety, CAAC (grant no. FZ2021ZZ05).</p>
</sec>
<sec sec-type="COI-statement" id="s8">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<label>1.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Everingham</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Van Gool</surname>
<given-names>L</given-names>
</name>
<name>
<surname>Williams</surname>
<given-names>CKI</given-names>
</name>
<name>
<surname>Winn</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zisserman</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>The pascal visual object classes (VOC) challenge</article-title>. <source>Int J Comp Vis</source> (<year>2010</year>) <volume>88</volume>(<issue>2</issue>):<fpage>303</fpage>&#x2013;<lpage>38</lpage>. <pub-id pub-id-type="doi">10.1007/s11263-009-0275-4</pub-id>
</citation>
</ref>
<ref id="B2">
<label>2.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Donahue</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Darrell</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Malik</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Rich feature hierarchies for accurate object detection and semantic segmentation</article-title>. In: <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>; <conf-date>June 2014</conf-date>; <conf-loc>Columbus, OH, USA</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2014</year>). p. <fpage>580</fpage>&#x2013;<lpage>7</lpage>.</citation>
</ref>
<ref id="B3">
<label>3.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Xia</surname>
<given-names>GS</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Belongie</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>J</given-names>
</name>
<etal/>
</person-group> <article-title>DOTA: A large-scale dataset for object detection in aerial images</article-title>. <conf-name>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</conf-name>. <publisher-name>IEEE</publisher-name> (<year>2018</year>). p. <fpage>3974</fpage>&#x2013;<lpage>83</lpage>.</citation>
</ref>
<ref id="B4">
<label>4.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Pang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tu</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Design and calibration test of a support force measuring system for hypersonic vehicle aerodynamic measurement</article-title>. <source>Flow Meas Instrumentation</source> (<year>2022</year>) <volume>88</volume>:<fpage>102264</fpage>. <pub-id pub-id-type="doi">10.1016/j.flowmeasinst.2022.102264</pub-id>
</citation>
</ref>
<ref id="B5">
<label>5.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Tan</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Le</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>EfficientNet: Rethinking model scaling for convolutional neural networks</article-title>. In: <conf-name>Proceedings of the International conference on machine learning</conf-name>; <conf-loc>Long Beach, California</conf-loc>. <publisher-name>PMLR</publisher-name> (<year>2019</year>). p. <fpage>6105</fpage>&#x2013;<lpage>14</lpage>.</citation>
</ref>
<ref id="B6">
<label>6.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tu</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Distributed location-aware task offloading in multi-UAVs enabled edge computing</article-title>. <source>IEEE Access</source> (<year>2022</year>) <volume>10</volume>:<fpage>72416</fpage>&#x2013;<lpage>28</lpage>. <pub-id pub-id-type="doi">10.1109/access.2022.3189682</pub-id>
</citation>
</ref>
<ref id="B7">
<label>7.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Q</given-names>
</name>
<name>
<surname>Ai</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Z</given-names>
</name>
<etal/>
</person-group> <article-title>Joint face image restoration and frontalization for recognition</article-title>. <source>IEEE Trans Circuits Syst Video Tech</source> (<year>2021</year>) <volume>32</volume>(<issue>3</issue>):<fpage>1285</fpage>&#x2013;<lpage>98</lpage>. <pub-id pub-id-type="doi">10.1109/TCSVT.2021.3078517</pub-id>
</citation>
</ref>
<ref id="B8">
<label>8.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>FW</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>CY</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>YQ</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>ZB</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>CD</given-names>
</name>
<name>
<surname>Qi</surname>
<given-names>J</given-names>
</name>
</person-group> <article-title>Review of deep learning applied to occluded object detection</article-title>. <source>J Front Comp Sci Tech</source> (<year>2022</year>) <volume>16</volume>(<issue>6</issue>):<fpage>1243</fpage>&#x2013;<lpage>59</lpage>.</citation>
</ref>
<ref id="B9">
<label>9.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Y</given-names>
</name>
<etal/>
</person-group> <article-title>3D face reconstruction from a single image assisted by 2D face images in the wild</article-title>. <source>IEEE Trans Multimedia</source> (<year>2020</year>) <volume>23</volume>:<fpage>1160</fpage>&#x2013;<lpage>72</lpage>. <pub-id pub-id-type="doi">10.1109/TMM.2020.2993962</pub-id>
</citation>
</ref>
<ref id="B10">
<label>10.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Ren</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Deep residual learning for image recognition</article-title>. In: <conf-name>Proceedings of the IEEE conference on computer vision and pattern recognition</conf-name>; <conf-date>June 2016</conf-date>; <conf-loc>Las Vegas, NV, USA</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2016</year>). p. <fpage>770</fpage>&#x2013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B11">
<label>11.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Tu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zou</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ai</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>Y</given-names>
</name>
<etal/>
</person-group> <article-title>Image-to-video generation via 3D facial dynamics</article-title>. <source>IEEE Trans Circuits Syst Video Tech</source> (<year>2021</year>) <volume>32</volume>(<issue>4</issue>):<fpage>1805</fpage>&#x2013;<lpage>19</lpage>.</citation>
</ref>
<ref id="B12">
<label>12.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Tu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Z</given-names>
</name>
</person-group>. <source>Novel trust scheme applicable to edge computing</source>. <publisher-name>Authorea Preprints</publisher-name> (<year>2022</year>).</citation>
</ref>
<ref id="B13">
<label>13.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Sruthi</surname>
<given-names>MS</given-names>
</name>
<name>
<surname>Poovathingal</surname>
<given-names>MJ</given-names>
</name>
<name>
<surname>Nandana</surname>
<given-names>VN</given-names>
</name>
<name>
<surname>Lakshimi</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Samshad</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Sudeesh</surname>
<given-names>V</given-names>
</name>
</person-group>. <article-title>YOLOv5 based open-source UAV for human detection during search and rescue (SAR)</article-title>. In: <conf-name>Proceedings of the 10th International Conference on Advances in Computing and Communications</conf-name>; <conf-date>October 2021</conf-date>; <conf-loc>Kochi</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2021</year>). p. <fpage>1</fpage>&#x2013;<lpage>6</lpage>.</citation>
</ref>
<ref id="B14">
<label>14.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>XK</given-names>
</name>
<name>
<surname>Lyu</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>TPH-YOLOv5: Improved YOLOv5 based on transformer prediction head for object detection on drone-captured scenarios</article-title>. In: <conf-name>Proceedings of the IEEE International Conference on Computer Vision</conf-name>; <conf-date>October 2021</conf-date>; <conf-loc>Montreal, QC, Canada</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2021</year>). p. <fpage>2778</fpage>&#x2013;<lpage>88</lpage>.</citation>
</ref>
<ref id="B15">
<label>15.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Anguelov</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Erhan</surname>
<given-names>D</given-names>
</name>
<name>
<surname>Szegedy</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Reed</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>C</given-names>
</name>
<etal/>
</person-group> <article-title>SSD: Single shot MultiBox detector</article-title>. In: <conf-name>Proceedings of the 14th European Conference on Computer Vision</conf-name>; <conf-loc>Amsterdam</conf-loc>. <publisher-name>Springer</publisher-name> (<year>2016</year>). p. <fpage>21</fpage>&#x2013;<lpage>37</lpage>.</citation>
</ref>
<ref id="B16">
<label>16.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>TY</given-names>
</name>
<name>
<surname>Goyal</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>He</surname>
<given-names>K</given-names>
</name>
<name>
<surname>Dollar</surname>
<given-names>P</given-names>
</name>
</person-group>. <article-title>Focal loss for dense object detection</article-title>. In: <conf-name>Proceedings of the 2017 IEEE International Conference on Computer Vision</conf-name>; <conf-loc>Venice</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2017</year>). p. <fpage>2999</fpage>&#x2013;<lpage>3007</lpage>.</citation>
</ref>
<ref id="B17">
<label>17.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>MY</given-names>
</name>
<name>
<surname>Tuzel</surname>
<given-names>O</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>R-CNN for small object detection</article-title>. In: <source>Asian conference on computer vision</source>. <publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name> (<year>2016</year>). p. <fpage>214</fpage>&#x2013;<lpage>30</lpage>.</citation>
</ref>
<ref id="B18">
<label>18.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>Fast R-CNN</article-title>. In: <conf-name>Proceedings of the 2015 IEEE International Conference on Computer Vision (ICCV)</conf-name>; <conf-date>December 2015</conf-date>; <conf-loc>Santiago</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2015</year>). p. <fpage>1440</fpage>&#x2013;<lpage>8</lpage>.</citation>
</ref>
<ref id="B19">
<label>19.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Ren</surname>
<given-names>SQ</given-names>
</name>
<name>
<surname>He</surname>
<given-names>KM</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>Faster R-CNN: Towards real-time object detection with region proposal networks</article-title>. In: <conf-name>Proceedings of the 28th International Conference on Neural Information Processing Systems</conf-name>; <conf-loc>Montreal</conf-loc>. <publisher-name>MIT Press</publisher-name> (<year>2015</year>). p. <fpage>91</fpage>&#x2013;<lpage>9</lpage>.</citation>
</ref>
<ref id="B20">
<label>20.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Han</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Ding</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>N</given-names>
</name>
<name>
<surname>Xia</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>ReDet: A rotation-equivariant detector for aerial object detection</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name> (<year>2021</year>). p. <fpage>2786</fpage>&#x2013;<lpage>95</lpage>.</citation>
</ref>
<ref id="B21">
<label>21.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Z</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Yu</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Guo</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>DC-SPP-YOLO: Dense connection and spatial pyramid pooling based YOLO for object detection</article-title>. <source>Inf Sci</source> (<year>2020</year>) <volume>522</volume>:<fpage>241</fpage>&#x2013;<lpage>58</lpage>. <pub-id pub-id-type="doi">10.1016/j.ins.2020.02.067</pub-id>
</citation>
</ref>
<ref id="B22">
<label>22.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>W</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Q</given-names>
</name>
</person-group>. <article-title>Machine vision inspection of electrical connectors based on improved yolo v3</article-title>. <source>IEEE Access</source> (<year>2020</year>) <volume>8</volume>:<fpage>166184</fpage>&#x2013;<lpage>96</lpage>. <pub-id pub-id-type="doi">10.1109/access.2020.3022405</pub-id>
</citation>
</ref>
<ref id="B23">
<label>23.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Nouaze</surname>
<given-names>JC</given-names>
</name>
<name>
<surname>Touko</surname>
<given-names>PL</given-names>
</name>
<name>
<surname>Kim</surname>
<given-names>J</given-names>
</name>
</person-group>. <article-title>YOLO-tomato: A robust algorithm for tomato detection based on YOLOv3</article-title>. <source>Sensors</source> (<year>2020</year>) <volume>20</volume>(<issue>7</issue>):<fpage>2145.1</fpage>&#x2013;<lpage>2145.20</lpage>. <pub-id pub-id-type="doi">10.3390/s20072145</pub-id>
</citation>
</ref>
<ref id="B24">
<label>24.</label>
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Tu</surname>
<given-names>X</given-names>
</name>
<name>
<surname>Fu</surname>
<given-names>G</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>B</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>M</given-names>
</name>
<etal/>
</person-group> <source>Low-light image enhancement by learning contrastive representations in spatial and frequency domains</source> (<year>2023</year>). <comment>arXiv preprint arXiv:2303.13412</comment>.</citation>
</ref>
<ref id="B25">
<label>25.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Hsu</surname>
<given-names>WY</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>WY</given-names>
</name>
</person-group>. <article-title>Ratio-and-Scale-Aware YOLO for pedestrian detection</article-title>. <source>IEEE Trans Image Process</source> (<year>2020</year>) <volume>30</volume>:<fpage>934</fpage>&#x2013;<lpage>47</lpage>. <pub-id pub-id-type="doi">10.1109/TIP.2020.3039574</pub-id>
</citation>
</ref>
<ref id="B26">
<label>26.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>H</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>X</given-names>
</name>
</person-group>. <article-title>Low-altitude infrared small target detection based on fully convolutional regression network and graph matching</article-title>. <source>Infrared Phys Tech</source> (<year>2021</year>) <volume>115</volume>:<fpage>103738</fpage>. <pub-id pub-id-type="doi">10.1016/j.infrared.2021.103738</pub-id>
</citation>
</ref>
<ref id="B27">
<label>27.</label>
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Dai</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>Y</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Barnard</surname>
<given-names>K</given-names>
</name>
</person-group>. <article-title>Attentional local contrast networks for infrared small target detection</article-title>. <source>IEEE Trans Geosci Remote Sensing</source> (<year>2021</year>) <volume>59</volume>(<issue>11</issue>):<fpage>9813</fpage>&#x2013;<lpage>24</lpage>. <pub-id pub-id-type="doi">10.1109/tgrs.2020.3044958</pub-id>
</citation>
</ref>
<ref id="B28">
<label>28.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>T</given-names>
</name>
<name>
<surname>Kornblith</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Norouzi</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Hinton</surname>
<given-names>G</given-names>
</name>
</person-group>. <article-title>A simple framework for contrastive learning of visual representations</article-title>. In: <conf-name>Proceedings of the 37th International Conference on Machine Learning</conf-name>. <publisher-name>PMLR</publisher-name> (<year>2020</year>). p. <fpage>1597</fpage>&#x2013;<lpage>607</lpage>.</citation>
</ref>
<ref id="B29">
<label>29.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>KM</given-names>
</name>
<name>
<surname>Fan</surname>
<given-names>HQ</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>YX</given-names>
</name>
<name>
<surname>Xie</surname>
<given-names>S</given-names>
</name>
<name>
<surname>Girshick</surname>
<given-names>R</given-names>
</name>
</person-group>. <article-title>Momentum contrast for unsupervised visual representation learning</article-title>. In: <conf-name>Proceedings of the 2020 IEEE/CVF Conference on Computer Vision and Pattern Recognition</conf-name>; <conf-loc>Seattle</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2020</year>). p. <fpage>9726</fpage>&#x2013;<lpage>35</lpage>.</citation>
</ref>
<ref id="B30">
<label>30.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Grill</surname>
<given-names>JB</given-names>
</name>
<name>
<surname>Strub</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Altch&#xe9;</surname>
<given-names>F</given-names>
</name>
<name>
<surname>Tallec</surname>
<given-names>C</given-names>
</name>
<name>
<surname>Richemond</surname>
<given-names>PH</given-names>
</name>
<name>
<surname>Buchatskaya</surname>
<given-names>E</given-names>
</name>
<etal/>
</person-group> <article-title>Bootstrap your own latent: A new approach to self-supervised learning</article-title>. In: <conf-name>Proceedings of the 34th International Conference on Neural Information Processing Systems (NIPS)</conf-name>; <conf-loc>Vancouver, Canada</conf-loc>. <publisher-name>Curran Associates Inc</publisher-name> (<year>2020</year>). p. <fpage>21271</fpage>&#x2013;<lpage>84</lpage>.</citation>
</ref>
<ref id="B31">
<label>31.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Caron</surname>
<given-names>M</given-names>
</name>
<name>
<surname>Misra</surname>
<given-names>I</given-names>
</name>
<name>
<surname>Mairal</surname>
<given-names>J</given-names>
</name>
<name>
<surname>Goyal</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Bojanowski</surname>
<given-names>P</given-names>
</name>
<name>
<surname>Joulin</surname>
<given-names>A</given-names>
</name>
</person-group>. <article-title>Unsupervised learning of visual features by contrasting cluster assignments</article-title>. In: <conf-name>Proceedings of the 34th International Conference on Neural Information Processing Systems</conf-name>; <conf-loc>Vancouver, Canada</conf-loc>. <publisher-name>Curran Associates Inc</publisher-name> (<year>2020</year>). p. <fpage>9912</fpage>&#x2013;<lpage>24</lpage>.</citation>
</ref>
<ref id="B32">
<label>32.</label>
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>XL</given-names>
</name>
<name>
<surname>He</surname>
<given-names>KM</given-names>
</name>
</person-group>. <article-title>Exploring simple Siamese representation learning</article-title>. In: <conf-name>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</conf-name>; <conf-loc>Nashville, USA</conf-loc>. <publisher-name>IEEE</publisher-name> (<year>2021</year>). p. <fpage>15745</fpage>&#x2013;<lpage>53</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>