<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Energy Res.</journal-id>
<journal-title>Frontiers in Energy Research</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Energy Res.</abbrev-journal-title>
<issn pub-type="epub">2296-598X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1378722</article-id>
<article-id pub-id-type="doi">10.3389/fenrg.2024.1378722</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Energy Research</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Abnormal line loss identification and category classification of distribution networks based on semi-supervised learning and hierarchical classification</article-title>
<alt-title alt-title-type="left-running-head">Li et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fenrg.2024.1378722">10.3389/fenrg.2024.1378722</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Li</surname>
<given-names>Wei</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2644020/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Wen</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Junmin</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/2641294/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Li</surname>
<given-names>Jie</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Zhao</surname>
<given-names>Yankai</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/Writing - review &#x26; editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Lvliang Power Supply Company</institution>, <institution>State Grid Shanxi Electric Power Company</institution>, <addr-line>Lvliang</addr-line>, <country>China</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>School of Electrical Engineering</institution>, <institution>Southeast University</institution>, <addr-line>Nanjing</addr-line>, <country>China</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1433333/overview">Fuqi Ma</ext-link>, Xi&#x2019;an University of Technology, China</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2647560/overview">Linwei Sang</ext-link>, University of California, Berkeley, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1377188/overview">Bi Fan</ext-link>, Shenzhen University, China</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Wei Li, <email>101012237@seu.edu.cn</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>20</day>
<month>03</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>12</volume>
<elocation-id>1378722</elocation-id>
<history>
<date date-type="received">
<day>30</day>
<month>01</month>
<year>2024</year>
</date>
<date date-type="accepted">
<day>06</day>
<month>03</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2024 Li, Zhao, Li, Li and Zhao.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Li, Zhao, Li, Li and Zhao</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>Line loss refers to the electrical energy that is dissipated as heat during the transmission and distribution of electricity through power lines. However, unusual causes, such as grid topology mismatch and communication failure, can cause abnormal line loss. Efficient abnormal line loss detection contributes not only to minimizing energy wastage and reducing carbon emissions but also to maintaining the stability and reliability of the entire distribution network. In actual situations, the cause of abnormal line loss is not labeled due to the expensive labor cost. This paper proposes a hierarchical abnormal line loss identification and category classification model, considering the unlabeled and unbalanced sample problem. First, a random forest-based abnormal line loss identification model is established to detect whether the line loss is abnormal. Then, an abnormal line loss category classification model is developed with semi-supervised learning for line loss abnormal category classification, considering the unlabeled samples. A real dataset from China is utilized to validate the performance of the proposed model. Its reliability implies the potential to be applied to real-world scenarios to improve the management level and safety of the power grid.</p>
</abstract>
<kwd-group>
<kwd>distribution network</kwd>
<kwd>line loss</kwd>
<kwd>reasoning analysis</kwd>
<kwd>semi-supervised learning</kwd>
<kwd>XGBoost</kwd>
<kwd>random forest</kwd>
</kwd-group>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Smart Grids</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>The line loss rate is an essential indicator of economy and technology in the low-voltage distribution network (DN) (<xref ref-type="bibr" rid="B14">Sayed and Takeshita, 2011</xref>; <xref ref-type="bibr" rid="B12">Luo et al., 2021</xref>; <xref ref-type="bibr" rid="B15">Sun et al., 2022</xref>). With access to distributed generation and flexible load, DN becomes increasingly complex. Meanwhile, with the increasing electricity demand, a certain quantity of line loss in DN is generated. However, limited by the metering accuracy of data acquisition devices and the reliability of transmission systems, line loss identification in DN is usually completed by labor (<xref ref-type="bibr" rid="B8">Jing et al., 2019</xref>). Due to the incomplete installation of metering instruments of low-voltage substations and customers (<xref ref-type="bibr" rid="B23">Zhu and Lin, 2021</xref>; <xref ref-type="bibr" rid="B13">Raghuvamsi et al., 2022</xref>), it is challenging to analyze the causes of line loss.</p>
<p>With the establishment of big data centers and the development of machine learning, power supply corporations have gradually started to analyze line loss based on data-driven models to improve the economic benefits. According to the data source, the data-driven line loss analysis can be divided into user-oriented data analysis and DN substation area data analysis. <xref ref-type="bibr" rid="B5">Gunturi and Sarkar (2021)</xref> proposed an electricity theft detection model based on an ensemble machine learning model. The model applied the statistical method of oversampling to solve the over-fitting problem during the training process. Based on the terminal acquisition data, the model could identify the line loss anomaly in a small-scale DN. <xref ref-type="bibr" rid="B1">Buzau et al. (2020)</xref> used a user-side line loss identification algorithm based on a hybrid deep neural network to detect non-technical losses. The algorithm integrated a long short-term memory network and a multi-layer perceptron, which were used for processing the original data and integrating non-time series data. <xref ref-type="bibr" rid="B3">Chen J. D. et al. (2023)</xref> established an electricity theft detection model based on a one-dimensional convolutional neural network. It analyzed the non-technical line loss on the user side according to the complete terminal data. The above three methods (<xref ref-type="bibr" rid="B1">Buzau et al., 2020</xref>; <xref ref-type="bibr" rid="B5">Gunturi and Sarkar, 2021</xref>; <xref ref-type="bibr" rid="B3">Chen J. D. et al., 2023</xref>) play a significant role in line loss identification on the user side. However, they are sensitive to the quality of user-side power consumption data and lack universality.</p>
<p>Regarding line loss identification in DN, a feeder loss estimation method based on the boost <italic>k</italic>-means model was developed (<xref ref-type="bibr" rid="B2">Chen J. et al., 2023</xref>). The analysis index system for line loss was established, and the multi-information index was calculated according to the time series data. The established characteristic indexes were imported into the boost <italic>k</italic>-means algorithm for clustering calculation, and the outliers were marked as line loss data. <xref ref-type="bibr" rid="B19">Wu et al. (2019)</xref> introduced an algorithm of non-technical line loss of DN identification with large samples. Based on the robust neural network model, the proposed method employed an automatic denoising encoder to pre-process data. The RNN model classified the operation data and identified the non-technical line loss value. <xref ref-type="bibr" rid="B20">Yao et al. (2019)</xref> analyzed the topology of a low-voltage DN and used the GBDT model to predict the abnormal line loss nodes in the substation area. Based on parameter clustering and deep learning algorithms, the parameter correlation and time series characteristics of a DN were fully considered by <xref ref-type="bibr" rid="B11">Liu et al. (2022)</xref> and <xref ref-type="bibr" rid="B21">Zhang et al. (2022)</xref>. The multi-variate characteristic parameters were utilized to predict line loss events in a DN. When the topology of the DN is clear and the operation parameters are complete, identifying and predicting line loss based on the data-driven algorithm in the substation area can achieve remarkable results.</p>
<p>In actual operation conditions, it is difficult to accurately measure the operational parameters in the distribution network and to obtain accurate power consumption data (<xref ref-type="bibr" rid="B10">Lin and Abur, 2018</xref>; <xref ref-type="bibr" rid="B7">Jiang and Tang, 2020</xref>). <xref ref-type="bibr" rid="B22">Zhou et al. (2022)</xref> proposed a non-technical line loss identification model based on an AP reconstruction neural network. The model reconstructed and corrected the anomaly data by the AP neural network based on the simulation dataset, followed by a deep neural network to classify the data. <xref ref-type="bibr" rid="B6">Huang et al. (2023)</xref> constructed the electrical characteristic index system of theoretical line loss, and the power torque was proposed to identify line loss in the case of missing line data in a DN. However, this method is a supervised learning algorithm, which requires a certain amount of labeled data to train the model. In recent years, analyzing the causes of line loss has become a research focus. Power supply corporations have become interested in the causes of different line loss types. <xref ref-type="bibr" rid="B9">Liang et al. (2022)</xref> proposed a line loss interval calculation method based on power flow calculation and linear optimization, which was suitable for datasets with anomalies. This method fully considered the power flow and dispatching information and analyzed the cause of area line loss. Some studies (<xref ref-type="bibr" rid="B18">Wang et al., 2019</xref>; <xref ref-type="bibr" rid="B16">Sun et al., 2023</xref>) mentioned data-driven algorithms for line loss cause analysis, locating anomalous nodes in the network topology and analyzing the abnormal causes according to parameter deviations.</p>
<p>With the increasing complexity of DNs, the accuracy of traditional line loss identification methods on the overall level of the DN is difficult to guarantee. All data-driven algorithms and statistical methods greatly rely on the data quality and data quantity, especially the labeled data. Unsupervised learning methods, such as clustering algorithms, do not need labeled data to detect abnormal line loss. However, their performance is limited, and they cannot identify the abnormal category. When the abnormal line loss data make up the majority of the dataset, a clustering algorithm will directly regard the abnormal data as normal. Supervised learning algorithms, such as neural networks and tree models, have a more stable performance than unsupervised learning algorithms. However, they need enough data to support the model training to avoid the overfitting phenomenon. In the abnormal line loss detection of a DN, the labeled data are limited due to labor consumption and time cost. Thus, the performance of supervised learning used to detect abnormal line loss with limited labeled samples cannot be guaranteed. Semi-supervised learning (<xref ref-type="bibr" rid="B17">Van Engelen et al., 2022</xref>; <xref ref-type="bibr" rid="B4">Du et al., 2024</xref>) combines unsupervised learning with supervised learning. It can utilize a large amount of unlabeled data and fewer labeled data to improve model performance and achieve a better performance than supervised learning on limited labeled data.</p>
<p>Considering the limited labeled data and unbalanced sample distribution in an actual situation, this paper proposes an abnormal line loss identification and category classification model based on semi-supervised learning and hierarchical classification. The main contributions of this paper are listed as follows: (1) A hierarchical framework of abnormal line loss identification and category classification is proposed, considering the unlabeled and unbalanced sample problem. (2) An abnormal line loss identification model based on random forest is established to identify whether substation line loss is abnormal. (3) An abnormal line loss category classification model is developed with semi-supervised learning for line loss abnormal causal reasoning, considering the unlabeled samples.</p>
<p>The structure of this paper is as follows: <xref ref-type="sec" rid="s1">Section 1</xref> provides an introduction and the relevant literature. <xref ref-type="sec" rid="s2">Section 2</xref> introduces the framework of the proposed hierarchical abnormal line loss identification and category classification model. <xref ref-type="sec" rid="s3">Section 3</xref> describes the details of data pre-processing and feature engineering. <xref ref-type="sec" rid="s4">Sections 4</xref> and <xref ref-type="sec" rid="s5">5</xref> present the details of the abnormal line loss identification model and the abnormal line loss category classification model, respectively. <xref ref-type="sec" rid="s6">Section 6</xref> displays the detailed experiment results based on the real dataset. Finally, <xref ref-type="sec" rid="s7">Section 7</xref> gives the conclusion.</p>
</sec>
<sec id="s2">
<title>2 The framework of the proposed model</title>
<p>This paper proposes an abnormal line loss identification and category classification model of a DN based on semi-supervised learning and hierarchical classification under unbalanced samples. The model is used to identify abnormal line loss in a DN and the corresponding abnormal reasons. In practical situations, there are enough labeled data for DN line loss abnormalities but few labeled data for the specific abnormal reasons. Therefore, a two-stage hierarchical classification model for identifying and reasoning abnormal line loss in a DN is proposed. In the first stage, a random forest-based abnormality identification model is established to identify whether abnormal line loss exists in the substation. In the second stage, considering the scarcity of labeled data for the specific abnormal reasons, a semi-supervised learning-based XGBoost abnormal line loss category classification model is proposed to analyze the reasons for the abnormal line loss. The overall method framework is shown in <xref ref-type="fig" rid="F1">Figure 1</xref>.<list list-type="simple">
<list-item>
<p>(1) Data pre-processing: the data on the distribution network substation area include static document data and dynamic operation data. In the actual data collection process, some data are missing. The <italic>k</italic>-nearest neighbor method is adopted to select the <italic>k</italic> samples that are most similar from the candidate set of the same substation area, and the average value of these <italic>k</italic> samples is taken to fill in the missing values.</p>
</list-item>
<list-item>
<p>(2) Feature engineering: in the substation dynamic operation data, some features are directly related to the operation state of line loss, such as the daily line loss rate, daily maximum load rate, and daily power factor. Thus, new features are generated by the statistics of these features.</p>
</list-item>
<list-item>
<p>(3) Abnormal line loss identification: the correlation analysis is carried out on all the features generated by feature engineering. The features with a high correlation coefficient are selected as the input of the abnormal line loss identification model. The dataset is divided into the training and test datasets, and the abnormal line loss identification model based on the random forest algorithm is established to identify whether the line loss is abnormal.</p>
</list-item>
<list-item>
<p>(4) Abnormal line loss category classification: category classification is performed for the identified abnormal line losses. The common abnormal line loss causes are classified into four categories: line infrastructure problems, basic document file problems, meter problems, and theft of electricity. Considering that only a few data are labeled with the abnormal category in the actual situation, a semi-supervised learning-based XGBoost abnormal line loss category classification model is proposed to achieve the causal reasoning analysis of the abnormal line loss.</p>
</list-item>
</list>
</p>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Framework of the proposed model.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g001.tif"/>
</fig>
</sec>
<sec id="s3">
<title>3 Data pre-processing and feature engineering</title>
<p>In the process of abnormal line loss identification in a DN, the data pre-processing and feature engineering of substation operation data are essential. By processing and extracting features from substation operation data, accurate and comprehensive features can be obtained, effectively improving the accuracy and reliability of abnormal line loss identification in a DN.</p>
<p>Substation data in distribution networks can be divided into two categories. One is static document data, including DN topology data, customer relationship data, the number of users, load type, transformer type, and substation load distribution. The other is dynamic operation data, including the daily input and output electricity, daily line loss rate, daily power factor, daily maximum load rate, daily voltage compliance rate, and daily three-phase imbalance degree, examples of which are shown in <xref ref-type="table" rid="T1">Table 1</xref>. For dynamic operation data, not only does data pre-processing, such as data cleaning and completion, need to be carried out, but relevant features also need to be extracted. For example, statistical measures of line loss rate, such as the average value, maximum, minimum, and variance, are significantly related to the state of line loss. The overall data processing and feature engineering processes are shown in <xref ref-type="fig" rid="F2">Figure 2</xref>.</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Examples of dynamic operation data.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Datetime</th>
<th align="center">No. of substation</th>
<th align="center">Power supply quantity</th>
<th align="center">Power sales quantity</th>
<th align="center">Line loss power</th>
<th align="center">Line loss rate</th>
<th align="center">Power factor</th>
<th align="center">Max. Load rate</th>
<th align="center">Three-phase unbalance rate</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">2022/3/1</td>
<td align="center">063488510003175</td>
<td align="center">2951.370</td>
<td align="center">2815.640</td>
<td align="center">135.73</td>
<td align="center">4.599</td>
<td align="center">0.993</td>
<td align="center">24.19</td>
<td align="center">0.546</td>
</tr>
<tr>
<td align="center">2022/3/2</td>
<td align="center">063488510003175</td>
<td align="center">2845.620</td>
<td align="center">2719.380</td>
<td align="center">126.24</td>
<td align="center">4.436</td>
<td align="center">0.994</td>
<td align="center">25.76</td>
<td align="center">0.546</td>
</tr>
<tr>
<td align="center">2022/3/3</td>
<td align="center">063488510003175</td>
<td align="center">2961.070</td>
<td align="center">2832.830</td>
<td align="center">128.24</td>
<td align="center">4.331</td>
<td align="center">0.991</td>
<td align="center">25.09</td>
<td align="center">0.504</td>
</tr>
<tr>
<td align="center">2022/3/4</td>
<td align="center">063488510003175</td>
<td align="center">2935.850</td>
<td align="center">2803.190</td>
<td align="center">132.66</td>
<td align="center">4.519</td>
<td align="center">0.991</td>
<td align="center">22.60</td>
<td align="center">0.625</td>
</tr>
<tr>
<td align="center">2022/3/5</td>
<td align="center">063488510003175</td>
<td align="center">3056.390</td>
<td align="center">2917.960</td>
<td align="center">138.43</td>
<td align="center">4.529</td>
<td align="center">0.983</td>
<td align="center">25.80</td>
<td align="center">0.500</td>
</tr>
<tr>
<td align="center">2022/3/6</td>
<td align="center">063488510003175</td>
<td align="center">2993.900</td>
<td align="center">2868.100</td>
<td align="center">125.8</td>
<td align="center">4.202</td>
<td align="center">0.984</td>
<td align="center">22.50</td>
<td align="center">0.458</td>
</tr>
<tr>
<td align="center">2022/3/7</td>
<td align="center">063488510003175</td>
<td align="center">2919.360</td>
<td align="center">2789.380</td>
<td align="center">129.98</td>
<td align="center">4.452</td>
<td align="center">0.990</td>
<td align="center">23.50</td>
<td align="center">0.540</td>
</tr>
<tr>
<td align="center">2022/3/8</td>
<td align="center">063488510003175</td>
<td align="center">2913.810</td>
<td align="center">2807.600</td>
<td align="center">106.21</td>
<td align="center">3.645</td>
<td align="center">0.989</td>
<td align="center">22.24</td>
<td align="center">0.424</td>
</tr>
<tr>
<td align="center">2022/3/9</td>
<td align="center">063488510003175</td>
<td align="center">2941.260</td>
<td align="center">2816.150</td>
<td align="center">125.11</td>
<td align="center">4.254</td>
<td align="center">0.991</td>
<td align="center">22.08</td>
<td align="center">0.417</td>
</tr>
<tr>
<td align="center">2022/3/10</td>
<td align="center">063488510003175</td>
<td align="center">2814.540</td>
<td align="center">2720.210</td>
<td align="center">94.33</td>
<td align="center">3.352</td>
<td align="center">0.987</td>
<td align="center">22.80</td>
<td align="center">0.549</td>
</tr>
<tr>
<td align="center">&#x2026;</td>
<td align="center">&#x2026;</td>
<td align="center">&#x2026;</td>
<td align="center">&#x2026;</td>
<td align="center">&#x2026;</td>
<td align="center">&#x2026;</td>
<td align="center">&#x2026;</td>
<td align="center">&#x2026;</td>
<td align="center">&#x2026;</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Data pre-processing and feature engineering.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g002.tif"/>
</fig>
<sec id="s3-1">
<title>3.1 Data pre-processing</title>
<p>Data pre-processing mainly includes filling in missing values and encoding character data.<list list-type="simple">
<list-item>
<p>(1) Character data encoding</p>
</list-item>
</list>
</p>
<p>Character data encoding is carried out for the load distribution <italic>L</italic>
<sub>
<italic>d</italic>
</sub> and abnormal line loss categories in the substation area. <xref ref-type="table" rid="T2">Table 2</xref> shows character data encoding for the load distribution in the substation area.<list list-type="simple">
<list-item>
<p>(2) Missing data filling</p>
</list-item>
</list>
</p>
<table-wrap id="T2" position="float">
<label>TABLE 2</label>
<caption>
<p>Character data encoding of substation load distribution.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Substation load distribution</th>
<th align="center">
<italic>L</italic>
<sub>
<italic>d</italic>
</sub>
</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">Uniform distribution</td>
<td align="center">1</td>
</tr>
<tr>
<td align="center">Heavy tail and light head</td>
<td align="center">2</td>
</tr>
<tr>
<td align="center">Heavy middle and light tail and head</td>
<td align="center">3</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>Upon the analysis of existing data, there were some missing data such as the daily power factor and daily three-phase unbalance in some substations. To solve this problem, the candidate set is generated by the substation. Then, the <italic>k</italic>-nearest neighbor method is adopted to select the <italic>k</italic>-samples which are the most similar from the candidate set and fill in the missing values by taking the average value of <italic>k</italic>-samples.</p>
</sec>
<sec id="s3-2">
<title>3.2 Feature engineering</title>
<p>According to the substation operation data, feature extraction is carried out on the daily power supply quantity, daily power sales quantity, daily line loss rate, daily power factor, and other data. Monthly statistical features, such as the average value, maximum value, minimum value, and variance, are generated.<list list-type="simple">
<list-item>
<p>(1) Monthly average value</p>
</list-item>
</list>
</p>
<p>For the loss rate, <italic>lr</italic>; power factor, <italic>&#x3bc;</italic>; maximum load rate, <italic>MaxL</italic>; and the three-phase voltage unbalance rate, <italic>U</italic>, the average value is calculated with the month as the statistical length, as shown in Equation <xref ref-type="disp-formula" rid="e1">1</xref>:<disp-formula id="e1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>avg</mml:mtext>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:munderover>
</mml:mstyle>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <italic>x</italic>
<sub>
<italic>i</italic>
<italic>j</italic>
</sub> indicates the measured value of the <italic>j-</italic>th day of the <italic>i-</italic>th month; <italic>x</italic> &#x3d; <italic>lr</italic>, <italic>&#x3bc;, MaxL or U</italic>; <italic>x</italic>
<sub>
<italic>i</italic>,avg</sub> indicates the average value of the indicator in the <italic>i-</italic>th month; and <italic>N</italic>
<sub>
<italic>i</italic>
</sub> indicates the total number of days in the <italic>i-</italic>th month.<list list-type="simple">
<list-item>
<p>(2) Monthly maximum/minimum value</p>
</list-item>
</list>
</p>
<p>For the daily line loss rate <italic>lr</italic> and daily maximum load rate <italic>MaxL</italic>, the maximum and minimum values are calculated with the monthly statistical length, which is defined by Equation <xref ref-type="disp-formula" rid="e2">2</xref>.<disp-formula id="e2">
<mml:math id="m2">
<mml:mrow>
<mml:mtable columnalign="left">
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>max</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>max</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
<mml:mtr>
<mml:mtd>
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>min</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>&#x2061;</mml:mo>
<mml:mi>min</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>x</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mtd>
</mml:mtr>
</mml:mtable>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <italic>x</italic>
<sub>
<italic>i</italic>
<italic>j</italic>
</sub> represents the measured value of the corresponding index on the <italic>j-</italic>th day of the <italic>i-</italic>th month.<list list-type="simple">
<list-item>
<p>(3) Monthly fluctuation rate of daily line loss</p>
</list-item>
</list>
</p>
<p>The fluctuation of the monthly line loss rate can also reflect the abnormality of the line loss to a certain extent. Considering the difference of the average line loss rate in months, the fluctuation rate of the monthly line loss is defined as in Equation <xref ref-type="disp-formula" rid="e3">3</xref> in order to remove the impact of the average level of the line loss rate on the statistical results.<disp-formula id="e3">
<mml:math id="m3">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>d</mml:mi>
<mml:mi>e</mml:mi>
<mml:mi>v</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>avg</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:msqrt>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:msubsup>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>,</mml:mo>
<mml:mtext>avg</mml:mtext>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <italic>lr</italic>
<sub>
<italic>i</italic>,avg</sub> represents the average value of the line loss rate in the <italic>i-</italic>th month.<list list-type="simple">
<list-item>
<p>(4) Monthly abnormal rate of daily line loss</p>
</list-item>
</list>
</p>
<p>If the daily line loss rate is 0, negative, or too high, to some extent, it implies that the line loss rate may also be abnormal. Therefore, in order to reduce the influence of the accidental occurrence of the abnormal daily line loss rate, this paper defines the abnormal rate of monthly line loss, <italic>&#x3b3;</italic>
<sub>
<italic>lr</italic>,<italic>i</italic>
</sub>, as shown in Equation <xref ref-type="disp-formula" rid="e4">4</xref>. In this paper, the threshold of the excessive line loss rate is set as 7%.<disp-formula id="e4">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3b3;</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mi>r</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>i</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>j</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:munderover>
</mml:mstyle>
<mml:mrow>
<mml:msub>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2264;</mml:mo>
<mml:mn>0</mml:mn>
<mml:mo>&#x2228;</mml:mo>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3e;</mml:mo>
<mml:mn>7</mml:mn>
<mml:mo>%</mml:mo>
</mml:mrow>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:msub>
<mml:mi>r</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(4)</label>
</disp-formula>
</p>
</sec>
</sec>
<sec id="s4">
<title>4 Abnormal line loss recognition based on random forest</title>
<p>Random forest is an ensemble learning algorithm based on several decision tree classifiers. The bootstrap resampling technology is used to repeatedly and randomly extract parts of the samples from the original training set to form new training sets to train multiple decision trees. The final abnormal line loss identification results are obtained by combining the results of multiple independent decision trees. Compared with the single decision tree, it has higher accuracy and stability, as shown in <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Illustration of random forest.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g003.tif"/>
</fig>
<p>The features obtained by feature engineering are taken as the input of the random forest classifier, and whether the line loss is abnormal is the output of the random forest classifier. Thus, the abnormal line loss identification of a DN is converted into a binary classification problem. The process is as follows:</p>
<p>
<statement content-type="step" id="Step_1">
<label>Step 1</label>
<p>Dataset partitioning. The initial training set and the number of features are set. Based on the bootstrap resampling method, the samples from the original training set are repeatedly and randomly selected to form the training sets <italic>D</italic>
<sub>1</sub>, &#x2026; , <italic>D</italic>
<sub>
<italic>K</italic>
</sub> to build the single decision tree. The samples that have never been sampled are used to build validation datasets to estimate the performance of the model.</p>
</statement>
</p>
<p>
<statement content-type="step" id="Step_2">
<label>Step 2</label>
<p>Construction of a single decision tree. When constructing a single decision tree, each node is split through the principle of the minimum Gini index. When the Gini index is 0, all samples in the node belong to the same category. The Gini index is calculated as in Equation <xref ref-type="disp-formula" rid="e5">5</xref>.<disp-formula id="e5">
<mml:math id="m5">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mi>i</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>&#x2212;</mml:mo>
<mml:msup>
<mml:mrow>
<mml:msubsup>
<mml:mstyle displaystyle="true">
<mml:mo>&#x2211;</mml:mo>
</mml:mstyle>
<mml:mrow>
<mml:mi>p</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mi>P</mml:mi>
</mml:msubsup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>D</mml:mi>
<mml:mi>p</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>/</mml:mo>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:mi>D</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(5)</label>
</disp-formula>where &#x7c;<italic>D</italic>&#x7c; is the number of samples in the dataset, &#x7c;<italic>D</italic>
<sub>
<italic>p</italic>
</sub>&#x7c; is the number of samples belonging to class <italic>p</italic> in the set <italic>D</italic>, and <italic>P</italic> is the number of categories.</p>
</statement>
</p>
<p>
<statement content-type="step" id="Step_3">
<label>Step 3</label>
<p>Decision tree integration. In <italic>K</italic> decision trees, the Boyer&#x2013;Moore majority vote algorithm is used to obtain the final classification result.</p>
<p>In the process of training the random forest, the depth of the decision tree <italic>M</italic>
<sub>
<italic>t</italic>
</sub>, the number of decision trees <italic>N</italic>
<sub>
<italic>t</italic>
</sub>, and the minimum number of samples in each split node <italic>S</italic>
<sub>
<italic>t</italic>
</sub> need to be determined. This paper uses grid search and cross-validation to determine the optimal hyperparameter combination.</p>
</statement>
</p>
</sec>
<sec id="s5">
<title>5 Abnormal line loss category classification based on XGBoost and semi-supervised learning</title>
<p>To deal with the unlabeled sample problem, semi-supervised learning is employed. An initial model is first trained with labeled data and then used to predict the unlabeled samples. The pseudo-labeled samples with high confidence are added to the labeled dataset and used to retrain the model to improve the classification accuracy.</p>
<sec id="s5-1">
<title>5.1 XGBoost</title>
<p>XGBoost adopts the idea of boosting. The basic idea is to stack the base classifiers layer by layer. Each layer gives a higher weight to the misclassified samples of the previous layer when training. The XGBoost tree is constructed by extending a node into two branches, and the layers of the nodes continue to split until the entire tree is formed. Starting from the depth of the tree equal to 0, each node traverses all the features and sorts them according to the value of the feature gain function, as shown in Equation <xref ref-type="disp-formula" rid="e6">6</xref>. In this way, all the features are sorted according to the contribution of the features to the objective function. Then, the feature is linearly scanned to determine the best segmentation point.<disp-formula id="e6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>G</mml:mi>
<mml:mi>a</mml:mi>
<mml:mi>i</mml:mi>
<mml:mi>n</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mfenced open="[" close="]" separators="|">
<mml:mrow>
<mml:mfrac>
<mml:msubsup>
<mml:mi>G</mml:mi>
<mml:mi>L</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2b;</mml:mo>
<mml:mfrac>
<mml:msubsup>
<mml:mi>G</mml:mi>
<mml:mi>R</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>&#x2212;</mml:mo>
<mml:mfrac>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>G</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msup>
<mml:mrow>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>L</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>H</mml:mi>
<mml:mi>R</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>&#x3bb;</mml:mi>
</mml:mrow>
</mml:mfrac>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mi>&#x3b4;</mml:mi>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(6)</label>
</disp-formula>where <italic>G</italic>
<sub>
<italic>L</italic>
</sub> represents the cumulative sum of the first-order partial derivatives of the objective function over the samples contained in the left subtree after the current node is split. <italic>G</italic>
<sub>
<italic>R</italic>
</sub> represents the cumulative sum of the first-order partial derivatives of the objective function over the samples contained in the right subtree after the current node is split. <italic>H</italic>
<sub>
<italic>L</italic>
</sub> represents the cumulative sum of the second-order partial derivatives of the objective function over the samples contained in the left subtree after the current node is split. <italic>H</italic>
<sub>
<italic>R</italic>
</sub> represents the cumulative sum of the second-order partial derivatives of the objective function over the samples contained in the right subtree after the current node is split. <italic>&#x3bb;</italic> is the regularization parameter, and <italic>&#x3b4;</italic> is the threshold to control the minimum gain of the split.</p>
</sec>
<sec id="s5-2">
<title>5.2 Abnormal line loss type classification based on XGBoost and semi-supervised learning</title>
<p>Since there are fewer labeled data for abnormal line loss types, most abnormal line losses only mark whether there is an anomaly but do not mark the specific reason for the anomaly. Therefore, this paper adopts the self-training semi-supervised learning method to model the abnormal line loss category classification. It trains an initial model with labeled data and then uses the model to predict the unlabeled data. The data with high confidence are added to the labeled dataset and used to retrain the model. The final model is obtained by iterating the process until the convergence condition is satisfied.</p>
<p>According to whether the abnormal line loss type is labeled, the dataset is divided into the labeled sample dataset <bold>D</bold>
<sub>
<bold>L</bold>
</sub> &#x3d; {(<bold>
<italic>x</italic>
</bold>
<sub>1</sub>, <italic>y</italic>
<sub>1</sub>), (<bold>
<italic>x</italic>
</bold>
<sub>2</sub>, <italic>y</italic>
<sub>2</sub>), &#x2026;, (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>n</italic>
</sub>, <italic>y</italic>
<sub>
<italic>n</italic>
</sub>)} and the unlabeled sample dataset <bold>D</bold>
<sub>
<bold>U</bold>
</sub>. The number of sample label categories is <italic>N</italic>
<sub>
<italic>c</italic>
</sub>. In self-training semi-supervised learning, the pseudo-label sample selection strategy is the key determinant of the model performance. The purpose of the pseudo-label sample selection strategy is to select the samples that are more likely to be correctly labeled from the unlabeled samples and add them to the labeled samples to form a new training set so as to further improve the accuracy and generalization performance of the model. If falsely labeled pseudo-label samples are added to the training set, the performance of the model may be degraded. In this paper, pseudo-label sample selection based on the Mahalanobis distance is adopted, and the process is as follows:</p>
<p>In the labeled sample dataset <bold>D</bold>
<sub>
<bold>L</bold>
</sub>, the samples are divided according to the sample category. The sample set of class <italic>m</italic> is denoted as <bold>D</bold>
<sub>
<bold>L</bold>
</sub>,<sub>
<italic>m</italic>
</sub> &#x3d; {(<bold>
<italic>x</italic>
</bold>
<sub>
<italic>i</italic>
</sub>, <italic>y</italic>
<sub>
<italic>i</italic>
</sub>)&#x7c; <italic>y</italic>
<sub>
<italic>i</italic>
</sub> &#x3d; <italic>m</italic>}, <italic>m</italic> &#x3d; 1, &#x2026; , <italic>N</italic>
<sub>
<italic>c</italic>
</sub>. The average value of its feature vector is calculated based on Equation <xref ref-type="disp-formula" rid="e7">7</xref>.<disp-formula id="e7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">D</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold">L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold">D</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold">L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(7)</label>
</disp-formula>
</p>
<p>In the unlabeled sample dataset <bold>D</bold>
<sub>
<bold>U</bold>
</sub>, the corresponding pseudo-labeled sample set <bold>D</bold>
<sub>
<bold>P</bold>
</sub> is obtained after labeling. <italic>y</italic>
<sub>
<italic>p</italic>,</sub> <sub>
<italic>j</italic>
</sub> is denoted as the pseudo-label of sample <italic>x</italic>
<sub>
<italic>j</italic>
</sub>, <italic>x</italic>
<sub>
<italic>j</italic>
</sub> &#x2208; <bold>D</bold>
<sub>
<bold>U</bold>
</sub>. Suppose <italic>y</italic>
<sub>
<italic>p</italic>,</sub> <sub>
<italic>j</italic>
</sub> &#x3d; <italic>m</italic>, the Mahalanobis distance between the pseudo-label sample (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>j</italic>
</sub>, <italic>y</italic>
<sub>
<italic>p</italic>,</sub> <sub>
<italic>j</italic>
</sub>) and <inline-formula id="inf1">
<mml:math id="m8">
<mml:mrow>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is calculated by Equation <xref ref-type="disp-formula" rid="e8">8</xref>.<disp-formula id="e8">
<mml:math id="m9">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x3d;</mml:mo>
<mml:msqrt>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:msubsup>
<mml:mi mathvariant="bold">C</mml:mi>
<mml:mi>m</mml:mi>
<mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:msubsup>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
</mml:msup>
</mml:mrow>
</mml:msqrt>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(8)</label>
</disp-formula>where <bold>C</bold>
<sub>
<italic>m</italic>
</sub> is the covariance matrix of <bold>D</bold>
<sub>
<bold>L</bold>
</sub>,<sub>
<italic>m</italic>
</sub>, which is shown in Equation <xref ref-type="disp-formula" rid="e9">9</xref>.<disp-formula id="e9">
<mml:math id="m10">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">C</mml:mi>
<mml:mi>m</mml:mi>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mn>1</mml:mn>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="|" close="|" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold">D</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold">L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2212;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfrac>
<mml:mrow>
<mml:mstyle displaystyle="true">
<mml:munder>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2208;</mml:mo>
<mml:msub>
<mml:mi mathvariant="bold">D</mml:mi>
<mml:mrow>
<mml:mi mathvariant="bold">L</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:munder>
</mml:mstyle>
<mml:msup>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mi mathvariant="normal">T</mml:mi>
</mml:msup>
</mml:mrow>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2212;</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(9)</label>
</disp-formula>
</p>
<p>The detailed processing of abnormal line loss category classification based on semi-supervised learning and XGBoost is shown as follows:</p>
<statement content-type="step" id="step_1">
<label>Step 1:</label>
<p>the XGB model, <italic>M</italic>, is built based on the dataset <bold>D</bold>
<sub>
<bold>L</bold>
</sub> &#x3d; {(<bold>
<italic>x</italic>
</bold>
<sub>1</sub>, <italic>y</italic>
<sub>1</sub>), (<bold>
<italic>x</italic>
</bold>
<sub>2</sub>, <italic>y</italic>
<sub>2</sub>), &#x2026;, (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>n</italic>
</sub>, <italic>y</italic>
<sub>
<italic>n</italic>
</sub>)}.</p>
</statement>
<statement content-type="step" id="step_2">
<label>Step 2:</label>
<p>the unlabeled sample set <bold>D</bold>
<sub>
<bold>U</bold>
</sub> is used as the input of model <italic>M.</italic> The corresponding pseudo-label is obtained to generate the pseudo-label sample set <bold>D</bold>
<sub>
<bold>P</bold>
</sub>
<bold>.</bold>
</p>
</statement>
<statement content-type="step" id="step_3">
<label>Step 3:</label>
<p>the distance <inline-formula id="inf2">
<mml:math id="m11">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> based on Equation <xref ref-type="disp-formula" rid="e8">8</xref> is calculated for each pseudo-label sample (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>j</italic>
</sub>, <italic>y</italic>
<sub>
<italic>p</italic>,</sub> <sub>
<italic>j</italic>
</sub>) &#x2208; <bold>D</bold>
<sub>
<bold>P</bold>
</sub>
<bold>.</bold>
</p>
</statement>
<statement content-type="step" id="step_4">
<label>Step 4:</label>
<p>If <inline-formula id="inf3">
<mml:math id="m12">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> &#x3c; <italic>&#x3b8;</italic>, where <italic>&#x3b8;</italic> is the threshold, the pseudo-label is considered acceptable. The pseudo-label sample (<bold>
<italic>x</italic>
</bold>
<sub>
<italic>j</italic>
</sub>, <italic>y</italic>
<sub>
<italic>p</italic>,</sub> <sub>
<italic>j</italic>
</sub>) is removed from <bold>D</bold>
<sub>
<bold>P</bold>
</sub> and added to <bold>D</bold>
<sub>
<bold>L</bold>
</sub>
<bold>.</bold> If <inline-formula id="inf4">
<mml:math id="m13">
<mml:mrow>
<mml:mi>d</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mover accent="true">
<mml:mi mathvariant="bold-italic">x</mml:mi>
<mml:mo>&#xaf;</mml:mo>
</mml:mover>
<mml:mi>m</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> &#x2265;<italic>&#x3b8;</italic>, it means that the pseudo-label is unreliable, and <bold>
<italic>x</italic>
</bold>
<sub>
<italic>j</italic>
</sub> is still retained in the unlabeled sample set <bold>D</bold>
<sub>
<bold>U</bold>
</sub>.
</p>
</statement>
<statement content-type="step" id="step_5">
<label>Step 5:</label>
<p>
<bold>D</bold>
<sub>
<bold>L</bold>
</sub> and <bold>D</bold>
<sub>
<bold>U</bold>
</sub> are updated<bold>.</bold>
</p>
</statement>
<statement content-type="step" id="step_6">
<label>Step 6:</label>
<p>Steps 1&#x2013;5 are repeated until the convergence condition is satisfied. The final model is used to classify the category of abnormal line loss.</p>
</statement>
</sec>
</sec>
<sec id="s6">
<title>6 Experiment and results</title>
<sec id="s6-1">
<title>6.1 Data source and experiment settings</title>
<p>In this paper, the operation data on three power supply stations in Lvliang, Shanxi Province, China, spanning half a year are used for comparison experiments. The three power stations contain 1,175 10-kV substations, which mainly include residential load, industrial load, public lighting, and commercial load. The substation operation data contain the daily active power supply, reactive power supply, line loss rate, input power, output power, power factor, maximum load rate, three-phase unbalance rate, and other data on substations spanning from May 2022 to November 2022.</p>
<p>In the experiment, the abnormality of the substation line loss is labeled by month. There are a total of 7,050 samples in the dataset, including 1,503 abnormal line loss samples and 5,547 normal line loss samples. Due to limited manpower, the abnormal causes are verified for only part of the substations, covering 988 samples, which account for 65.73% of all abnormal line loss samples. The distribution of abnormal line loss causes is shown in <xref ref-type="fig" rid="F4">Figure 4</xref>. The main cause of abnormal line loss is the meter problem, including data acquisition exceptions and meter device faults. Electricity theft accounts for the smallest proportion. Part of the reason is that electricity theft by users is difficult to confirm in practice due to user privacy. The detailed causes of different abnormal line loss categories are shown as follows:<list list-type="simple">
<list-item>
<p>&#x2022; Line infrastructure problem: too long supply wire or too small wire radius and aging of the line equipment.</p>
</list-item>
<list-item>
<p>&#x2022; Basic document files problem: distributed network topology mismatch and user-zone ownership error.</p>
</list-item>
<list-item>
<p>&#x2022; Meter problem: data collected not at the same time, meter deviation, meter device failure, and communication failure.</p>
</list-item>
<list-item>
<p>&#x2022; Theft of electricity: illegal use of electricity.</p>
</list-item>
</list>
</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Diagram of abnormal line loss cause distribution.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g004.tif"/>
</fig>
<p>In the abnormal line loss recognition model, the dataset is divided as 7:3, where 70% of the data comprises the training set and 30% of the data comprises the test set. The hyperparameters of the random forest and XGBoost model used in this paper are shown in <xref ref-type="table" rid="T3">Table 3</xref>.</p>
<table-wrap id="T3" position="float">
<label>TABLE 3</label>
<caption>
<p>Parameter settings.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">Model</th>
<th align="center">Parameter</th>
<th align="center">Value</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td rowspan="3" align="center">Random forest in abnormal line loss detection</td>
<td align="center">Maximum depth of decision tree</td>
<td align="center">10</td>
</tr>
<tr>
<td align="center">Number of decision trees</td>
<td align="center">144</td>
</tr>
<tr>
<td align="center">Minimum number of samples in each split node</td>
<td align="center">10</td>
</tr>
<tr>
<td rowspan="4" align="center">XGBoost in abnormal line loss category classification</td>
<td align="center">Maximum depth</td>
<td align="center">5</td>
</tr>
<tr>
<td align="center">Learning rate</td>
<td align="center">0.08</td>
</tr>
<tr>
<td align="center">Booster</td>
<td align="center">DART</td>
</tr>
<tr>
<td align="center">Subsample</td>
<td align="center">0.75</td>
</tr>
</tbody>
</table>
</table-wrap>
<p>In this paper, the abnormal line loss detection and category classification is a two-stage classification problem. Thus, the confusion matrix is used to display the result. In stage 1, the abnormal line loss identification is a binary classification problem.</p>
<p>In stage 2, the abnormal line loss category classification is a multi-classification task, and the evaluation metrics include accuracy, precision, recall, and the F1-score. Considering the unbalanced sample problem, this paper utilizes the macro average value, as shown in Equations <xref ref-type="disp-formula" rid="e10">10</xref>&#x2013;<xref ref-type="disp-formula" rid="e13">13</xref>. The <italic>TP</italic> is the number of the positive samples detected as positive. The <italic>TN</italic> is the number of negative samples detected as negative. The <italic>FP</italic> is the number of negative samples detected as positive. The <italic>FN</italic> is the number of positive samples detected as negative.<disp-formula id="e10">
<mml:math id="m14">
<mml:mrow>
<mml:mi>A</mml:mi>
<mml:mi>c</mml:mi>
<mml:mi>c</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(10)</label>
</disp-formula>
<disp-formula id="e11">
<mml:math id="m15">
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(11)</label>
</disp-formula>
<disp-formula id="e12">
<mml:math id="m16">
<mml:mrow>
<mml:mi>R</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mstyle displaystyle="true">
<mml:munderover>
<mml:mo>&#x2211;</mml:mo>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>c</mml:mi>
</mml:msub>
</mml:munderover>
</mml:mstyle>
<mml:mfrac>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
<mml:mrow>
<mml:mi>T</mml:mi>
<mml:msub>
<mml:mi>P</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>F</mml:mi>
<mml:msub>
<mml:mi>N</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfrac>
<mml:mo>,</mml:mo>
</mml:mrow>
</mml:math>
<label>(12)</label>
</disp-formula>
<disp-formula id="e13">
<mml:math id="m17">
<mml:mrow>
<mml:msub>
<mml:mi>F</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mfrac>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>P</mml:mi>
<mml:mo>&#xd7;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mi>P</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mi>R</mml:mi>
</mml:mrow>
</mml:mfrac>
<mml:mo>.</mml:mo>
</mml:mrow>
</mml:math>
<label>(13)</label>
</disp-formula>
</p>
</sec>
<sec id="s6-2">
<title>6.2 Results of abnormal line loss identification and category classification</title>
<p>To further analyze the performance of feature engineering, the Spearman correlation analysis is first employed to quantify the relationship between the statistical features and abnormal line loss. The result is displayed in <xref ref-type="fig" rid="F5">Figure 5</xref>. It is clear that the monthly abnormal rate of daily line loss, <italic>&#x3b3;</italic>
<sub>
<italic>lr</italic>,<italic>i</italic>
</sub>, is the most important feature in abnormal line loss identification. The maximum value of line loss, the minimum value of line loss, and the average value of line loss also have a certain correlation with abnormal line loss. The three-phase unbalance rate is the least related to abnormal line loss and is not regarded as the input of the identification model.</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Correlation analysis of the statistical features.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g005.tif"/>
</fig>
<p>The result of our proposed abnormal line loss identification and category classification model is shown in <xref ref-type="table" rid="T4">Table 4</xref>. It shows that a good performance is achieved in abnormal line loss identification. All the evaluation metrics obtain good results. <xref ref-type="fig" rid="F6">Figure 6</xref> displays the decision boundary of one decision tree of the random forest. It is clear that all the samples with negative line loss are recognized as abnormal. The sample with a high monthly abnormal rate and high minimal line loss rate is also identified as abnormal.</p>
<table-wrap id="T4" position="float">
<label>TABLE 4</label>
<caption>
<p>Results of abnormal line loss detection and category classification.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center"/>
<th align="center">Abnormal line loss detection</th>
<th align="center">Abnormal line loss category classification</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">
<italic>Acc</italic>
</td>
<td align="center">0.9768</td>
<td align="center">0.8446</td>
</tr>
<tr>
<td align="center">
<italic>P</italic>
</td>
<td align="center">0.9948</td>
<td align="center">0.7617</td>
</tr>
<tr>
<td align="center">
<italic>R</italic>
</td>
<td align="center">0.9979</td>
<td align="center">0.8124</td>
</tr>
<tr>
<td align="center">
<italic>F</italic>1</td>
<td align="center">0.9963</td>
<td align="center">0.7862</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F6" position="float">
<label>FIGURE 6</label>
<caption>
<p>Diagram of the decision boundary of one decision tree of the random forest.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g006.tif"/>
</fig>
<p>In abnormal line loss category classification, the result is not as good as that of abnormal line loss identification. The small sample size and unbalanced sample distribution significantly impact the precision and recall values. The confusion matrix of the XGBoost model is presented in <xref ref-type="fig" rid="F7">Figure 7</xref>. The classification result of electricity theft is the worst, whereas that of the meter problem is the best. The poor result for electricity theft arises because the number of electricity theft incidents is too small, which impacts the model learning. All the categories are easily misidentified as meter problems, especially electricity theft. In reality, the meter problem is the most common cause of abnormal line loss and covers different abnormal line loss scenarios, such as data error, communication problem, and data collection terminal fault. Thus, other causes are easily misidentified as meter problems.</p>
<fig id="F7" position="float">
<label>FIGURE 7</label>
<caption>
<p>Confusion matrix of abnormal line loss category classification with XGBoost and semi-supervised learning. Label &#x201c;Document&#x201d; denotes the basic document files problem. Label &#x201c;Theft&#x201d; denotes theft of electricity. Label &#x201c;Infrastructure&#x201d; denotes the line infrastructure problem. Label &#x201c;meter&#x201d; denotes the meter problem.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g007.tif"/>
</fig>
</sec>
<sec id="s6-3">
<title>6.3 Comparison experiment</title>
<p>In this section, the comparison experiments are conducted from different aspects, including abnormal line loss identification with different algorithms, abnormal line loss category classification with different algorithms, and comparison of supervised learning and semi-supervised learning.<list list-type="simple">
<list-item>
<p>1) Comparison of abnormal line loss identification with different algorithms</p>
</list-item>
</list>
</p>
<p>In abnormal line loss identification, the decision tree (DT), XGBoost, BP, and support vector machine (SVM) are utilized as the comparison algorithms. In DT, the max depth of tree is set as 12. In XGBoost, the learning rate is 0.1 and the number of estimators is set as 100. In BP, the number of hidden layers is set as 2, with 100 neurons in each hidden layer. The kernel function of SVM is the radial basis kernel function, and the regularization parameter is 1.</p>
<p>The identification results of different algorithms are shown in <xref ref-type="fig" rid="F8">Figure 8</xref>. Since the abnormal line loss identification problem is a relatively simple binary classification problem, all the algorithms can achieve a good performance. From the aspect of accuracy, BP achieves the best performance. The accuracy values of RF, DT, XGBoost, and SVM are close. From the aspect of all metrics, the RF performs the best. The precision, recall, and F1-score of the RF are the highest. The precision result of BP implies that the model launches false alarms more easily than RF. The performance of DT and SVM is the worst. Further analysis of the results shows that the monthly line loss with the negative daily line loss rate is easily recognized as abnormal. The abnormal monthly line loss with a small and positive line loss is the most difficult to detect compared to other abnormal line loss scenarios.<list list-type="simple">
<list-item>
<p>2) Comparison of abnormal line loss category classification with different algorithms</p>
</list-item>
</list>
</p>
<fig id="F8" position="float">
<label>FIGURE 8</label>
<caption>
<p>Results of abnormal line loss detection with different algorithms.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g008.tif"/>
</fig>
<p>In abnormal line loss category classification, random forest, DT, and BP are used as the comparison algorithms. In random forest, the number of decision trees is set as 105, and the maximum depth of the decision tree is set as 10. In DT, the maximum depth of the tree is set as 15. In BP, the number of hidden layers is set as 3, with 85 neurons in each hidden layer. All the algorithms are conducted with the semi-supervised learning.</p>
<p>The result of the abnormal line loss category classification is displayed in <xref ref-type="fig" rid="F9">Figure 9</xref>. It is obvious that the performance of XGBoost is the best and that of DT is the worst. Due to limited samples, the accuracy of abnormal line loss category classification is lower than that of abnormal line loss identification. In addition, the input features are generated based on monthly line loss, which cannot reflect the fluctuation of the intra-day line loss rate. In particular, the theft of electricity, which is closely related to the intra-day line loss rate, cannot be well detected.<list list-type="simple">
<list-item>
<p>3) Supervised learning vs. semi-supervised learning</p>
</list-item>
</list>
</p>
<fig id="F9" position="float">
<label>FIGURE 9</label>
<caption>
<p>Results of abnormal line loss category classification with different algorithms.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g009.tif"/>
</fig>
<p>In this section, the performance of supervised learning and semi-supervised learning is compared with XGBoost in the abnormal line loss category classification task. The supervised learning directly uses 70% of the labeled samples to train XGBoost, and the remaining 30% is used for testing. The evaluation metric results are displayed in <xref ref-type="table" rid="T5">Table 5</xref>. From <xref ref-type="table" rid="T5">Table 5</xref>, it is obvious that the classification results are significantly improved by semi-supervised learning, especially recall. This implies that the problem of missed alarms is relieved. The category of theft of electricity is the most difficult to detect. This is because of the limited electricity theft samples and because electricity theft is mostly impacted by the intra-day line loss rate. The confusion matrix of supervised learning is presented in <xref ref-type="fig" rid="F10">Figure 10</xref>. A comparison with <xref ref-type="fig" rid="F7">Figure 7</xref> shows that the classification accuracy of all the categories is enhanced by semi-supervised learning. For the semi-supervised learning, the unlabeled samples are used, which can help the model learn to increase the classification accuracy. However, the current data cannot reflect the situation of intra-day line loss, and the category classification performance is limited. To further improve the abnormal line loss category classification, detailed line loss data are needed.</p>
<table-wrap id="T5" position="float">
<label>TABLE 5</label>
<caption>
<p>Results of supervised learning and semi-supervised learning with XGBoost.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center"/>
<th align="center">Supervised learning</th>
<th align="center">Semi-supervised learning</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">
<italic>Acc</italic>
</td>
<td align="center">0.7331</td>
<td align="center">0.8446</td>
</tr>
<tr>
<td align="center">
<italic>P</italic>
</td>
<td align="center">0.5986</td>
<td align="center">0.7617</td>
</tr>
<tr>
<td align="center">
<italic>R</italic>
</td>
<td align="center">0.6334</td>
<td align="center">0.8124</td>
</tr>
<tr>
<td align="center">
<italic>F</italic>1</td>
<td align="center">0.6155</td>
<td align="center">0.7862</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F10" position="float">
<label>FIGURE 10</label>
<caption>
<p>Confusion matrix of abnormal line loss category classification with XGBoost and supervised learning. Label &#x201c;Document&#x201d; denotes the basic document files problem. Label &#x201c;Theft&#x201d; denotes theft of electricity. Label &#x201c;Infrastructure&#x201d; denotes the line infrastructure problem. Label &#x201c;meter&#x201d; denotes the meter problem.</p>
</caption>
<graphic xlink:href="fenrg-12-1378722-g010.tif"/>
</fig>
</sec>
</sec>
<sec sec-type="conclusion" id="s7">
<title>7 Conclusion</title>
<p>Abnormal line loss identification is crucial in distribution networks to guarantee a timely and safe power supply in the grid. In actual situations, the cause of abnormal line loss is not completely labeled due to the expensive labor cost. Considering the actual limited and unbalanced samples, this paper proposed a hierarchical classification framework to identify the cause of abnormal line loss. A random forest-based abnormal line loss identification model was first established to identify whether substation line loss was abnormal. Based on the results of detected abnormal line loss, an abnormal line loss category classification model was developed with semi-supervised learning and XGBoost, considering the unlabeled samples. With the help of self-training semi-supervised learning, the unlabeled samples were utilized to train the classification model to relieve over-fitting. Numerous experiments were conducted on the real dataset from China. The accuracy of abnormal line loss identification was more than 97%. The accuracy of abnormal line loss category classification was around 84% under semi-supervised learning. The results highlight the good performance of the proposed hierarchical learning structure in relieving the impact of the unbalanced samples, which is very helpful for future application.</p>
<p>In the future, more detailed abnormal line loss causes can be considered. In addition, sampling techniques to relieve the sample imbalance can be further utilized when considering the detailed abnormal line loss causes. In summary, this research highlights the application of machine learning in abnormal line loss identification and category classification, with implications for improving the management and operation of power grids.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s8">
<title>Data availability statement</title>
<p>The original contributions presented in the study are included in the article/supplementary material; further inquiries can be directed to the corresponding author.</p>
</sec>
<sec id="s9">
<title>Author contributions</title>
<p>WeL: writing&#x2013;review and editing and writing&#x2013;original draft; WeZ: writing&#x2013;review and editing, writing&#x2013;original draft, and data curation; JuL: writing&#x2013;review and editing; JiL: writing&#x2013;review and editing; YZ: writing&#x2013;review and editing.</p>
</sec>
<sec sec-type="funding-information" id="s10">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This research was funded by the science and technology project of State Grid Shanxi Electric Power Company, grant number 5205J0220002. The funder was not involved in the study design, collection, analysis, interpretation of data, the writing of this article, or the decision to submit it for publication.</p>
</sec>
<sec sec-type="COI-statement" id="s11">
<title>Conflict of interest</title>
<p>Authors WeL, WeZ, JuL, JiL, and YZ were employed by State Grid Shanxi Electric Power Company.</p>
</sec>
<sec sec-type="disclaimer" id="s12">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors, and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Buzau</surname>
<given-names>M. M.</given-names>
</name>
<name>
<surname>Tejedor-Aguilera</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Cruz-Romero</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>G&#xf3;mez-Exp&#xf3;sito</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Hybrid deep neural networks for detection of non-technical losses in electricity Smart meters</article-title>. <source>IEEE Trans. Power Syst.</source> <volume>35</volume>, <fpage>1254</fpage>&#x2013;<lpage>1263</lpage>. <pub-id pub-id-type="doi">10.1109/tpwrs.2019.2943115</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zeb</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Sun</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2023b</year>). <article-title>A power line loss analysis method based on boost clustering</article-title>. <source>J. Supercomput.</source> <volume>79</volume>, <fpage>3210</fpage>&#x2013;<lpage>3226</lpage>. <pub-id pub-id-type="doi">10.1007/s11227-022-04777-w</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Chen</surname>
<given-names>J. D.</given-names>
</name>
<name>
<surname>Nanehkaran</surname>
<given-names>Y. A.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>W. R.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Y. J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>D. F.</given-names>
</name>
</person-group> (<year>2023a</year>). <article-title>Data-driven intelligent method for detection of electricity theft</article-title>. <source>Int. J. Electr. Power and Energy Syst.</source> <volume>148</volume>, <fpage>108948</fpage>. <pub-id pub-id-type="doi">10.1016/j.ijepes.2023.108948</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Du</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Wu</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>S.</given-names>
</name>
</person-group> (<year>2024</year>). <article-title>Semi-supervised imbalanced multi-label classification with label propagation</article-title>. <source>Pattern Recognit.</source> <volume>150</volume>, <fpage>110358</fpage>. <pub-id pub-id-type="doi">10.1016/j.patcog.2024.110358</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Gunturi</surname>
<given-names>S. K.</given-names>
</name>
<name>
<surname>Sarkar</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Ensemble machine learning models for the detection of energy theft</article-title>. <source>Electr. Power Syst. Res.</source> <volume>192</volume>, <fpage>106904</fpage>. <pub-id pub-id-type="doi">10.1016/j.epsr.2020.106904</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Huang</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Calculation method of theoretical line loss in low-voltage grids based on improved random forest algorithm</article-title>. <source>Energies</source> <volume>16</volume>, <fpage>2971</fpage>. <pub-id pub-id-type="doi">10.3390/en16072971</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jiang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Tang</surname>
<given-names>H. B.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Distribution line parameter estimation considering dynamic operating states with a probabilistic graphical model</article-title>. <source>Int. J. Electr. Power and Energy Syst.</source> <volume>121</volume>, <fpage>106133</fpage>. <pub-id pub-id-type="doi">10.1016/j.ijepes.2020.106133</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jing</surname>
<given-names>T. T.</given-names>
</name>
<name>
<surname>Dai</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Xi</surname>
<given-names>H. J.</given-names>
</name>
<name>
<surname>Hu</surname>
<given-names>Y.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Method for theoretical line loss calculation of 10kV distribution district based on actual electric energy of distribution transformer secondary side</article-title>. <source>Iop Int. Conf. Civ. Archit. Disaster Prev.</source> <volume>218</volume>, <fpage>1</fpage>&#x2013;<lpage>6</lpage>. <pub-id pub-id-type="doi">10.1088/1755-1315/218/1/012152</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liang</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Ma</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Line loss interval algorithm for distribution network with DG based on linear optimization under abnormal or missing measurement data</article-title>. <source>Energies</source> <volume>15</volume>, <fpage>4158</fpage>. <pub-id pub-id-type="doi">10.3390/en15114158</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Lin</surname>
<given-names>Y. Z.</given-names>
</name>
<name>
<surname>Abur</surname>
<given-names>A.</given-names>
</name>
</person-group> (<year>2018</year>). <article-title>A new framework for detection and identification of network parameter errors</article-title>. <source>IEEE Trans. Smart Grid</source> <volume>9</volume>, <fpage>1698</fpage>&#x2013;<lpage>1706</lpage>. <pub-id pub-id-type="doi">10.1109/tsg.2016.2597286</pub-id>
</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Liu</surname>
<given-names>K. Y.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Kang</surname>
<given-names>Z. J.</given-names>
</name>
<name>
<surname>Luo</surname>
<given-names>L.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Anomaly detection method of distribution network line loss based on hybrid clustering and LSTM</article-title>. <source>J. Electr. Eng. Technol.</source> <volume>17</volume>, <fpage>1131</fpage>&#x2013;<lpage>1141</lpage>. <pub-id pub-id-type="doi">10.1007/s42835-021-00958-4</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Luo</surname>
<given-names>F. Z.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Yao</surname>
<given-names>L. Z.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>L. Z.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>D. W.</given-names>
</name>
<name>
<surname>Qian</surname>
<given-names>M. H.</given-names>
</name>
</person-group> (<year>2021</year>). &#x201c;<article-title>Flexible load active management method in optimization operation of distribution networks</article-title>,&#x201d; in <conf-name>Proceedings of the 2021 3rd asia energy and electrical engineering symposium (AEEES 2021)</conf-name>, <conf-loc>Chengdu, China</conf-loc> <fpage>413</fpage>&#x2013;<lpage>418</lpage>. <pub-id pub-id-type="doi">10.1109/AEEES51875.2021.9403086</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Raghuvamsi</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Teeparthi</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Kosana</surname>
<given-names>V.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A novel deep learning architecture for distribution system topology identification with missing PMU measurements</article-title>. <source>Results Eng.</source> <volume>15</volume>, <fpage>100543</fpage>. <pub-id pub-id-type="doi">10.1016/j.rineng.2022.100543</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sayed</surname>
<given-names>M. A.</given-names>
</name>
<name>
<surname>Takeshita</surname>
<given-names>T.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>All nodes voltage regulation and line loss minimization in loop distribution systems using UPFC</article-title>. <source>IEEE Trans. Power Electron.</source> <volume>26</volume>, <fpage>1694</fpage>&#x2013;<lpage>1703</lpage>. <pub-id pub-id-type="doi">10.1109/tpel.2010.2090048</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>Y. F.</given-names>
</name>
<name>
<surname>Zeng</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>J. H.</given-names>
</name>
<name>
<surname>Shi</surname>
<given-names>J. D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Optimization planning method of distributed generation based on steady-state security region of distribution network</article-title>. <source>Energy Rep.</source> <volume>8</volume>, <fpage>4209</fpage>&#x2013;<lpage>4222</lpage>. <pub-id pub-id-type="doi">10.1016/j.egyr.2022.03.078</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Sun</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Xuan</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Huang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Cao</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023</year>). <article-title>Traceability analysis for low-voltage distribution network abnormal line loss using a data-driven power flow model</article-title>. <source>Front. Energy Res.</source> <volume>11</volume>, <fpage>1272095</fpage>. <pub-id pub-id-type="doi">10.3389/fenrg.2023.1272095</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Van</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Jesper</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Hoos</surname>
<given-names>H. H.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A survey on semi-supervised learning</article-title>. <source>Mach. Learn.</source> <volume>109</volume> (<issue>2</issue>), <fpage>373</fpage>&#x2013;<lpage>440</lpage>. <pub-id pub-id-type="doi">10.1007/s10994-019-05855-6</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Wang</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Bai</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Xing</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2019</year>). &#x201c;<article-title>Development of synchronous line loss analysis and diagnosis system based on arbitrary segmentation of power grid</article-title>,&#x201d; in <conf-name>Proceedings of the 2019 IEEE 4th advanced information technology, electronic and automation control conference (IAEAC)</conf-name>, <conf-loc>Chengdu, China</conf-loc> <fpage>1840</fpage>&#x2013;<lpage>1844</lpage>.</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wu</surname>
<given-names>W.</given-names>
</name>
<name>
<surname>Cheng</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Zhou</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>B.</given-names>
</name>
<name>
<surname>Zang</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xu</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2019</year>). <article-title>Benchmarking daily line loss rates of low voltage transformer regions in power grid based on robust neural network</article-title>. <source>Appl. Sci.</source> <volume>9</volume>, <fpage>5565</fpage>. <pub-id pub-id-type="doi">10.3390/app9245565</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yao</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Zhu</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Li</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>He</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2019</year>). <article-title>Research on predicting line loss rate in low voltage distribution network based on gradient boosting decision tree</article-title>. <source>Energies</source> <volume>12</volume>, <fpage>2522</fpage>. <pub-id pub-id-type="doi">10.3390/en12132522</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhang</surname>
<given-names>Z. L.</given-names>
</name>
<name>
<surname>Yang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Zhao</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Xiao</surname>
<given-names>R.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Prediction method of line loss rate in low&#x2010;voltage distribution network based on multi&#x2010;dimensional information matrix and dimensional attention mechanism&#x2010;long&#x2010;and short&#x2010;term time&#x2010;series network</article-title>. <source>IET Gener. Transm. Distribution</source> <volume>16</volume>, <fpage>4187</fpage>&#x2013;<lpage>4203</lpage>. <pub-id pub-id-type="doi">10.1049/gtd2.12590</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="confproc">
<person-group person-group-type="author">
<name>
<surname>Zhou</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Xue</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Feng</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Dong</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Qu</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2022</year>). &#x201c;<article-title>Abnormal line loss data detection and correction method</article-title>,&#x201d; in <conf-name>Proceedings of the 2022 4th asia energy and electrical engineering symposium (AEEES)</conf-name>, <conf-loc>Chengdu, China</conf-loc> <fpage>832</fpage>&#x2013;<lpage>837</lpage>. <pub-id pub-id-type="doi">10.1109/AEEES54426.2022.9759815</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Zhu</surname>
<given-names>L. P.</given-names>
</name>
<name>
<surname>Lin</surname>
<given-names>J. J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Learning spatiotemporal correlations for missing noisy PMU data correction in Smart grid</article-title>. <source>IEEE Internet Things J.</source> <volume>8</volume>, <fpage>7589</fpage>&#x2013;<lpage>7599</lpage>. <pub-id pub-id-type="doi">10.1109/jiot.2020.3040195</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>