<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.3" xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Genet.</journal-id>
<journal-title>Frontiers in Genetics</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Genet.</abbrev-journal-title>
<issn pub-type="epub">1664-8021</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="publisher-id">1269255</article-id>
<article-id pub-id-type="doi">10.3389/fgene.2023.1269255</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Genetics</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>Improving predictive ability in sparse testing designs in soybean populations</article-title>
<alt-title alt-title-type="left-running-head">Persa et al.</alt-title>
<alt-title alt-title-type="right-running-head">
<ext-link ext-link-type="uri" xlink:href="https://doi.org/10.3389/fgene.2023.1269255">10.3389/fgene.2023.1269255</ext-link>
</alt-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name>
<surname>Persa</surname>
<given-names>Reyna</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1887427/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/data-curation/"/>
<role content-type="https://credit.niso.org/contributor-roles/formal-analysis/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Canella Vieira</surname>
<given-names>Caio</given-names>
</name>
<xref ref-type="aff" rid="aff2">
<sup>2</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1497546/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Rios</surname>
<given-names>Esteban</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1205089/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Hoyos-Villegas</surname>
<given-names>Valerio</given-names>
</name>
<xref ref-type="aff" rid="aff3">
<sup>3</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/734990/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Messina</surname>
<given-names>Carlos D.</given-names>
</name>
<xref ref-type="aff" rid="aff4">
<sup>4</sup>
</xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name>
<surname>Runcie</surname>
<given-names>Daniel</given-names>
</name>
<xref ref-type="aff" rid="aff5">
<sup>5</sup>
</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1693038/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name>
<surname>Jarquin</surname>
<given-names>Diego</given-names>
</name>
<xref ref-type="aff" rid="aff1">
<sup>1</sup>
</xref>
<xref ref-type="corresp" rid="c001">&#x2a;</xref>
<uri xlink:href="https://loop.frontiersin.org/people/1824842/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1">
<sup>1</sup>
<institution>Agronomy Department</institution>, <institution>University of Florida</institution>, <addr-line>Gainesville</addr-line>, <addr-line>FL</addr-line>, <country>United States</country>
</aff>
<aff id="aff2">
<sup>2</sup>
<institution>Crop, Soil, and Environmental Sciences</institution>, <institution>Bumpers College</institution>, <institution>University of Arkansas</institution>, <addr-line>Fayetteville</addr-line>, <addr-line>AR</addr-line>, <country>United States</country>
</aff>
<aff id="aff3">
<sup>3</sup>
<institution>Department of Plant Science</institution>, <institution>McGill University</institution>, <addr-line>Montreal</addr-line>, <addr-line>QC</addr-line>, <country>Canada</country>
</aff>
<aff id="aff4">
<sup>4</sup>
<institution>Horticultural Sciences Department</institution>, <institution>University of Florida</institution>, <addr-line>Gainesville</addr-line>, <addr-line>FL</addr-line>, <country>United States</country>
</aff>
<aff id="aff5">
<sup>5</sup>
<institution>Department of Plant Sciences</institution>, <institution>University of California Davis</institution>, <addr-line>Davis</addr-line>, <addr-line>CA</addr-line>, <country>United States</country>
</aff>
<author-notes>
<fn fn-type="edited-by">
<p>
<bold>Edited by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/738478/overview">Alencar Xavier</ext-link>, Corteva Agriscience, United States</p>
</fn>
<fn fn-type="edited-by">
<p>
<bold>Reviewed by:</bold> <ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2400092/overview">Diana Marcela Escamilla Sanchez</ext-link>, Iowa State University, United States</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/1059159/overview">Asif Bashir Shikari</ext-link>, Sher-e-Kashmir University of Agricultural Sciences and Technology of Kashmir, India</p>
<p>
<ext-link ext-link-type="uri" xlink:href="https://loop.frontiersin.org/people/2401257/overview">Cleiton Antonio Wartha</ext-link>, Corteva Agriscience, United States</p>
</fn>
<corresp id="c001">&#x2a;Correspondence: Diego Jarquin, <email>jhernandezjarqui@ufl.edu</email>
</corresp>
</author-notes>
<pub-date pub-type="epub">
<day>23</day>
<month>11</month>
<year>2023</year>
</pub-date>
<pub-date pub-type="collection">
<year>2023</year>
</pub-date>
<volume>14</volume>
<elocation-id>1269255</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>07</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>23</day>
<month>10</month>
<year>2023</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#xa9; 2023 Persa, Canella Vieira, Rios, Hoyos-Villegas, Messina, Runcie and Jarquin.</copyright-statement>
<copyright-year>2023</copyright-year>
<copyright-holder>Persa, Canella Vieira, Rios, Hoyos-Villegas, Messina, Runcie and Jarquin</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/">
<p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p>
</license>
</permissions>
<abstract>
<p>The availability of high-dimensional genomic data and advancements in genome-based prediction models (GP) have revolutionized and contributed to accelerated genetic gains in soybean breeding programs. GP-based sparse testing is a promising concept that allows increasing the testing capacity of genotypes, environments, or genotype-in-environment combinations at a fixed cost, or substantially reducing costs at a fixed testing capacity. This study represents the first attempt to implement GP-based sparse testing in soybeans by evaluating different training set compositions, ranging from non-overlapping RILs to nearly the other extreme of having the same set of genotypes observed across environments, for different training set sizes. A total of 1,755 recombinant inbred lines (RILs) tested in nine environments were used in this study. RILs were derived from 39 bi-parental populations of the Soybean Nested Association Mapping (NAM) project. The predictive abilities of various models and training set sizes and compositions were investigated. Training compositions included a range of ratios of overlapping (O-RILs) and non-overlapping (NO-RILs) RILs across environments, as well as a methodology to maximize or minimize the genetic diversity in a fixed-size sample. Reducing the training set size compromised predictive ability in most training set compositions. Overall, maximizing the genetic diversity within the training set and the inclusion of O-RILs increased prediction accuracy given a fixed training set size; however, the most complex model was less affected by these factors. More testing environments in the early stages of the breeding pipeline can provide a more comprehensive assessment of genotype stability and adaptation which are fundamental for the precise selection of superior genotypes adapted to a wide range of environments.</p>
</abstract>
<kwd-group>
<kwd>sparse testing</kwd>
<kwd>genomic prediction</kwd>
<kwd>plant breeding</kwd>
<kwd>soybean</kwd>
<kwd>experimental design</kwd>
<kwd>genotype-by-environment interaction</kwd>
</kwd-group>
<contract-num rid="cn001">General Startup Support</contract-num>
<contract-sponsor id="cn001">University of Florida<named-content content-type="fundref-id">10.13039/100007698</named-content>
</contract-sponsor>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Genomics of Plants and the Phytoecosystem</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec id="s1">
<title>1 Introduction</title>
<p>Soybean [<italic>Glycine max</italic> (L.) Merr.] delivers more protein per hectare than any other crop and accounts for over 60% of total global oilseed production (<xref ref-type="bibr" rid="B29">United States Department of Agriculture, 2022a</xref>). It is the largest and most concentrated segment of global agricultural trade and one of the most essential crops to the world&#x2019;s food security (<xref ref-type="bibr" rid="B10">Gale et al., 2019</xref>). Soybean production has nearly doubled over the last two&#xa0;decades (182,830 to 363,860&#xa0;MT) (<xref ref-type="bibr" rid="B30">United States Department of Agriculture, 2022b</xref>). The genetic improvement of soybean cultivars, as well as advancements in farming technology and agronomic practices, has significantly contributed to this substantial increase (<xref ref-type="bibr" rid="B28">Specht et al., 1999</xref>; <xref ref-type="bibr" rid="B25">Rowntree et al., 2013</xref>; <xref ref-type="bibr" rid="B18">Koester et al., 2014</xref>; <xref ref-type="bibr" rid="B24">Rincker et al., 2014</xref>; <xref ref-type="bibr" rid="B2">Canella Vieira and Chen, 2021</xref>). A typical soybean breeding pipeline consists of several years of multi-environment field trials to select and advance high-yielding breeding lines (<xref ref-type="bibr" rid="B2">Canella Vieira and Chen, 2021</xref>; <xref ref-type="bibr" rid="B36">Yoosefzadeh-Najafabadi and Rajcan, 2022</xref>). 
The availability of high-dimensional genomic data (<xref ref-type="bibr" rid="B26">Song et al., 2013</xref>; <xref ref-type="bibr" rid="B27">2020</xref>) and the advancements in genome-based prediction models (GP) have revolutionized and contributed to accelerated genetic gains as well as higher testing efficiency in soybean breeding programs (<xref ref-type="bibr" rid="B14">Jarquin et al., 2014a</xref>; <xref ref-type="bibr" rid="B17">Jarquin et al., 2014b</xref>; <xref ref-type="bibr" rid="B23">Persa et al., 2020</xref>; <xref ref-type="bibr" rid="B33">Widener et al., 2021</xref>; <xref ref-type="bibr" rid="B3">Canella Vieira et al., 2022</xref>).</p>
<p>The concept of GP revolves around using the information of all molecular markers&#x2014;regardless of estimated effect size or significance&#x2014;to develop prediction models of the genetic merit for the phenotype of interest in unobserved genotypes (<xref ref-type="bibr" rid="B20">Meuwissen et al., 2001</xref>). Thus, GP allows the identification and selection of desirable genotypes earlier in the breeding pipeline, which not only reduces cost, time, and space but also enhances genetic gain by shortening the length of the breeding cycle and increasing selection intensity (<xref ref-type="bibr" rid="B14">Jarquin et al., 2014a</xref>; <xref ref-type="bibr" rid="B6">Crossa et al., 2017</xref>; <xref ref-type="bibr" rid="B2">Canella Vieira and Chen, 2021</xref>; <xref ref-type="bibr" rid="B32">Wartha and Lorenz, 2021</xref>; <xref ref-type="bibr" rid="B3">Canella Vieira et al., 2022</xref>). However, the presence of the genotype-by-environment (G&#xd7;E) interaction, a change in the response patterns from one environment to another, complicates the selection of improved cultivars (<xref ref-type="bibr" rid="B5">Crossa et al., 2011</xref>). For this reason, it is necessary to establish multi-environment trials (METs) to evaluate the performance of genotypes under a wide range of weather conditions (environmental stimuli) allowing the selection of stable materials or materials with local adaptation only (<xref ref-type="bibr" rid="B15">Jarquin et al., 2020a</xref>). As expected, the high phenotyping cost does not permit the evaluation of all candidate genotypes in all of the environments of interest but a fraction of these combinations of genotypes-in-environments (<xref ref-type="bibr" rid="B16">Jarquin et al., 2020b</xref>). 
To overcome these disadvantages (i.e., G&#xd7;E and the high phenotyping costs), the implementation of the reaction norm model (<xref ref-type="bibr" rid="B14">Jarquin et al., 2014a</xref>) leverages the borrowing of information of genotypes across environments helping to increase the predictability of unobserved combinations of genotypes-in-environments.</p>
<p>In addition, GP including G&#xd7;E model parameters can substantially improve field testing design and efficiency, as well as resource allocation (<xref ref-type="bibr" rid="B16">Jarquin et al., 2020b</xref>; <xref ref-type="bibr" rid="B21">Montesinos Lopez et al., 2023a</xref>). For instance, GP can reduce the costs and space associated with field testing by using sparse testing designs in which only a subset of the genotypes are tested at each location (<xref ref-type="bibr" rid="B16">Jarquin et al., 2020b</xref>). Sparse testing allows the prediction of non-observed genotype-in-environment combinations reducing the costs at a fixed evaluation capacity (less expensive to make accurate inferences on the original set of genotypes-in-environment combinations) or increasing the overall evaluation capacity at fixed costs (inferences on more genotypes and environment combinations based on the original budget) (<xref ref-type="bibr" rid="B16">Jarquin et al., 2020b</xref>).</p>
<p>Using two maize (<italic>Zea mays</italic> L.) data sets from the International Maize and Wheat Improvement Center (CIMMYT)&#x2019;s breeding program in eastern Africa, <xref ref-type="bibr" rid="B16">Jarquin et al. (2020b)</xref> were the first to demonstrate that GP models could substantially reduce the testing footprint of breeding programs using sparse testing designs. Additional studies of GP-based sparse testing designs have been reported in wheat (<italic>Triticum</italic> L.) (<xref ref-type="bibr" rid="B4">Crespo-Herrera et al., 2021</xref>; <xref ref-type="bibr" rid="B13">He et al., 2021</xref>; <xref ref-type="bibr" rid="B1">Atanda et al., 2022</xref>; <xref ref-type="bibr" rid="B21">Montesinos Lopez et al., 2023a</xref>; <xref ref-type="bibr" rid="B22">Montesinos Lopez et al., 2023b</xref>), maize (<xref ref-type="bibr" rid="B22">Montesinos Lopez et al., 2023b</xref>), groundnut (<italic>Arachis hypogaea</italic> L.) (<xref ref-type="bibr" rid="B21">Montesinos Lopez et al., 2023a</xref>), and rice (<italic>Oryza sativa</italic> L.) (<xref ref-type="bibr" rid="B13">He et al., 2021</xref>; <xref ref-type="bibr" rid="B22">Montesinos Lopez et al., 2023b</xref>). To date, no applications of GP-based sparse testing have been reported in soybean. Therefore, the objective of this study is to investigate the potential of reducing the field testing footprint (less natural resources such as land and water associated with the in-field evaluation of RILs) in soybean breeding programs based on sparse testing designs, as well as the prediction accuracy derived from different GP models including the main effect of the molecular markers via covariance structures (M1), a multiplicative reaction norm model (M2) to account for the genotype-by-environment (G&#xd7;E) interaction, and an extended reaction norm model also including the family structure (M3) in interaction with environmental stimuli (<xref ref-type="bibr" rid="B23">Persa et al., 2020</xref>). 
Two different methods for model calibration were considered. The first one is initially based on randomly selected RILs, and then varies the sample size and training composition (between non-overlapping [NO-RILs] and overlapping [O-RILs] genotypes across environments) for a fixed testing set size. The second one only varies the training set size, since it is based on common sets of genotypes observed across environments and selected under a genetic criterion/algorithm, where the goal is to select a core sample of RILs that maximizes/minimizes the genomic diversity in a sample of fixed size. The objectives of implementing this second selection method were to assess the impacts on predictive ability of using different levels of genomic diversity of the RILs when calibrating models, and to evaluate the stability of these selected RILs across environments. The impacts on predictive ability of these selection methods were evaluated using a soybean population of 1,755 genotypes evaluated in nine environments (all genotypes in all environments).</p>
</sec>
<sec sec-type="materials|methods" id="s2">
<title>2 Materials and methods</title>
<sec id="s2-1">
<title>2.1 SoyNAM dataset</title>
<p>Phenotypic and genomic data from the Soybean Nested Association Mapping (SoyNAM) experiment (<ext-link ext-link-type="uri" xlink:href="https://www.soybase.org/SoyNAM/">https://www.soybase.org/SoyNAM/</ext-link>) were used in this study. Briefly, the SoyNAM data is comprised of 5,600 recombinant inbred lines (RILs) derived from 40 bi-parental populations (140 RILs per population) corresponding to 40 founders belonging to three different genetic backgrounds [G1: 17 high-yielding lines, G2: 15 diverse ancestries, and G3: eight exotic plant introductions (PI)] crossed with a common hub parent (IA3023) (<xref ref-type="bibr" rid="B8">Diers et al., 2018</xref>). The common parent and founder lines, the RILs and check cultivars were grown in two-row field plots (0.76&#xa0;m spacing; <italic>ca</italic>. 4&#xa0;m long) and phenotyped for nine agronomic traits including grain yield (kg ha<sup>-1</sup>), plant height (cm), seed protein and oil (% dry weight), days to maturity (number of days from planting when 95% of the plants reach physiological maturity), seed size (100 seeds weight in grams), fiber content (percentage in the grain), lodging (score from 1 to 5), and shattering (score from 1 to 5). Initially, the RILs from each family were split in four sets of 35 and each set was augmented with the two parents of the family and three check cultivars selected for adaptation to the field environment; however, if there was not enough seed available for a RIL, the plot was completed with a check variety. RILs were genotyped using the Illumina Infinium BARCSoySNP6K BeadChip (<xref ref-type="bibr" rid="B27">Song et al., 2020</xref>). After filtering molecular markers with more than 20% of missing values and minor allele frequency smaller than 0.03, a subset of 4,100 single nucleotide polymorphisms (SNPs) was available for data analysis.</p>
<p>Balanced multi-environment field experiments where all genotypes are observed in all testing environments are essential to assess the efficacy and advantages of the sparse testing design. For this, environments with less than 1,500 overlapping RILs across all testing environments were discarded. After applying this criterion to the phenotypic data, a total of 1,755 RILs derived from 39 bi-parental families (16 G1, 15 G2, and eight G3) remained for analyses (all RILs tested in all nine environments). The nine environments considered in this study were located across five states (Iowa, Illinois, Indiana, Kansas, and Nebraska) in 2012 and 2013. These included Iowa 2012 (IA_2012), Iowa 2013 (IA_2013), Illinois 2012 (IL_2012), Illinois 2013 (IL_2013), Indiana 2012 (IN_2012), Indiana 2013 (IN_2013), Kansas 2012 (KS_2012), Kansas 2013 (KS_2013), and Nebraska 2012 (NE_2012).</p>
</sec>
<sec id="s2-2">
<title>2.2 Training set selection and composition methods</title>
<p>Two different selection methods were considered to compose calibration sets. The first method (S1) is based on randomly selecting (for each replicate) sets of 195 RILs (1,755 divided by nine) and assigning them, without overlap, to each one of the nine environments (total of five replicates). In this case, the 1,755 phenotypic observations measured across nine environments correspond to roughly 11% of all the total potential RILs-in-environment combinations (1,755 &#xd7; nine environments &#x3d; 15,795). The objective is to predict the remaining 1,560 non-observed RILs (1,755&#x2013;195) in each environment for a total of 14,040 (1,560 &#xd7; nine environments) missing combinations across all environments. In addition, different training set sizes were considered by systematically reducing the training set size by groups of 10 RILs from 195 to 95 RILs within each environment. For instance, by reducing the initial training set size (195) by 10 RILs, the training set size across environments was reduced to 1,665 (1,755&#x2013;90). By reducing the training set by 100 RILs, the training set size across environments was reduced to 855 (1,755&#x2013;900). The different within environments training set sizes varied from 195 to 95 RILs (or equivalently from 1,755 to 855 across environments).</p>
<p>In addition, for each of the training set sizes (195, 185, 175, 165, 155, 145, 135, 125, 115, 105, and 95), different training compositions consisting of non-overlapping (NO-RILs) and overlapping (O-RILs) genotypes across environments were considered under the S1 selection method (<xref ref-type="table" rid="T1">Table 1</xref>). To compose these, the starting point was the original sets of 195 NO-RILs that were assigned to each environment. Then, within each environment, 10 RILs were masked as non-observed reducing the total number to 185 NO-RILs. For the within environments training set size of 195 NO-RILs, after masking 10 RILs as non-observed the total number of NO-RILs is reduced to 185. Out of the 90 RILs masked as non-observed (10 non-observed RILs &#xd7; nine environments), 10 were randomly selected as O-RILs to be observed across all environments. The total number of 195 RILs was consistently observed within each environment such that 185 were NO-RILs and 10 were O-RILs (<xref ref-type="table" rid="T1">Table 1</xref>). Thus, the total number of unique tested RILs across environments was reduced from 1,755 to 1,675 (185 NO-RILs &#xd7; nine environments &#x2b;10 O-RILs) and the number of NO-RILs across environments was reduced to 1,665 (185 NO-RILs &#xd7; nine environments). The removal of 10 NO-RILs within environments and redistribution as 10 O-RILs across environments were conducted systematically until reaching the five NO-RILs and 190 O-RILs composition within environments (<xref ref-type="fig" rid="F1">Figure 1</xref>). In such composition, the total number of testing plots remained at 1,755 (195 RILs &#xd7; nine environments), but the total number of unique RILs across environments was reduced to 235 (five NO-RILs &#xd7; nine environments &#x2b;190 O-RILs), and the number of NO-RILs across environments was reduced to 45 (five NO-RILs &#xd7; nine environments) (<xref ref-type="fig" rid="F1">Figure 1</xref>). 
The removal of 10 NO-RILs within environments and redistribution as 10 O-RILs across environments was conducted systematically in each training set size ranging from 195 to 95 RILs per environment (<xref ref-type="table" rid="T1">Table 1</xref>).</p>
<table-wrap id="T1" position="float">
<label>TABLE 1</label>
<caption>
<p>Summary of different training set sizes and compositions for selection method S1.</p>
</caption>
<table>
<thead valign="top">
<tr>
<th align="center">RILs</th>
<th colspan="20" align="center">Training set composition (non-overlapping RILs - overlapping RILs)</th>
</tr>
</thead>
<tbody valign="top">
<tr>
<td align="center">195</td>
<td align="center">195-0</td>
<td align="center">185-10</td>
<td align="center">175-20</td>
<td align="center">165-30</td>
<td align="center">155-40</td>
<td align="center">145-50</td>
<td align="center">135-60</td>
<td align="center">125-70</td>
<td align="center">115-80</td>
<td align="center">105-90</td>
<td align="center">95-100</td>
<td align="center">85-110</td>
<td align="center">75-120</td>
<td align="center">65-130</td>
<td align="center">55-140</td>
<td align="center">45-150</td>
<td align="center">35-160</td>
<td align="center">25-170</td>
<td align="center">15-180</td>
<td align="center">5-190</td>
</tr>
<tr>
<td align="center">185</td>
<td align="center">-</td>
<td align="center">185-0</td>
<td align="center">175-10</td>
<td align="center">165-20</td>
<td align="center">155-30</td>
<td align="center">145-40</td>
<td align="center">135-50</td>
<td align="center">125-60</td>
<td align="center">115-70</td>
<td align="center">105-80</td>
<td align="center">95-90</td>
<td align="center">85-100</td>
<td align="center">75-110</td>
<td align="center">65-120</td>
<td align="center">55-130</td>
<td align="center">45-140</td>
<td align="center">35-150</td>
<td align="center">25-160</td>
<td align="center">15-170</td>
<td align="center">5-180</td>
</tr>
<tr>
<td align="center">175</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">175-0</td>
<td align="center">165-10</td>
<td align="center">155-20</td>
<td align="center">145-30</td>
<td align="center">135-40</td>
<td align="center">125-50</td>
<td align="center">115-60</td>
<td align="center">105-70</td>
<td align="center">95-80</td>
<td align="center">85-90</td>
<td align="center">75-100</td>
<td align="center">65-110</td>
<td align="center">55-120</td>
<td align="center">45-130</td>
<td align="center">35-140</td>
<td align="center">25-150</td>
<td align="center">15-160</td>
<td align="center">5-170</td>
</tr>
<tr>
<td align="center">165</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">165-0</td>
<td align="center">155-10</td>
<td align="center">145-20</td>
<td align="center">135-30</td>
<td align="center">125-40</td>
<td align="center">115-50</td>
<td align="center">105-60</td>
<td align="center">95-70</td>
<td align="center">85-80</td>
<td align="center">75-90</td>
<td align="center">65-100</td>
<td align="center">55-110</td>
<td align="center">45-120</td>
<td align="center">35-130</td>
<td align="center">25-140</td>
<td align="center">15-150</td>
<td align="center">5-160</td>
</tr>
<tr>
<td align="center">155</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">155-0</td>
<td align="center">145-10</td>
<td align="center">135-20</td>
<td align="center">125-30</td>
<td align="center">115-40</td>
<td align="center">105-50</td>
<td align="center">95-60</td>
<td align="center">85-70</td>
<td align="center">75-80</td>
<td align="center">65-90</td>
<td align="center">55-100</td>
<td align="center">45-110</td>
<td align="center">35-120</td>
<td align="center">25-130</td>
<td align="center">15-140</td>
<td align="center">5-150</td>
</tr>
<tr>
<td align="center">145</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">145-0</td>
<td align="center">135-10</td>
<td align="center">125-20</td>
<td align="center">115-30</td>
<td align="center">105-40</td>
<td align="center">95-50</td>
<td align="center">85-60</td>
<td align="center">75-70</td>
<td align="center">65-80</td>
<td align="center">55-90</td>
<td align="center">45-100</td>
<td align="center">35-110</td>
<td align="center">25-120</td>
<td align="center">15-130</td>
<td align="center">5-140</td>
</tr>
<tr>
<td align="center">135</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">135-0</td>
<td align="center">125-10</td>
<td align="center">115-20</td>
<td align="center">105-30</td>
<td align="center">95-40</td>
<td align="center">85-50</td>
<td align="center">75-60</td>
<td align="center">65-70</td>
<td align="center">55-80</td>
<td align="center">45-90</td>
<td align="center">35-100</td>
<td align="center">25-110</td>
<td align="center">15-120</td>
<td align="center">5-130</td>
</tr>
<tr>
<td align="center">125</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">125-0</td>
<td align="center">115-10</td>
<td align="center">105-20</td>
<td align="center">95-30</td>
<td align="center">85-40</td>
<td align="center">75-50</td>
<td align="center">65-60</td>
<td align="center">55-70</td>
<td align="center">45-80</td>
<td align="center">35-90</td>
<td align="center">25-100</td>
<td align="center">15-110</td>
<td align="center">5-120</td>
</tr>
<tr>
<td align="center">115</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">115-0</td>
<td align="center">105-10</td>
<td align="center">95-20</td>
<td align="center">85-30</td>
<td align="center">75-40</td>
<td align="center">65-50</td>
<td align="center">55-60</td>
<td align="center">45-70</td>
<td align="center">35-80</td>
<td align="center">25-90</td>
<td align="center">15-100</td>
<td align="center">5-110</td>
</tr>
<tr>
<td align="center">105</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">105-0</td>
<td align="center">95-10</td>
<td align="center">85-20</td>
<td align="center">75-30</td>
<td align="center">65-40</td>
<td align="center">55-50</td>
<td align="center">45-60</td>
<td align="center">35-70</td>
<td align="center">25-80</td>
<td align="center">15-90</td>
<td align="center">5-100</td>
</tr>
<tr>
<td align="center">95</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">-</td>
<td align="center">95-0</td>
<td align="center">85-10</td>
<td align="center">75-20</td>
<td align="center">65-30</td>
<td align="center">55-40</td>
<td align="center">45-50</td>
<td align="center">35-60</td>
<td align="center">25-70</td>
<td align="center">15-80</td>
<td align="center">5-90</td>
</tr>
</tbody>
</table>
</table-wrap>
<fig id="F1" position="float">
<label>FIGURE 1</label>
<caption>
<p>Graphical representation of three different training set sizes, compositions and genetic diversity based on the number of unique RILs used for model calibration. The horizontal gray lines correspond to NO-RILs (A across/W within environments) while the horizontal blue lines represent O-RILs; values in the diagonal provide information about the percentage of unique RILs (UNIQUE A) across environments. <bold>(A)</bold> corresponds to the selection of 195 RILs per environment for a total of 1,755 phenotypic observations. <bold>(B, C)</bold> correspond to intermediate and reduced training set sizes with 145 and 95 RILs per environment with different compositions.</p>
</caption>
<graphic xlink:href="fgene-14-1269255-g001.tif"/>
</fig>
<p>The second method (S2) implemented to select RILs and compose training sets is an alternative to the random method and it is based on the genetic diversity of the RILs using genomic information. It focuses mainly on the maximization/minimization of the genetic diversity. For this, the Super Saturated Design (SSD) method was implemented to select samples of fixed size (selected from a large pool of genotypes) increasing (SSD.max) or decreasing (SSD.min) the genetic diversity in the sample (<xref ref-type="bibr" rid="B31">Virdi et al., 2023</xref>). Here, the initial fixed training set size for model calibration consisted of 195 O-RILs across environments. Since the selection of RILs does not involve a random process, for each case (maximize/minimize genetic diversity) only one sample was obtained. Repeating the selection algorithm using a random sample of 195 RILs as a starting point would return nearly identical samples (&#x3e;98%). Briefly, the genomic information of the 1,755 RILs was randomly split into two independent sets, one of size 195 (<inline-formula id="inf1">
<mml:math id="m1">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>) and the other of size 1,560 (<inline-formula id="inf2">
<mml:math id="m2">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>). The first set <inline-formula id="inf3">
<mml:math id="m3">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> was used to store the selected RILs that met the criteria of maximizing/minimizing the genetic diversity of the sample while the RILs in <inline-formula id="inf4">
<mml:math id="m4">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> were candidates to be selected to compose the sample of size 195.</p>
<p>Systematically, for each iteration, each RIL (one at a time) from the selected set of 195 RILs <inline-formula id="inf5">
<mml:math id="m5">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> (<inline-formula id="inf6">
<mml:math id="m6">
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1,2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>.</mml:mo>
<mml:mo>,</mml:mo>
<mml:mn>195</mml:mn>
</mml:mrow>
</mml:math>
</inline-formula>) is replaced by each one of the 1,560 RILs (<italic>j</italic> &#x3d; 1,2, &#x2026; , 1,560) from the <inline-formula id="inf7">
<mml:math id="m7">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> set. Then, <italic>E</italic>(<italic>S</italic>
<sup>2</sup>) is computed, where <inline-formula id="inf8">
<mml:math id="m8">
<mml:mrow>
<mml:mi>S</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:msubsup>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is a matrix of genetic similarities between pairs of markers for a given group of individuals, <inline-formula id="inf9">
<mml:math id="m9">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the SNPs matrix of dimension 195 &#xd7; 4,100. The <inline-formula id="inf10">
<mml:math id="m10">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> is defined as the sum of the squared values of the off-diagonal elements of <inline-formula id="inf11">
<mml:math id="m11">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Equivalently, maximizing/minimizing the <inline-formula id="inf12">
<mml:math id="m12">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> can be accomplished by respectively increasing/decreasing the trace (sum of the values in the diagonal) of <inline-formula id="inf13">
<mml:math id="m13">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula>. Thus, for one iteration, the total number of times that the <italic>E</italic>(<italic>S</italic>
<sup>2</sup>) is computed is 304,200 (195 &#xd7; 1,560) and the objective is to identify the <italic>l</italic>
<sup>th</sup> (<italic>j</italic> &#x3d; 1, 2, &#x2026; , 1,560) RIL in the <inline-formula id="inf14">
<mml:math id="m14">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> set that maximizes or minimizes (according to the desired sample) the <inline-formula id="inf15">
<mml:math id="m15">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> after discarding the <italic>m</italic>
<sup>th</sup> (<italic>i</italic> &#x3d; 1, 2, &#x2026; , 195) RIL of the <inline-formula id="inf16">
<mml:math id="m16">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> set. Thus, out of the 304,200 combinations, only one satisfies the condition of maximizing/minimizing the most the <inline-formula id="inf17">
<mml:math id="m17">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. These two RILs are exchanged from one set to the other (i.e., substitute the <italic>m</italic>
<sup>th</sup> RIL in <inline-formula id="inf18">
<mml:math id="m18">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>0</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> by the <italic>l</italic>
<sup>th</sup> RIL in <inline-formula id="inf19">
<mml:math id="m19">
<mml:mrow>
<mml:msub>
<mml:mi>X</mml:mi>
<mml:mn>1</mml:mn>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>). This procedure is repeated until <inline-formula id="inf20">
<mml:math id="m20">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2264;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi mathvariant="italic">min</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for the <italic>k</italic>
<sup>th</sup> iteration <inline-formula id="inf21">
<mml:math id="m21">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1,2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> when the objective is to maximize the genetic diversity in the sample or <inline-formula id="inf22">
<mml:math id="m22">
<mml:mrow>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x2265;</mml:mo>
<mml:msub>
<mml:mi>E</mml:mi>
<mml:mi mathvariant="italic">max</mml:mi>
</mml:msub>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msubsup>
<mml:mi>S</mml:mi>
<mml:mrow>
<mml:mi>l</mml:mi>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
<mml:mrow>
<mml:mn>2</mml:mn>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:mn>1</mml:mn>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> to minimize it.</p>
</sec>
<sec id="s2-3">
<title>2.3 Genomic prediction models</title>
<sec id="s2-3-1">
<title>2.3.1 M1: E &#x2b; G; environment and genomic main effects</title>
<p>Consider that <inline-formula id="inf23">
<mml:math id="m23">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represents the yield performance of the <italic>i</italic>
<sup>th</sup> genotype at the <italic>j</italic>
<sup>th</sup> environment, and it is composed of the sum of a common effect (&#x3bc;) plus an environmental random effect (<inline-formula id="inf24">
<mml:math id="m24">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <italic>j</italic> &#x3d; 1,2, &#x2026; , <italic>J</italic>), a genetic random effect corresponding to the <italic>i</italic>
<sup>th</sup> RIL (<inline-formula id="inf25">
<mml:math id="m25">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <italic>i</italic> &#x3d; 1,2, &#x2026; , <italic>I</italic>), a genomic random effect (<inline-formula id="inf26">
<mml:math id="m26">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula>, <italic>i</italic> &#x3d; 1,2, &#x2026; , <italic>I</italic>), and a random error term <inline-formula id="inf27">
<mml:math id="m27">
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:math>
</inline-formula> capturing the unexplained variability by the model components. This linear predictor can be written as follows:<disp-formula id="e1">
<mml:math id="m28">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(1)</label>
</disp-formula>where <inline-formula id="inf28">
<mml:math id="m29">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf29">
<mml:math id="m30">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> represents the corresponding variance component; <inline-formula id="inf30">
<mml:math id="m31">
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mrow>
<mml:mi mathvariant="bold">G</mml:mi>
<mml:mi>&#x3c3;</mml:mi>
</mml:mrow>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf31">
<mml:math id="m32">
<mml:mrow>
<mml:mi mathvariant="bold">G</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:msup>
<mml:mi mathvariant="bold">X</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msup>
<mml:mo>/</mml:mo>
<mml:mi>p</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula>, <bold>X</bold> is the centered and standardized (by columns) matrix of SNPs, <inline-formula id="inf32">
<mml:math id="m33">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is the additive genetic variance; and <inline-formula id="inf33">
<mml:math id="m34">
<mml:mrow>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> with <inline-formula id="inf34">
<mml:math id="m35">
<mml:mrow>
<mml:msup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:math>
</inline-formula> as the residual variance. The entries of <inline-formula id="inf35">
<mml:math id="m36">
<mml:mrow>
<mml:mi mathvariant="bold">G</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> describe the genomic similarities between pairs of individuals allowing the borrowing of information between tested and untested genotypes.</p>
</sec>
<sec id="s2-3-2">
<title>2.3.2 M2: E &#x2b; G &#x2b; GE; environment and genomic main effects plus genotype-by-environment (GE) interaction</title>
<p>The previous model M1 returns a single genomic effect for each genotype observed in different environments. Thus, the resulting predicted genomic effect might not be accurate when considering all tested environments. To allow specific genomic effects at each environment, a model incorporating the genotype-by-environment (GE) interaction was considered. The reaction norm model conceptually allows the inclusion of the interaction between each molecular marker and each environmental factor in a convenient way via covariance structures (<xref ref-type="bibr" rid="B14">Jarquin et al., 2014a</xref>). Let <inline-formula id="inf36">
<mml:math id="m37">
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
<mml:mi mathvariant="bold">E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mtext>gE</mml:mtext>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> be the vector of the interaction scores between <italic>i</italic>
<sup>th</sup> genotype and the <italic>j</italic>
<sup>th</sup> environment. The previous interaction effect can be modeled as a random effect following a multivariate normal distribution centered on zero and a covariance structure given by <inline-formula id="inf37">
<mml:math id="m38">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">g</mml:mi>
</mml:msub>
<mml:mi mathvariant="bold">G</mml:mi>
<mml:msubsup>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>&#x2218;</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> where <inline-formula id="inf38">
<mml:math id="m39">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">g</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> and <inline-formula id="inf39">
<mml:math id="m40">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> represent the incidence matrices that connect phenotypic records with genotypes and environments, respectively. Here, &#x201c;<inline-formula id="inf40">
<mml:math id="m41">
<mml:mrow>
<mml:mo>&#x2218;</mml:mo>
</mml:mrow>
</mml:math>
</inline-formula>&#x201d; is the Hadamard product (cell-by-cell) between two matrices (covariances structures). Adding the previous model term to M1, the resulting linear predictor is as follows:<disp-formula id="e2">
<mml:math id="m42">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mtext>gE</mml:mtext>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(2)</label>
</disp-formula>where <inline-formula id="inf41">
<mml:math id="m43">
<mml:mrow>
<mml:mi mathvariant="bold">g</mml:mi>
<mml:mi mathvariant="bold">E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>{</mml:mo>
<mml:msub>
<mml:mtext>gE</mml:mtext>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>}</mml:mo>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">g</mml:mi>
</mml:msub>
<mml:mi mathvariant="bold">G</mml:mi>
<mml:msubsup>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>&#x2218;</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> with <inline-formula id="inf42">
<mml:math id="m44">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mrow>
<mml:mi>g</mml:mi>
<mml:mi>E</mml:mi>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> representing the associated variance component.</p>
<sec id="s2-3-3">
<title>2.3.3 M3: E &#x2b; G &#x2b; GE &#x2b; FE; Environment and Genomic main effects plus genotype-by-environment and family-by-environment interactions</title>
<p>
<xref ref-type="bibr" rid="B23">Persa et al. (2020)</xref> proposed a model to leverage the information of individuals belonging to the same families but observed in different environments by including the family-by-environment FE interaction model term. Consider <inline-formula id="inf43">
<mml:math id="m45">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mtext>&#x2009;</mml:mtext>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mn>1</mml:mn>
<mml:mo>,</mml:mo>
<mml:mn>2</mml:mn>
<mml:mo>,</mml:mo>
<mml:mo>&#x2026;</mml:mo>
<mml:mo>,</mml:mo>
<mml:mi>m</mml:mi>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, as the random model term representing the effect of the <italic>k</italic>
<sup>th</sup> family such that <inline-formula id="inf44">
<mml:math id="m46">
<mml:mrow>
<mml:msub>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn>0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. This model term allows the borrowing of information between genotypes belonging to the same family under the premise that genotypes from the same family may perform alike. Here, the predicted effect of the family membership is common for individuals of the same family but observed in different environments. For this reason, similarly to model M2, the interaction between this model term and the environments <inline-formula id="inf45">
<mml:math id="m47">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
<mml:mi mathvariant="bold">E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mrow>
<mml:mfenced open="{" close="}" separators="|">
<mml:mrow>
<mml:msub>
<mml:mtext>FE</mml:mtext>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> was considered to allow specific values for each family at each environment besides the main effect of the family membership. The resulting model after adding the main effect of the family <inline-formula id="inf46">
<mml:math id="m48">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
</mml:mrow>
</mml:math>
</inline-formula> and the interaction <inline-formula id="inf47">
<mml:math id="m49">
<mml:mtext>FE</mml:mtext>
</mml:math>
</inline-formula> to model M2 is as follows:<disp-formula id="e3">
<mml:math id="m50">
<mml:mrow>
<mml:msub>
<mml:mi>y</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x3d;</mml:mo>
<mml:mi>&#x3bc;</mml:mi>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mi>j</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">L</mml:mi>
<mml:mi>i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mi>k</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi mathvariant="normal">g</mml:mi>
<mml:mi mathvariant="normal">i</mml:mi>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mtext>gE</mml:mtext>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mtext>FE</mml:mtext>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>&#x2b;</mml:mo>
<mml:msub>
<mml:mi>&#x3f5;</mml:mi>
<mml:mrow>
<mml:mi>i</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
</mml:mrow>
</mml:math>
<label>(3)</label>
</disp-formula>where <inline-formula id="inf48">
<mml:math id="m51">
<mml:mrow>
<mml:mi mathvariant="bold">F</mml:mi>
<mml:mi mathvariant="bold">E</mml:mi>
<mml:mo>&#x3d;</mml:mo>
<mml:mo>{</mml:mo>
<mml:msub>
<mml:mtext>FE</mml:mtext>
<mml:mrow>
<mml:mi>k</mml:mi>
<mml:mi>j</mml:mi>
</mml:mrow>
</mml:msub>
<mml:mo>}</mml:mo>
<mml:mo>&#x223c;</mml:mo>
<mml:mi>N</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:mn mathvariant="bold">0</mml:mn>
<mml:mo>,</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">F</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">F</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:mo>&#x2218;</mml:mo>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
</mml:msub>
<mml:msubsup>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">E</mml:mi>
<mml:mo>&#x2032;</mml:mo>
</mml:msubsup>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mtext>FE</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>, <inline-formula id="inf49">
<mml:math id="m52">
<mml:mrow>
<mml:msubsup>
<mml:mi>&#x3c3;</mml:mi>
<mml:mtext>FE</mml:mtext>
<mml:mn>2</mml:mn>
</mml:msubsup>
</mml:mrow>
</mml:math>
</inline-formula> is the corresponding variance component, and <inline-formula id="inf50">
<mml:math id="m53">
<mml:mrow>
<mml:msub>
<mml:mi>Z</mml:mi>
<mml:mi mathvariant="normal">F</mml:mi>
</mml:msub>
</mml:mrow>
</mml:math>
</inline-formula> is the incidence matrix that connects phenotypes with families.</p>
</sec>
</sec>
</sec>
<sec id="s2-4">
<title>2.4 Assessment of model efficiency</title>
<p>Predictive ability was measured on a trial basis; thus, the Pearson correlation coefficient between predicted and observed values was computed within environments. The overall predictive ability was computed as the average Pearson&#x2019;s correlation coefficient across the nine environments. As mentioned before, regardless of the training set size or its composition (NO-RILs and O-RILs), the testing set size (prediction set) was the same for all cases. Within each environment, between 95 and 195 RILs were used as the training set for a constant prediction set size of 1,560 RILs.</p>
<p>Since the RILs belong to families with different genetic backgrounds (G1, G2, and G3), the Pearson&#x2019;s correlation coefficient was also calculated considering only the RILs within each one of the different groups at each environment. In this case, in each environment, the Pearson&#x2019;s correlation coefficient between predicted and observed values was computed three times, one for each genetic background group. The objective of considering the different groups of families when computing the correlations was to assess the effects on predictive ability when blocking for population structure although that was not the main goal of this research.</p>
</sec>
</sec>
<sec sec-type="results" id="s3">
<title>3 Results</title>
<sec id="s3-1">
<title>3.1 Phenotypic data and population structure</title>
<p>Phenotypic information on grain yield was available for 1,755 genotypes observed in nine environments (all genotypes in all environments) for a total of 15,795 records available for analysis. No statistically significant differences were observed for yield across the seven different initial samples of size 195 RILs selected with the S1 (five replications based on random selections) or S2 (two for maximizing/minimizing the genetic diversity) selection methods. However, the sampling method minimizing genetic diversity resulted in a higher average yield (3,722.5&#xa0;kg ha<sup>&#x2212;1</sup>) compared to the average of all other sampling methods (3,563.2&#xa0;kg ha<sup>&#x2212;1</sup>) and a significantly lower coefficient of variation among samples (<xref ref-type="sec" rid="s11">Supplementary Figure S1</xref>).</p>
<p>To assess the stability across environments of RILs selected with the different sampling methods, the Pearson&#x2019;s correlation coefficient between environments was computed for each sample considering the largest sample size (195 RILs) observed across all nine environments. The total number of correlation values among the nine environments was 36 <inline-formula id="inf51">
<mml:math id="m54">
<mml:mrow>
<mml:mrow>
<mml:mo>(</mml:mo>
</mml:mrow>
<mml:mrow>
<mml:mfrac>
<mml:mrow>
<mml:mn>9</mml:mn>
<mml:mo>&#xd7;</mml:mo>
<mml:mn>8</mml:mn>
</mml:mrow>
<mml:mn>2</mml:mn>
</mml:mfrac>
</mml:mrow>
<mml:mrow>
<mml:mo>)</mml:mo>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula>. The SSD.max sample showed the highest (0.292) median Pearson&#x2019;s correlation coefficient with a larger dispersion between the 50% and 75% quantiles while the SSD.min returned the lowest median correlation (0.123) (<xref ref-type="sec" rid="s11">Supplementary Figure S2</xref>).</p>
<p>A total of 4,100 SNPs were included in the analysis after filtering the original set of 5,400 SNPs. Among the 1,755 RILs, no clear association patterns regarding population structure were observed across different genetic background groups (G1: high-yielding lines, G2: diverse ancestries, and G3: exotic PIs) (left panel in <xref ref-type="fig" rid="F2">Figure 2</xref>). On the other hand, the SSD.min samples were substantially more clustered than the SSD.max samples when compared to the distribution of all RILs (right panel in <xref ref-type="fig" rid="F2">Figure 2</xref>). This indicates that the SSD methodology can effectively select a group of individuals to either maximize or minimize genetic diversity given a fixed sample size.</p>
<fig id="F2" position="float">
<label>FIGURE 2</label>
<caption>
<p>Population structure of 1,755 RILs derived from 39 families sharing a common hub parent (IA3023). The families belong to three different groups of ancestry G1 (elite), G2 (diverse), and G3 (exotic background). The right panel indicates the selected genotypes using the super saturated design (SSD) method to maximize (red color) or minimize (blue color) the genetic diversity based on genomic information.</p>
</caption>
<graphic xlink:href="fgene-14-1269255-g002.tif"/>
</fig>
</sec>
<sec id="s3-2">
<title>3.2 Predictive ability across multiple samples</title>
<sec id="s3-2-1">
<title>3.2.1 Across ancestry groups</title>
<p>
<xref ref-type="fig" rid="F3">Figure 3</xref> depicts the mean (five replicates randomly selected) average correlation between predicted and observed values across the nine environments for different training set sample sizes and compositions consisting of NO-RILs and O-RILs, and three prediction models (M1: E &#x2b; L &#x2b; G [gray color]; M2: E &#x2b; L &#x2b; G &#x2b; GE [blue color]; and M3: E &#x2b; L &#x2b; G &#x2b; GE &#x2b; FE [orange color]). The solid thick line represents the mean average correlation corresponding to the largest training set size (195 RILs) while the thin dashed lines correspond to the reduced training set sizes (185&#x2013;95). The starting point on the left side of the lines represents the mean average for the scenarios where all the genotypes were observed only once across environments (195, 185, &#x2026; , 95), while the other extreme of the lines (right side) corresponds to the case where most of the genotypes (e.g., 190 out of 195; 170 out of 175, &#x2026; , 90 out of 95) are common across environments. Hence, the number of common genotypes increases as one moves laterally (left to right) across <xref ref-type="fig" rid="F3">Figure 3</xref>.</p>
<fig id="F3" position="float">
<label>FIGURE 3</label>
<caption>
<p>Mean (five replicates) average (across nine environments) of the within environments correlation between predicted and observed values for different sample sizes and composition for model training and three prediction models including the main effect of the environment (E), RIL (L), marker SNPs (G), and the interaction between marker SNPs and environments (GE); and between families and environments (FE). (M1: E &#x2b; L &#x2b; G; M2: E &#x2b; L &#x2b; G &#x2b; GE; and M3: E &#x2b; L &#x2b; G &#x2b; GE &#x2b; FE).</p>
</caption>
<graphic xlink:href="fgene-14-1269255-g003.tif"/>
</fig>
<p>The highest mean average correlation was obtained with model M3 (&#x223c;0.57), which was roughly 33% and 90% higher than M2 (&#x223c;0.40) and M1 (&#x223c;0.30), respectively (<xref ref-type="fig" rid="F3">Figure 3</xref>). A slight improvement in predictive ability was observed by increasing the number of O-RILs by 10, reaching a plateau with the addition of more O-RILs, followed by a slight reduction in the average correlation towards the end (right side). Models M2 and M3 were less influenced by increasing the number of common genotypes across environments as compared to model M1 (<xref ref-type="fig" rid="F3">Figure 3</xref>). In addition, as expected, the reduction of the training set size resulted in a reduction of the mean average correlation in all three models, being more pronounced in model M3. However, M3 always returned the best results for the same combinations between the training set size and calibration composition compared to M1 and M2 (<xref ref-type="fig" rid="F3">Figure 3</xref>).</p>
<p>Considering the largest training set size (195 RILs), the training composition did not have a significant impact on the model performance, particularly in model M3, whose performance was stable across all training compositions. Consequently, model M3 has advantages since high predictive ability does not require testing a large set of common RILs across environments, which could be challenging due to constraints on the land and seed availability, especially in the earlier stages of the soybean breeding pipeline.</p>
</sec>
<sec id="s3-2-2">
<title>3.2.2 Within ancestry groups</title>
<p>Since the RILs were derived from families with different groups of ancestry (G1, G2, and G3), the effects of ancestry groups on the predictive ability were assessed by computing the correlation between predicted and observed values of similar families within each environment. For this, a post-stratification of the vector of predicted values of size 1,560 in the three different groups was conducted. Then, the within environments predictive ability for each group of families was computed regardless of the training set composition.</p>
<p>Similar to <xref ref-type="fig" rid="F3">Figure 3</xref>, <xref ref-type="fig" rid="F4">Figure 4</xref> depicts the mean average correlation across environments and replicates, computing the correlation between predicted and observed values only among the families that belong to the same group of ancestry (e.g., G1, G2, and G3). As expected, different patterns were observed for the different groups of families. The group of RILs derived from elite parents (G1) was highly affected by the reduction of the training sample size (<xref ref-type="fig" rid="F4">Figure 4</xref>). However, these results significantly improved when the number of O-RILs increased. Using the largest training set size, the best results were obtained with the model that includes the interaction between families and environments (M3). Also, the predictive ability was not affected by the different training set compositions in this case. In the group of RILs derived from diverse ancestry (G2), a less pronounced decay in predictive ability was observed when the training set size was decreased (<xref ref-type="fig" rid="F4">Figure 4</xref>). The best results were also obtained with the most complex model M3. For the group of RILs derived from exotic ancestry (G3), the results using the most complex model were not affected by the training set size and composition (<xref ref-type="fig" rid="F4">Figure 4</xref>). In this scenario, even the smaller training set size returned comparable results to the largest training set size.</p>
<fig id="F4" position="float">
<label>FIGURE 4</label>
<caption>
<p>Mean (five replicates) average (across nine environments) of the within environments correlation between predicted and observed values for different genetic backgrounds [<bold>(A)</bold> G1: 17 high-yielding lines, <bold>(B)</bold> G2: 15 diverse ancestries, and <bold>(C)</bold> G3: eight exotic plant introductions (PI)], sample sizes and composition for model training and three prediction models (M1: E &#x2b; L &#x2b; G; M2: E &#x2b; L &#x2b; G &#x2b; GE; and M3: E &#x2b; L &#x2b; G &#x2b; GE &#x2b; FE).</p>
</caption>
<graphic xlink:href="fgene-14-1269255-g004.tif"/>
</fig>
</sec>
<sec id="s3-2-3">
<title>3.2.3 Within families for each ancestry group</title>
<p>A more detailed dissection of the model&#x2019;s predictive ability at the group/family (G1/families 1&#x2013;17; G2/families 18&#x2013;31; and G3/families 32&#x2013;39) level is displayed in <xref ref-type="sec" rid="s11">Supplementary Figure S3</xref> (G1), <xref ref-type="sec" rid="s11">Supplementary Figure S4</xref> (G2), and <xref ref-type="sec" rid="s11">Supplementary Figure S5</xref> (G3) for the different models (M1-M3), training set sizes and composition between overlapped and not overlapped RILs. For each family the within environment correlation between predicted and observed values was computed. Then, the average across environments was obtained for each one of the five replicates and the mean was calculated. This procedure was repeated for all the different training set size and compositions and the results were grouped (mean) for each group of families (G1, G2, and G3).</p>
<p>Different patterns were observed for each one of the ancestry groups. For the G1 group (<xref ref-type="sec" rid="s11">Supplementary Figure S3</xref>), the mean average within families correlation varied between 0.05 and 0.114. Considering the largest training set size, the models M2 and M3 performed very similarly across the different training compositions (blue and orange thick lines), always outperforming M1. However, when reducing the training set size, the M3 model loses predictive ability compared to M2. With respect to the families in G2, the mean average within families correlation ranged from 0.06 to 0.145. When considering the largest training set size, the M3 model slightly outperformed the M2 model, specifically with a low number of overlapping RILs. However, with reduced training set sizes and compositions, no clear patterns were observed. Regarding the families in G3, the correlations varied between 0.55 and 0.195. In this case, the M3 model returned the best results for almost all training compositions when considering the largest training set size. Also, the model&#x2019;s predictive ability was not significantly affected by reducing the training set sizes, but it was affected by the training composition when increasing the number of overlapped RILs across environments. Although different patterns were observed for each group of families, in general, the model M3 returned comparable results to the other models (M1 and M2) for most of the training set sizes and compositions. In many of these cases, the M3 model outperformed M1 and M2, especially when considering larger training set sizes and a low number of overlapping RILs across environments.</p>
</sec>
<sec id="s3-2-4">
<title>3.2.4 Maximized (SSD.max) and minimized (SSD.min) genetic diversity</title>
<p>Besides the method that randomly selects RILs to compose the training set, the SSD algorithm, which chooses RILs maximizing/minimizing the genetic diversity (SSD.max and SSD.min, respectively) for a fixed sample size, was also considered. <xref ref-type="sec" rid="s11">Supplementary Figure S6</xref> shows the progression of <inline-formula id="inf52">
<mml:math id="m55">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> for maximizing (SSD.max left panel) and minimizing (SSD.min right panel) the genetic diversity contained in a sample of size 195 RILs. A total of 183 iterations were required in SSD.max to meet the stopping criteria when the <inline-formula id="inf53">
<mml:math id="m56">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> of the (<italic>k&#x2b;1</italic>)<sup>th</sup> iteration is larger than the previous one. On the other hand, a total of 182 iterations were needed to meet the stopping criteria in SSD.min. The corresponding <inline-formula id="inf54">
<mml:math id="m57">
<mml:mrow>
<mml:mi>E</mml:mi>
<mml:mrow>
<mml:mfenced open="(" close=")" separators="|">
<mml:mrow>
<mml:msup>
<mml:mi>S</mml:mi>
<mml:mn>2</mml:mn>
</mml:msup>
</mml:mrow>
</mml:mfenced>
</mml:mrow>
</mml:mrow>
</mml:math>
</inline-formula> values that maximize/minimize the genomic diversity were 896 and 5,613.</p>
<p>Given the highest prediction accuracies were always obtained with model M3 for all training set sizes, the predictive ability comparison between the random samples and those obtained using the SSD method was based on this model only. <xref ref-type="fig" rid="F5">Figure 5</xref> depicts the scatter plot between the across environments average predictive ability corresponding to the five random samples (considering only the sets of common genotypes) and the SSD selection method across ancestry groups (panel A) and within ancestry groups (G1-panel B; G2-panel C; and G3-panel D). The colored circles indicate the samples that are being compared: orange color for contrasting SSD maximizing genetic diversity vs. random sample; and pink color for SSD minimizing genetic diversity vs. random sample. The numbers within the circles indicate the training set size, and the diagonal line represents the 1:1 ratio between both methods. Values above the diagonal line indicate that the random method is superior and <italic>vice versa</italic>. In general, across groups of ancestry, equivalent results between the random and SSD.max were observed for all sample sizes. On the other hand, the random sample method outperformed SSD.min in all groups of ancestry and sample sizes (<xref ref-type="fig" rid="F5">Figure 5</xref>).</p>
<fig id="F5" position="float">
<label>FIGURE 5</label>
<caption>
<p>Predictive abilities between random and SSD methods considering sets of common genotypes across environments using model M3 across ancestry groups <bold>(A)</bold>, and within ancestry groups (G1: <bold>(B)</bold>, G2: <bold>(C)</bold>, and G3: <bold>(D)</bold>). The numbers within the circles indicate the sample size of the training set. The orange circles contrast the results obtained with the sample that maximizes the genetic diversity (SSD.max) vs. the random samples. The pink circles contrast the results obtained with the sample that minimizes the genetic diversity (SSD.min) vs. the random samples.</p>
</caption>
<graphic xlink:href="fgene-14-1269255-g005.tif"/>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="discussion" id="s4">
<title>4 Discussion</title>
<p>The implementation of GP has revolutionized commercial and public soybean breeding programs by allowing plant breeders to predict the phenotype of interest in unobserved genotypes (<xref ref-type="bibr" rid="B14">Jarquin et al., 2014a</xref>; <xref ref-type="bibr" rid="B17">Jarquin et al., 2014b</xref>; <xref ref-type="bibr" rid="B23">Persa et al., 2020</xref>; <xref ref-type="bibr" rid="B33">Widener et al., 2021</xref>; <xref ref-type="bibr" rid="B3">Canella Vieira et al., 2022</xref>). The first report on GP in soybean was based on a standard G-BLUP model including only additive effects and an extended version of the G-BLUP model including additive-by-additive effects (<xref ref-type="bibr" rid="B17">Jarquin et al., 2014b</xref>). <xref ref-type="bibr" rid="B19">Ma et al. (2016)</xref> used ridge regression best linear unbiased prediction (rrBLUP) (<xref ref-type="bibr" rid="B9">Endelman, 2011</xref>) with fivefold cross-validations to explore strategies of marker preselection. Prediction accuracy based on pre-selected markers slightly increased compared with random or equidistant marker sampling (<xref ref-type="bibr" rid="B19">Ma et al., 2016</xref>). <xref ref-type="bibr" rid="B35">Xavier et al. (2016)</xref> showed that the training population size was the most impactful factor in the prediction accuracy when investigating the impacts of training population size, genotyping density, and different prediction models. <xref ref-type="bibr" rid="B23">Persa et al. (2020)</xref> expanded the reaction norm model proposed by <xref ref-type="bibr" rid="B14">Jarquin et al. (2014a)</xref> by incorporating the interaction between the family&#x2019;s membership of the genotypes and the environment under the premise that the differential responses of families to environmental stimuli could be used for enhancing the selection process in target environments. 
These authors showed significant improvements in predictive ability by incorporating the family-by-environment interaction in the models compared to the conventional reaction norm model (<xref ref-type="bibr" rid="B23">Persa et al., 2020</xref>). However, they also pointed out that this model requires observing, at least partially, the individuals of the family to be predicted; otherwise, no improvements in predictive ability are expected when predicting a yet-to-be-observed family. Thus, this model is beneficial only when some individuals of the target family are observed. In this case, the family term is closely related to pedigree information. In addition, <xref ref-type="bibr" rid="B11">Habier et al. (2007)</xref> and <xref ref-type="bibr" rid="B12">Habier et al. (2013)</xref> showed that the addition of a model term modeling the differences between groups of individuals (families, pedigree, etc.), besides the genomic information, does a better job capturing relationship information rather than just relationships alone. In this way, the family term enables the genomic information to focus on only capturing LD information rather than trying to capture or explain LD and the relationships at the same time.</p>
<p><xref ref-type="bibr" rid="B3">Canella Vieira et al. (2022)</xref> investigated the potential of incorporating soil texture information and its interaction with molecular markers via covariance structures for enhancing predictive ability across breeding scenarios. The obtained results were more stable when predicting trait performance in novel environments compared to the conventional reaction norm model.</p>
<p>The expression of a phenotype is a function of the genotype, the environment, and the interaction between the genotype and environment (G&#xd7;E) across different environments (<xref ref-type="bibr" rid="B7">de Leon et al., 2016</xref>). Grain yield is a highly complex and quantitative trait regulated by numerous large and small-effect genes, whose expression largely depends on the genotype interaction with various components of the environment. Genomic prediction-based sparse testing is a promising concept to substantially increase the number of tested genotypes and environments while maintaining fixed requirements for land, seed availability, and costs (<xref ref-type="bibr" rid="B16">Jarquin et al., 2020b</xref>; <xref ref-type="bibr" rid="B21">Montesinos Lopez et al., 2023a</xref>). Hence, GP-based sparse testing enables a more comprehensive assessment of genotype stability and adaptation in the early stages of the breeding pipeline, which likely could result in a more accurate selection of superior genotypes widely adapted to various environments.</p>
<p>In soybean breeding, this is particularly important in the early stages of yield trials (i.e., preliminary yield trials) where seed availability and the excessive number of testing genotypes are major constraints (<xref ref-type="bibr" rid="B2">Canella Vieira and Chen, 2021</xref>). In the public sector, preliminary yield trials often consist of thousands (&#x223c;1,000 to 3,000) of genotypes tested across several replicated environments (three to seven environments, two to three replications), resulting in, on average, 20,000 preliminary yield plots (2,000 genotypes, five environments, two replications). As shown in this research (1,755 genotypes, nine environments, a total of 15,795 phenotypic records), GP-based sparse testing can sustain high accuracies at various training compositions which could reduce the testing footprint by as much as 90% (<xref ref-type="fig" rid="F1">Figure 1</xref>). Given the cost of approximately $15 for a yield phenotype (plot), this methodology has the potential to reduce the total cost of preliminary trials (depending on the size of the program) by as much as $210,600 ($236,925&#x2013;$26,325; 1,755 vs. 15,795) with the largest training set size (195) and by about $224,100 ($236,925&#x2013;$12,825; 855 vs. 15,795) with the smallest one (95), while maintaining the number of genotypes and environments fixed. On the other hand, the number of tested RILs and/or environments can increase by 5- to 10-fold while maintaining the costs fixed. Under the premise of the same phenotyping cost across environments (fixed cost of $15 per phenotypic record), with the initial budget dedicated to testing all the RILs in all of the environments ($236,925), the number of RILs or the number of environments can be increased ninefold, or combinations of these can be considered.</p>
<p>In this study, different prediction models, as well as multiple training set sizes and compositions (ratio of NO-RILs and O-RILs across environments, SSD.max and SSD.min), were investigated. The most comprehensive model, including the interaction between families and the environment (M3), yielded the highest prediction accuracies independent of training set sizes and composition. The ability to borrow information between genotypes derived from the same families likely contributed to the superior performance of M3 because half of the cross was already observed a considerable number of times in other crosses and environments (<xref ref-type="bibr" rid="B15">Jarquin et al., 2020a</xref>). In GP-based sparse testing where most genotypes are untested within and across environments, borrowing information from tested individuals within the same family but observed in different environments has been shown to improve predictive ability by as much as 48% compared to models including the interaction of molecular markers and environments only or reaction norm model (<xref ref-type="bibr" rid="B23">Persa et al., 2020</xref>). Across all three models, reducing the training set size negatively impacted prediction accuracies. However, prediction accuracy was rapidly recovered by the addition of O-RILs. This indicates that the training set composition is critical to a successful GP-based sparse testing implementation when the training set is reduced. This observation is also important in advanced yield trials where the number of testing genotypes is significantly reduced as compared to preliminary trials (roughly 90% fewer genotypes), and the number of testing environments is significantly increased (<xref ref-type="bibr" rid="B2">Canella Vieira and Chen, 2021</xref>). Specifically, the advanced yield trials stage offers more flexibility in maximizing training set sizes and composition (increasing O-RILs) since seed availability should not be a major constraint.</p>
<p>The results and trends in predictive ability obtained with models M1 (main effects only) and M2 (reaction norm considering the interaction between markers and environments) were similar to those obtained by <xref ref-type="bibr" rid="B16">Jarquin et al. (2020b)</xref> and <xref ref-type="bibr" rid="B4">Crespo-Herrera et al. (2021)</xref> analyzing maize and wheat data, respectively. However, these authors did not consider models including the interaction between family and environments. On the other hand, the results of model M3 (also including the interaction between family and environment) were similar to those obtained by <xref ref-type="bibr" rid="B23">Persa et al. (2020)</xref> analyzing information of the SoyNAM experiment comprising 1,358 RILs observed in 18 environments (not all RILs observed in all environments) but considering a conventional fivefold cross-validation predicting tested genotypes in observed environments (CV2) and untested genotypes in observed environments (CV1). In addition, <xref ref-type="bibr" rid="B34">Xavier and Habier (2022)</xref>, using the SoyNAM population, conducted a study to evaluate the effects on predictive ability of models similar to M1 and M2 when considering simulated data for the case where RILs are observed only once across environments for different heritabilities. The results obtained by these authors were higher compared with the results presented here, probably due to the fact that they considered simulated data as the response, as opposed to real data, and a different cross-validation scheme.</p>
<p>In this study, in addition to the ratio of NO-RILs and O-RILs, the selection of RILs to compose the training set based on maximizing (SSD.max) and minimizing (SSD.min) genetic diversity was also investigated. The Super Saturated Design (SSD) methodology was implemented to identify the set of 195 RILs that either decreased or increased E(<italic>S</italic>
<sup>2</sup>) (SSD.max and SSD.min, respectively). The method successfully created two fixed-size sample groups with contrasting genetic diversity. This was further observed in the population structure of each group, where SSD.max samples showed wider distribution and minimal clustering as compared to the SSD.min samples (right panel in <xref ref-type="fig" rid="F2">Figure 2</xref>). Interestingly, the SSD.max samples&#x27; prediction accuracy outperformed the SSD.min samples in all scenarios (<xref ref-type="fig" rid="F5">Figure 5</xref>). Superior prediction accuracy in the SSD.max samples could be explained by the higher availability of diverse alleles associated with stress resilience, which therefore resulted in higher stability across environments. A narrower genetic diversity, although it may prioritize high-yielding alleles, can be more susceptible to environmental stressors. This was also observed in the higher phenotypic correlation of the SSD.max samples across all environments compared to SSD.min samples (<xref ref-type="sec" rid="s11">Supplementary Figure S2</xref>).</p>
<p>Also, in general, the results of the SSD.max sample slightly underperformed the results of the random samples for most of the sample sizes of common RILs across environments and across ancestry groups, except for the case when the sample size was fixed at 115 common RILs (<xref ref-type="fig" rid="F5">Figure 5A</xref>). Considering the different ancestry groups, mixed results were obtained for G1, while for G2 better results were obtained with the SSD.max sample, and for G3 the random samples were superior. In all of the cases, the SSD.min returned the worst results. It seems that the G1 group of families (high yielding) is very susceptible to the presence of genotype-by-environment interactions; thus, reduced sample sizes with a high proportion of non-overlapped genotypes significantly reduced predictive ability. On the other hand, in G2, the reduction of the training set sizes is the only factor that reduces predictive ability, and the results are more stable across the different training set compositions. Finally, for G3, neither the sample size nor the training set composition seems to affect the predictive ability since similar results were obtained across these.</p>
<p>Additionally, higher genetic diversity (here represented by G3) yielded stable prediction accuracies across various training set sizes and compositions. Like the SSD.max observation, this could be attributed to the higher availability of diverse stress-resilient alleles, which therefore requires a smaller number of samples, as well as fewer O-RILs, to achieve maximum prediction accuracy. These observations are fundamental to the establishment of a successful GP-based sparse testing design and should be further explored across populations with various genetic backgrounds, including high-yielding bi-parental populations.</p>
<p>Similarly to the analysis of the different genetic backgrounds, a more detailed analysis can be done by considering the model&#x2019;s predictive ability at the family level. As expected, a significant reduction in predictive ability was obtained (<xref ref-type="sec" rid="s11">Supplementary Figures S3&#x2013;S5</xref>); however, there was not a unique model always outperforming the other two. In general, model M3 slightly outperformed the other two models, especially when considering the largest training set size and with a lower number of overlapping genotypes. Also, it was superior when only considering the families from G3. Arguably, the results obtained when computing the correlation across families might be inflated; however, the selection of the superior cultivars is not made within families but across these. In addition, considering the three models, there was not a unique model systematically outperforming the others. The lack of a unique model significantly outperforming the others in within families predictive ability is due to the fact that no relationship is factored in when ranking individuals within families (<xref ref-type="bibr" rid="B12">Habier et al., 2013</xref>). All the models have the same ability to model LD. In our case, at the family level, the most complex model M3 showed slight improvements in predictive ability by including the family term as main effect and in interaction with environments. This could help breeders to better screen RILs across families and environments while reducing phenotyping cost via sparse testing designs.</p>
</sec>
<sec sec-type="conclusion" id="s5">
<title>5 Conclusion</title>
<p>Genomic prediction-based sparse testing design is a promising approach to further maximize the applications of high-dimensional genomic data and predictive models toward improving cultivar development. The increase of testing environments in the early stages of the breeding pipeline can provide a more comprehensive assessment of genotype stability and adaptation which are fundamental for the precise selection of superior genotypes widely adapted to various environments. Various training set sizes and compositions, as well as prediction models, have been investigated. Overall, the training set size and the inclusion of O-RILs appear to be the main factors impacting prediction accuracy given a fixed training set size while the genetic diversity seems to be a secondary factor, except when it was minimized, returning the worst results. Additional studies investigating the real-world effectiveness of prediction accuracy based on genotype ranking and advancement breeding decisions can help determine the ideal protocols for GP-based sparse testing in soybean. In summary, GP-based sparse testing can either improve/increase testing capacity (represented as the number of genotypes and environments) at a fixed cost or substantially decrease the cost of a breeding pipeline at a fixed testing capacity.</p>
</sec>
</body>
<back>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found here: <ext-link ext-link-type="uri" xlink:href="https://soybase.org/SoyNAM/">https://soybase.org/SoyNAM/</ext-link>.</p>
</sec>
<sec id="s7">
<title>Author contributions</title>
<p>RP: Data curation, Formal Analysis, Methodology, Writing&#x2013;original draft. CC: Conceptualization, Methodology, Supervision, Writing&#x2013;review and editing. ER: Conceptualization, Supervision, Writing&#x2013;review and editing. VH-V: Conceptualization, Writing&#x2013;review and editing. CM: Conceptualization, Writing&#x2013;review and editing. DR: Writing&#x2013;review and editing. DJ: Conceptualization, Supervision, Writing&#x2013;review and editing.</p>
</sec>
<sec id="s8">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. University of Florida, Agronomy Department.</p>
</sec>
<sec sec-type="COI-statement" id="s9">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
<p>The author(s) declared that they were an editorial board member of Frontiers, at the time of submission. This had no impact on the peer review process and the final decision.</p>
</sec>
<sec sec-type="disclaimer" id="s10">
<title>Publisher&#x2019;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<sec id="s11">
<title>Supplementary material</title>
<p>The Supplementary Material for this article can be found online at: <ext-link ext-link-type="uri" xlink:href="https://www.frontiersin.org/articles/10.3389/fgene.2023.1269255/full#supplementary-material">https://www.frontiersin.org/articles/10.3389/fgene.2023.1269255/full&#x23;supplementary-material</ext-link>
</p>
<supplementary-material xlink:href="DataSheet1.docx" id="SM1" mimetype="application/docx" xmlns:xlink="http://www.w3.org/1999/xlink"/>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Atanda</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Govindan</surname>
<given-names>V.</given-names>
</name>
<name>
<surname>Singh</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Robbins</surname>
<given-names>K. R.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Bentley</surname>
<given-names>A. R.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Sparse testing using genomic prediction improves selection for breeding targets in elite spring wheat</article-title>. <source>Theor. Appl. Genet.</source> <volume>135</volume>, <fpage>1939</fpage>&#x2013;<lpage>1950</lpage>. <pub-id pub-id-type="doi">10.1007/s00122-022-04085-0</pub-id>
</citation>
</ref>
<ref id="B2">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Canella Vieira</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>P.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>The numbers game of soybean breeding in the United States</article-title>. <source>Crop Breed. Appl. Biotechnol.</source> <volume>21</volume>, <fpage>1</fpage>&#x2013;<lpage>15</lpage>. <pub-id pub-id-type="doi">10.1590/1984-70332021v21sa23</pub-id>
</citation>
</ref>
<ref id="B3">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Canella Vieira</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Persa</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Incorporation of soil-derived covariates in progeny testing and line selection to enhance genomic prediction accuracy in soybean breeding</article-title>. <source>Front. Genet.</source> <volume>13</volume>, <fpage>905824</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2022.905824</pub-id>
</citation>
</ref>
<ref id="B4">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Crespo Herrera</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Piepho</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>P&#xe9;rez Rodr&#xed;guez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Montesinos Lopez</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Burgue&#xf1;o</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2021</year>). <article-title>Genome-enabled prediction for sparse testing in multi-environmental wheat trials</article-title>. <source>Plant Genome</source> <volume>14</volume>, <fpage>e20151</fpage>. <pub-id pub-id-type="doi">10.1002/tpg2.20151</pub-id>
</citation>
</ref>
<ref id="B5">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Perez-Elizalde</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Miguel Cotes</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Viele</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2011</year>). <article-title>Bayesian estimation of the additive main effects and multiplicative interaction model</article-title>. <source>Crop Sci.</source> <volume>51</volume>, <fpage>1458</fpage>&#x2013;<lpage>1469</lpage>. <pub-id pub-id-type="doi">10.2135/cropsci2010.06.0343</pub-id>
</citation>
</ref>
<ref id="B6">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>P&#xe9;rez-Rodr&#xed;guez</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Cuevas</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>O.</given-names>
</name>
<name>
<surname>Jarqu&#xed;n</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>de los Campos</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2017</year>). <article-title>Genomic selection in plant breeding: methods, models, and perspectives</article-title>. <source>Trends Plant Sci.</source> <volume>22</volume>, <fpage>961</fpage>&#x2013;<lpage>975</lpage>. <pub-id pub-id-type="doi">10.1016/j.tplants.2017.08.011</pub-id>
</citation>
</ref>
<ref id="B7">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>de Leon</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Jannink</surname>
<given-names>J.-L.</given-names>
</name>
<name>
<surname>Edwards</surname>
<given-names>J. W.</given-names>
</name>
<name>
<surname>Kaeppler</surname>
<given-names>S. M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Introduction to a special issue on genotype by environment interaction</article-title>. <source>Crop Sci.</source> <volume>56</volume>, <fpage>2081</fpage>&#x2013;<lpage>2089</lpage>. <pub-id pub-id-type="doi">10.2135/cropsci2016.07.0002in</pub-id>
</citation>
</ref>
<ref id="B8">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Diers</surname>
<given-names>B. W.</given-names>
</name>
<name>
<surname>Specht</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Rainey</surname>
<given-names>K. M.</given-names>
</name>
<name>
<surname>Cregan</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Song</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Ramasubramanian</surname>
<given-names>V.</given-names>
</name>
<etal/>
</person-group> (<year>2018</year>). <article-title>Genetic architecture of soybean yield and agronomic traits</article-title>. <source>G3 Genes&#x7c;Genomes&#x7c;Genetics</source> <volume>8</volume>, <fpage>3367</fpage>&#x2013;<lpage>3375</lpage>. <pub-id pub-id-type="doi">10.1534/g3.118.200332</pub-id>
</citation>
</ref>
<ref id="B9">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Endelman</surname>
<given-names>J. B.</given-names>
</name>
</person-group> (<year>2011</year>). <article-title>Ridge regression and other kernels for genomic selection with R package rrBLUP</article-title>. <source>Plant Genome</source> <volume>4</volume>, <fpage>250</fpage>&#x2013;<lpage>255</lpage>. <pub-id pub-id-type="doi">10.3835/plantgenome2011.08.0024</pub-id>
</citation>
</ref>
<ref id="B10">
<citation citation-type="book">
<person-group person-group-type="author">
<name>
<surname>Gale</surname>
<given-names>F.</given-names>
</name>
<name>
<surname>Valdes</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Ash</surname>
<given-names>M.</given-names>
</name>
</person-group> (<year>2019</year>). <source>Interdependence of China, United States, and Brazil in soybean trade</source>. <publisher-name>Economic Research Service &#x2013; USDA</publisher-name>, <fpage>1</fpage>&#x2013;<lpage>48</lpage>.</citation>
</ref>
<ref id="B11">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Habier</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Fernando</surname>
<given-names>R. L.</given-names>
</name>
<name>
<surname>Dekkers</surname>
<given-names>J. C.</given-names>
</name>
</person-group> (<year>2007</year>). <article-title>The impact of genetic relationship information on genome-assisted breeding values</article-title>. <source>Genetics</source> <volume>177</volume> (<issue>4</issue>), <fpage>2389</fpage>&#x2013;<lpage>2397</lpage>. <pub-id pub-id-type="doi">10.1534/genetics.107.081190</pub-id>
</citation>
</ref>
<ref id="B12">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Habier</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Fernando</surname>
<given-names>R. L.</given-names>
</name>
<name>
<surname>Garrick</surname>
<given-names>D. J.</given-names>
</name>
</person-group> (<year>2013</year>). <article-title>Genomic BLUP decoded: a look into the black box of genomic prediction</article-title>. <source>Genetics</source> <volume>194</volume> (<issue>3</issue>), <fpage>597</fpage>&#x2013;<lpage>607</lpage>. <pub-id pub-id-type="doi">10.1534/genetics.113.152207</pub-id>
</citation>
</ref>
<ref id="B13">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>He</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Thistlethwaite</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Hayden</surname>
<given-names>M. J.</given-names>
</name>
<name>
<surname>Trethowan</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Daetwyler</surname>
<given-names>H. D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Improving selection efficiency of crop breeding with genomic prediction aided sparse phenotyping</article-title>. <source>Front. Plant Sci.</source> <volume>12</volume>, <fpage>735285</fpage>. <pub-id pub-id-type="doi">10.3389/fpls.2021.735285</pub-id>
</citation>
</ref>
<ref id="B14">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lacaze</surname>
<given-names>X.</given-names>
</name>
<name>
<surname>Du Cheyron</surname>
<given-names>P.</given-names>
</name>
<name>
<surname>Daucourt</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Lorgeou</surname>
<given-names>J.</given-names>
</name>
<etal/>
</person-group> (<year>2014a</year>). <article-title>A reaction norm model for genomic selection using high-dimensional genomic and environmental data</article-title>. <source>Theor. Appl. Genet.</source> <volume>127</volume>, <fpage>595</fpage>&#x2013;<lpage>607</lpage>. <pub-id pub-id-type="doi">10.1007/s00122-013-2243-1</pub-id>
</citation>
</ref>
<ref id="B15">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>de Leon</surname>
<given-names>N.</given-names>
</name>
<name>
<surname>Romay</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Bohn</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Buckler</surname>
<given-names>E. S.</given-names>
</name>
<name>
<surname>Ciampitti</surname>
<given-names>I.</given-names>
</name>
<etal/>
</person-group> (<year>2020a</year>). <article-title>Utility of climatic information via combining ability models to improve genomic prediction for yield within the genomes to fields maize project</article-title>. <source>Front. Genet.</source> <volume>11</volume>, <fpage>592769</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2020.592769</pub-id>
</citation>
</ref>
<ref id="B16">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Howard</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Beyene</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Gowda</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Martini</surname>
<given-names>J. W. R.</given-names>
</name>
<etal/>
</person-group> (<year>2020b</year>). <article-title>Genomic prediction enhanced sparse testing for multi-environment trials</article-title>. <source>G3 Genes&#x7c;Genomes&#x7c;Genetics</source> <volume>10</volume>, <fpage>2725</fpage>&#x2013;<lpage>2739</lpage>. <pub-id pub-id-type="doi">10.1534/g3.120.401349</pub-id>
</citation>
</ref>
<ref id="B17">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Kocak</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Posadas</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Hyma</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Jedlicka</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Graef</surname>
<given-names>G.</given-names>
</name>
<etal/>
</person-group> (<year>2014b</year>). <article-title>Genotyping by sequencing for genomic prediction in a soybean breeding population</article-title>. <source>BMC Genomics</source> <volume>15</volume>, <fpage>740</fpage>. <pub-id pub-id-type="doi">10.1186/1471-2164-15-740</pub-id>
</citation>
</ref>
<ref id="B18">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Koester</surname>
<given-names>R. P.</given-names>
</name>
<name>
<surname>Skoneczka</surname>
<given-names>J. A.</given-names>
</name>
<name>
<surname>Cary</surname>
<given-names>T. R.</given-names>
</name>
<name>
<surname>Diers</surname>
<given-names>B. W.</given-names>
</name>
<name>
<surname>Ainsworth</surname>
<given-names>E. A.</given-names>
</name>
</person-group> (<year>2014</year>). <article-title>Historical gains in soybean (Glycine max Merr.) seed yield are driven by linear increases in light interception, energy conversion, and partitioning efficiencies</article-title>. <source>J. Exp. Bot.</source> <volume>65</volume>, <fpage>3311</fpage>&#x2013;<lpage>3321</lpage>. <pub-id pub-id-type="doi">10.1093/jxb/eru187</pub-id>
</citation>
</ref>
<ref id="B19">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Ma</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Reif</surname>
<given-names>J. C.</given-names>
</name>
<name>
<surname>Jiang</surname>
<given-names>Y.</given-names>
</name>
<name>
<surname>Wen</surname>
<given-names>Z.</given-names>
</name>
<name>
<surname>Wang</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Liu</surname>
<given-names>Z.</given-names>
</name>
<etal/>
</person-group> (<year>2016</year>). <article-title>Potential of marker selection to increase prediction accuracy of genomic selection in soybean (Glycine max L.)</article-title>. <source>Mol. Breed.</source> <volume>36</volume>, <fpage>113</fpage>. <pub-id pub-id-type="doi">10.1007/s11032-016-0504-9</pub-id>
</citation>
</ref>
<ref id="B20">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Meuwissen</surname>
<given-names>T. H. E.</given-names>
</name>
<name>
<surname>Hayes</surname>
<given-names>B. J.</given-names>
</name>
<name>
<surname>Goddard</surname>
<given-names>M. E.</given-names>
</name>
</person-group> (<year>2001</year>). <article-title>Prediction of total genetic value using genome-wide dense marker maps</article-title>. <source>Genetics</source> <volume>157</volume>, <fpage>1819</fpage>&#x2013;<lpage>1829</lpage>. <pub-id pub-id-type="doi">10.1093/genetics/157.4.1819</pub-id>
</citation>
</ref>
<ref id="B21">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Montesinos Lopez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Mosqueda Gonz&#xe1;lez</surname>
<given-names>B. A.</given-names>
</name>
<name>
<surname>Salinas Ruiz</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Montesinos L&#xf3;pez</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Crossa</surname>
<given-names>J.</given-names>
</name>
</person-group> (<year>2023a</year>). <article-title>Sparse multi-trait genomic prediction under balanced incomplete block design</article-title>. <source>Plant Genome</source> <volume>16</volume>, <fpage>e20305</fpage>. <pub-id pub-id-type="doi">10.1002/tpg2.20305</pub-id>
</citation>
</ref>
<ref id="B22">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Montesinos Lopez</surname>
<given-names>O. A.</given-names>
</name>
<name>
<surname>Saint Pierre</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Gezan</surname>
<given-names>S. A.</given-names>
</name>
<name>
<surname>Bentley</surname>
<given-names>A. R.</given-names>
</name>
<name>
<surname>Mosqueda-Gonz&#xe1;lez</surname>
<given-names>B. A.</given-names>
</name>
<name>
<surname>Montesinos-L&#xf3;pez</surname>
<given-names>A.</given-names>
</name>
<etal/>
</person-group> (<year>2023b</year>). <article-title>Optimizing sparse testing for genomic prediction of plant breeding crops</article-title>. <source>Genes (Basel)</source> <volume>14</volume>, <fpage>927</fpage>. <pub-id pub-id-type="doi">10.3390/genes14040927</pub-id>
</citation>
</ref>
<ref id="B23">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Persa</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Iwata</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2020</year>). <article-title>Use of family structure information in interaction with environments for leveraging genomic prediction models</article-title>. <source>Crop J.</source> <volume>8</volume>, <fpage>843</fpage>&#x2013;<lpage>854</lpage>. <pub-id pub-id-type="doi">10.1016/j.cj.2020.06.004</pub-id>
</citation>
</ref>
<ref id="B24">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rincker</surname>
<given-names>K.</given-names>
</name>
<name>
<surname>Nelson</surname>
<given-names>R.</given-names>
</name>
<name>
<surname>Specht</surname>
<given-names>J.</given-names>
</name>
<name>
<surname>Sleper</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Cary</surname>
<given-names>T.</given-names>
</name>
<name>
<surname>Cianzio</surname>
<given-names>S. R.</given-names>
</name>
<etal/>
</person-group> (<year>2014</year>). <article-title>Genetic improvement of U.S. soybean in maturity groups II, III, and IV</article-title>. <source>Crop Sci.</source> <volume>54</volume>, <fpage>1419</fpage>&#x2013;<lpage>1432</lpage>. <pub-id pub-id-type="doi">10.2135/cropsci2013.10.0665</pub-id>
</citation>
</ref>
<ref id="B25">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Rowntree</surname>
<given-names>S. C.</given-names>
</name>
<name>
<surname>Suhre</surname>
<given-names>J. J.</given-names>
</name>
<name>
<surname>Weidenbenner</surname>
<given-names>N. H.</given-names>
</name>
<name>
<surname>Wilson</surname>
<given-names>E. W.</given-names>
</name>
<name>
<surname>Davis</surname>
<given-names>V. M.</given-names>
</name>
<name>
<surname>Naeve</surname>
<given-names>S. L.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Genetic gain &#xd7; management interactions in soybean: I. Planting date</article-title>. <source>Crop Sci.</source> <volume>53</volume>, <fpage>1128</fpage>&#x2013;<lpage>1138</lpage>. <pub-id pub-id-type="doi">10.2135/cropsci2012.03.0157</pub-id>
</citation>
</ref>
<ref id="B26">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Hyten</surname>
<given-names>D. L.</given-names>
</name>
<name>
<surname>Jia</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Quigley</surname>
<given-names>C. V.</given-names>
</name>
<name>
<surname>Fickus</surname>
<given-names>E. W.</given-names>
</name>
<name>
<surname>Nelson</surname>
<given-names>R. L.</given-names>
</name>
<etal/>
</person-group> (<year>2013</year>). <article-title>Development and evaluation of SoySNP50K, a high-density genotyping array for soybean</article-title>. <source>PLoS One</source> <volume>8</volume>, <fpage>1</fpage>&#x2013;<lpage>12</lpage>. <pub-id pub-id-type="doi">10.1371/journal.pone.0054985</pub-id>
</citation>
</ref>
<ref id="B27">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Song</surname>
<given-names>Q.</given-names>
</name>
<name>
<surname>Yan</surname>
<given-names>L.</given-names>
</name>
<name>
<surname>Quigley</surname>
<given-names>C.</given-names>
</name>
<name>
<surname>Fickus</surname>
<given-names>E.</given-names>
</name>
<name>
<surname>Wei</surname>
<given-names>H.</given-names>
</name>
<name>
<surname>Chen</surname>
<given-names>L.</given-names>
</name>
<etal/>
</person-group> (<year>2020</year>). <article-title>Soybean BARCSoySNP6K: an assay for soybean genetics and breeding research</article-title>. <source>Plant J.</source> <volume>104</volume>, <fpage>800</fpage>&#x2013;<lpage>811</lpage>. <pub-id pub-id-type="doi">10.1111/tpj.14960</pub-id>
</citation>
</ref>
<ref id="B28">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Specht</surname>
<given-names>J. E.</given-names>
</name>
<name>
<surname>Hume</surname>
<given-names>D. J.</given-names>
</name>
<name>
<surname>Kumudini</surname>
<given-names>S. V.</given-names>
</name>
</person-group> (<year>1999</year>). <article-title>Soybean yield potential&#x2014;A genetic and physiological perspective</article-title>. <source>Crop Sci.</source> <volume>39</volume>, <fpage>1560</fpage>&#x2013;<lpage>1570</lpage>. <pub-id pub-id-type="doi">10.2135/cropsci1999.3961560x</pub-id>
</citation>
</ref>
<ref id="B29">
<citation citation-type="book">
<collab>United States Department of Agriculture</collab> (<year>2022a</year>). <source>Oilseeds: world markets and trade</source>. <publisher-loc>Washington, DC</publisher-loc>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://apps.fas.usda.gov/psdonline/circulars/oilseeds.pdf">https://apps.fas.usda.gov/psdonline/circulars/oilseeds.pdf</ext-link> (Accessed April 25, 2023)</comment>.</citation>
</ref>
<ref id="B30">
<citation citation-type="journal">
<collab>United States Department of Agriculture</collab> (<year>2022b</year>). <article-title>Charts and maps: field crops</article-title>. <source>Natl. Agric. Stat. Serv</source>. <comment>Available at: <ext-link ext-link-type="uri" xlink:href="https://www.nass.usda.gov/Charts_and_Maps/Field_Crops">https://www.nass.usda.gov/Charts_and_Maps/Field_Crops</ext-link> (Accessed May 3, 2023)</comment>.</citation>
</ref>
<ref id="B31">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Virdi</surname>
<given-names>K. S.</given-names>
</name>
<name>
<surname>Sreekanta</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Dobbels</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Haaning</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
<name>
<surname>Stupar</surname>
<given-names>R. M.</given-names>
</name>
<etal/>
</person-group> (<year>2023</year>). <article-title>Branch angle and leaflet shape are associated with canopy coverage in soybean</article-title>. <source>Plant Genome</source> <volume>16</volume>, <fpage>e20304</fpage>. <pub-id pub-id-type="doi">10.1002/tpg2.20304</pub-id>
</citation>
</ref>
<ref id="B32">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Wartha</surname>
<given-names>C. A.</given-names>
</name>
<name>
<surname>Lorenz</surname>
<given-names>A. J.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>Implementation of genomic selection in public-sector plant breeding programs: current status and opportunities</article-title>. <source>Crop Breed. Appl. Biotechnol.</source> <volume>21</volume>. <pub-id pub-id-type="doi">10.1590/1984-70332021v21sa28</pub-id>
</citation>
</ref>
<ref id="B33">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Widener</surname>
<given-names>S.</given-names>
</name>
<name>
<surname>Graef</surname>
<given-names>G.</given-names>
</name>
<name>
<surname>Lipka</surname>
<given-names>A. E.</given-names>
</name>
<name>
<surname>Jarquin</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2021</year>). <article-title>An assessment of the factors influencing the prediction accuracy of genomic prediction models across multiple environments</article-title>. <source>Front. Genet.</source> <volume>12</volume>, <fpage>689319</fpage>. <pub-id pub-id-type="doi">10.3389/fgene.2021.689319</pub-id>
</citation>
</ref>
<ref id="B34">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xavier</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Habier</surname>
<given-names>D.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>A new approach fits multivariate genomic prediction models efficiently</article-title>. <source>Genet. Sel. Evol.</source> <volume>54</volume>, <fpage>45</fpage>. <pub-id pub-id-type="doi">10.1186/s12711-022-00730-w</pub-id>
</citation>
</ref>
<ref id="B35">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Xavier</surname>
<given-names>A.</given-names>
</name>
<name>
<surname>Muir</surname>
<given-names>W. M.</given-names>
</name>
<name>
<surname>Rainey</surname>
<given-names>K. M.</given-names>
</name>
</person-group> (<year>2016</year>). <article-title>Assessing predictive properties of genome-wide selection in soybeans</article-title>. <source>G3 Genes&#x7c;Genomes&#x7c;Genetics</source> <volume>6</volume>, <fpage>2611</fpage>&#x2013;<lpage>2616</lpage>. <pub-id pub-id-type="doi">10.1534/g3.116.032268</pub-id>
</citation>
</ref>
<ref id="B36">
<citation citation-type="journal">
<person-group person-group-type="author">
<name>
<surname>Yoosefzadeh-Najafabadi</surname>
<given-names>M.</given-names>
</name>
<name>
<surname>Rajcan</surname>
<given-names>I.</given-names>
</name>
</person-group> (<year>2022</year>). <article-title>Six decades of soybean breeding in Ontario, Canada: a tradition of innovation</article-title>. <source>Can. J. Plant Sci.</source> <volume>103</volume>, <fpage>333</fpage>&#x2013;<lpage>352</lpage>. <pub-id pub-id-type="doi">10.1139/cjps-2022-0183</pub-id>
</citation>
</ref>
</ref-list>
</back>
</article>