<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.3 20070202//EN" "journalpublishing.dtd">
<article xml:lang="EN" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article">
<front>
<journal-meta>
<journal-id journal-id-type="publisher-id">Front. Neurosci.</journal-id>
<journal-title>Frontiers in Neuroscience</journal-title>
<abbrev-journal-title abbrev-type="pubmed">Front. Neurosci.</abbrev-journal-title>
<issn pub-type="epub">1662-453X</issn>
<publisher>
<publisher-name>Frontiers Media S.A.</publisher-name>
</publisher>
</journal-meta>
<article-meta>
<article-id pub-id-type="doi">10.3389/fnins.2024.1346374</article-id>
<article-categories>
<subj-group subj-group-type="heading">
<subject>Neuroscience</subject>
<subj-group>
<subject>Original Research</subject>
</subj-group>
</subj-group>
</article-categories>
<title-group>
<article-title>DTDNet: Dynamic Target Driven Network for pedestrian trajectory prediction</article-title>
</title-group>
<contrib-group>
<contrib contrib-type="author">
<name><surname>Liu</surname> <given-names>Shaohua</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2308806/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Sun</surname> <given-names>Jingkai</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-original-draft/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Yao</surname> <given-names>Pengfei</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="aff" rid="aff3"><sup>3</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Zhu</surname> <given-names>Yinglong</given-names></name>
<xref ref-type="aff" rid="aff1"><sup>1</sup></xref>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2554938/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/validation/"/>
<role content-type="https://credit.niso.org/contributor-roles/visualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
<role content-type="https://credit.niso.org/contributor-roles/software/"/>
</contrib>
<contrib contrib-type="author" corresp="yes">
<name><surname>Mao</surname> <given-names>Tianlu</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<xref ref-type="corresp" rid="c001"><sup>&#x0002A;</sup></xref>
<uri xlink:href="http://loop.frontiersin.org/people/2308827/overview"/>
<role content-type="https://credit.niso.org/contributor-roles/conceptualization/"/>
<role content-type="https://credit.niso.org/contributor-roles/methodology/"/>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
<contrib contrib-type="author">
<name><surname>Wang</surname> <given-names>Zhaoqi</given-names></name>
<xref ref-type="aff" rid="aff2"><sup>2</sup></xref>
<role content-type="https://credit.niso.org/contributor-roles/project-administration/"/>
<role content-type="https://credit.niso.org/contributor-roles/supervision/"/>
<role content-type="https://credit.niso.org/contributor-roles/writing-review-editing/"/>
</contrib>
</contrib-group>
<aff id="aff1"><sup>1</sup><institution>School of Electronic Engineering, Beijing University of Posts and Telecommunications</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff2"><sup>2</sup><institution>Beijing Key Laboratory of Mobile Computing and Pervasive Device, Institute of Computing Technology, Chinese Academy of Sciences</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<aff id="aff3"><sup>3</sup><institution>School of Computer Science and Technology, University of Chinese Academy of Science</institution>, <addr-line>Beijing</addr-line>, <country>China</country></aff>
<author-notes>
<fn fn-type="edited-by"><p>Edited by: Krishna Kumar Mohbey, Central University of Rajasthan, India</p></fn>
<fn fn-type="edited-by"><p>Reviewed by: Neha Sharma, Delhi Technological University, India</p>
<p>Zhe Huang, University of Illinois at Urbana-Champaign, United States</p></fn>
<corresp id="c001">&#x0002A;Correspondence: Tianlu Mao <email>ltm&#x00040;ict.ac.cn</email></corresp>
</author-notes>
<pub-date pub-type="epub">
<day>30</day>
<month>04</month>
<year>2024</year>
</pub-date>
<pub-date pub-type="collection">
<year>2024</year>
</pub-date>
<volume>18</volume>
<elocation-id>1346374</elocation-id>
<history>
<date date-type="received">
<day>29</day>
<month>11</month>
<year>2023</year>
</date>
<date date-type="accepted">
<day>11</day>
<month>04</month>
<year>2024</year>
</date>
</history>
<permissions>
<copyright-statement>Copyright &#x000A9; 2024 Liu, Sun, Yao, Zhu, Mao and Wang.</copyright-statement>
<copyright-year>2024</copyright-year>
<copyright-holder>Liu, Sun, Yao, Zhu, Mao and Wang</copyright-holder>
<license xlink:href="http://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (CC BY). The use, distribution or reproduction in other forums is permitted, provided the original author(s) and the copyright owner(s) are credited and that the original publication in this journal is cited, in accordance with accepted academic practice. No use, distribution or reproduction is permitted which does not comply with these terms.</p></license>
</permissions>
<abstract>
<p>Predicting the trajectories of pedestrians is an important and difficult task for many applications, such as robot navigation and autonomous driving. Most of the existing methods believe that an accurate prediction of the pedestrian intention can improve the prediction quality. These works tend to predict a fixed destination coordinate as the agent intention and predict the future trajectory accordingly. However, in the process of moving, the intention of a pedestrian could be a definite location or a general direction and area, and may change dynamically with the changes in the surroundings. Thus, regarding the agent intention as a fixed 2-d coordinate is insufficient to improve the future trajectory prediction. To address this problem, we propose Dynamic Target Driven Network for pedestrian trajectory prediction (DTDNet), which employs a multi-precision pedestrian intention analysis module to capture this dynamic. To ensure that this extracted feature contains comprehensive intention information, we design three sub-tasks: predicting coarse-precision endpoint coordinate, predicting fine-precision endpoint coordinate and scoring scene sub-regions. In addition, we propose an original multi-precision trajectory data extraction method to achieve multi-resolution representation of future intention and make it easier to extract local scene information. We compare our model with previous methods on two publicly available datasets (ETH-UCY and Stanford Drone Dataset). The experimental results show that our DTDNet achieves better trajectory prediction performance, and conducts better pedestrian intention feature representation.</p></abstract>
<kwd-group>
<kwd>multimodal trajectory prediction</kwd>
<kwd>pedestrian intention prediction</kwd>
<kwd>multi-precision motion prediction</kwd>
<kwd>multi-task neural network</kwd>
<kwd>trajectory endpoint prediction</kwd>
</kwd-group>
<counts>
<fig-count count="5"/>
<table-count count="6"/>
<equation-count count="20"/>
<ref-count count="31"/>
<page-count count="11"/>
<word-count count="7459"/>
</counts>
<custom-meta-wrap>
<custom-meta>
<meta-name>section-at-acceptance</meta-name>
<meta-value>Decision Neuroscience</meta-value>
</custom-meta>
</custom-meta-wrap>
</article-meta>
</front>
<body>
<sec sec-type="intro" id="s1">
<title>1 Introduction</title>
<p>Trajectory prediction is an essential research area that has various applications in autonomous driving (Bennewitz et al., <xref ref-type="bibr" rid="B2">2005</xref>; Ma et al., <xref ref-type="bibr" rid="B13">2019</xref>; Chandra et al., <xref ref-type="bibr" rid="B3">2020</xref>), robot navigation (Rasouli et al., <xref ref-type="bibr" rid="B19">2019</xref>), and surveillance systems (Oh et al., <xref ref-type="bibr" rid="B16">2011</xref>; Sultani et al., <xref ref-type="bibr" rid="B23">2018</xref>). For instance, in autonomous driving, vehicles need to estimate the future movements of pedestrians to avoid collisions and plan a safe driving path.</p>
<p>One of the basic challenges for trajectory prediction is to analyze the pedestrian future intention in the changing context, such as whether the pedestrian intends to cross the road before or after a car passes. This analysis can provide useful information for trajectory prediction. Recently, some works have considered the agent intention prediction in the trajectory prediction task, such as PECNet (Mangalam et al., <xref ref-type="bibr" rid="B14">2020</xref>), TNT (Zhao et al., <xref ref-type="bibr" rid="B31">2021</xref>), DenseTNT (Gu et al., <xref ref-type="bibr" rid="B5">2021</xref>), and so on. However, these methods simplify the problem by assuming that the agent intention endpoint, which reveals the agent movement intention, remains constant during the prediction range.</p>
<p>In fact, predicting the endpoint coordinates of pedestrians is a very challenging task. Pedestrians will dynamically adjust their intent endpoint coordinates in response to the change of scene information in different regions. As shown in <xref ref-type="fig" rid="F1">Figure 1</xref>, the pedestrian in the red frame is the target pedestrian. In the left image, the vehicle on the right is parked at the upper right of the image and has no tendency to move forward. At this time, the short-term movement target of the pedestrian is the red star below the vehicle. However, during the movement of the pedestrian, the vehicle starts to move forward, blocking the original movement target of the pedestrian. Due to environmental changes, pedestrians must change their original intention and move toward the green star at the upper right. It is important to dynamically analyze the pedestrian&#x00027;s intent coordinate by combining the pedestrian&#x00027;s motion state and scene characteristics.</p>
<fig id="F1" position="float">
<label>Figure 1</label>
<caption><p>Dynamic change of the pedestrian intention.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-18-1346374-g0001.tif"/>
</fig>
<p>In addition, when modeling the future intention of the pedestrian, existing methods generally use the multi-layer perceptrons (MLPs) to predict a 2-d coordinate as the intention feature. Huang et al. (<xref ref-type="bibr" rid="B10">2021</xref>) models the intention with a Mutable Intention Filter to address the drift in long-term pedestrian trajectory prediction, and its experiment demonstrates the goal prediction is changing during the prediction process. But there are limitations in the work. Firstly, this work assumes that all targets are located at the scene edges, which is unrealistic. And it models the intention with specific 2-D locations. The pedestrian&#x00027;s movement intention information should not be modeled as a specific physical coordinate, and the observable coordinate cannot fully represent the pedestrian&#x00027;s intention to help predict the future trajectory as in <xref ref-type="fig" rid="F1">Figure 1</xref>.</p>
<p>In this paper, we model the intention as features that combine both fine-precision destination and coarse-precision region representation, and could be dynamically changed in the prediction process, considering the dynamic changes caused by environment and pedestrian. To extract a feasible dynamic intention feature, we propose a multi-precision pedestrian intention analysis module, which dynamically predicts intent from the scene information and history trajectory. We generate the coarse-precision coordinate from the history trajectory, then we use the scene heatmap and the coarse-precision coordinate to calculate the local dynamic feature. By combining the local dynamic feature and the coarse-precision coordinate, we predict agent intention feature as an assistance to predicting the future trajectory. In addition, three sub-tasks including prediction of coarse-precision endpoint coordinate, fine-precision endpoint coordinate and scene sub-regions scoring are proposed to help train the feasible dynamic agent intent extraction module.</p>
<p>We propose Dynamic Target Driven Network for pedestrian trajectory prediction (DTDNet). First, we use a motion pattern encoding module to extract movement patterns from pedestrian historical trajectories. After that, we use multi-precision pedestrian intention analysis module to extract the feasible intention based on multi-precision feature input. At the same time, multi-precision intention analysis sub-tasks are introduced to aid pedestrian intent information extraction. Finally, a pedestrian trajectory decoding module based on the CVAE generation framework combines pedestrian movement patterns and scene information to predict pedestrian intent coordinates dynamically. The contributions of this paper are as follows:</p>
<list list-type="order">
<list-item><p>We discuss the dynamic changing attribute of pedestrian intention prediction process, and propose a novel module to extract the dynamic intention feature accordingly. This module encodes the pedestrian future intention at each time step iteratively with scene information, and we propose a multi-task structure to aid the feature learning process with three related subtasks.</p></list-item>
<list-item><p>We propose a novel multi-precision pedestrian trajectory data representation method to estimate the multi-precision intention, including three aspects: coarse-precision coordinates, fine-precision coordinates, and local scene information.</p></list-item>
<list-item><p>We design a new trajectory prediction model DTDNet, which conducts the prediction with dynamic intention modeling and multi-precision history data. Qualitative and quantitative experiments show that this model outperforms current methods and predicts endpoint coordinates closer to the future endpoint.</p></list-item>
</list>
</sec>
<sec id="s2">
<title>2 Related work</title>
<sec>
<title>2.1 Trajectory prediction</title>
<p>Early research on trajectory prediction is based on hand-crafted rules and energy potentials. Helbing and Molnar (<xref ref-type="bibr" rid="B7">1995</xref>) model the force between pedestrians by attractive force and repulsive force. However, with the limitation of the hand-crafted functions, the previous approaches cannot model the complicated interactions in crowded scenarios. Since trajectory prediction is a time series prediction task, many data-driven methods (Oliveira et al., <xref ref-type="bibr" rid="B17">2021</xref>; Zhang et al., <xref ref-type="bibr" rid="B30">2022</xref>) have been proposed to solve this problem in recent years. Alahi et al. (<xref ref-type="bibr" rid="B1">2016</xref>) propose one of the earliest deep learning models for trajectory prediction, which uses a grid-based &#x0201C;social pooling&#x0201D; layer to aggregate the hidden state of the pedestrians in the neighborhood. Gupta et al. (<xref ref-type="bibr" rid="B6">2018</xref>) also use the pooling-based method and propose a &#x0201C;pooling module&#x0201D; to share information of all the pedestrians in the whole scene. Vemula et al. (<xref ref-type="bibr" rid="B25">2018</xref>) and Kosaraju et al. (<xref ref-type="bibr" rid="B11">2019</xref>) introduce the attention mechanism to assign different importance to different agents. Recent works (Huang et al., <xref ref-type="bibr" rid="B9">2019</xref>; Hu et al., <xref ref-type="bibr" rid="B8">2020</xref>; Mohamed et al., <xref ref-type="bibr" rid="B15">2020</xref>; Tao et al., <xref ref-type="bibr" rid="B24">2020</xref>) are all graph-based methods that use graph neural networks to model the interactions among the pedestrians.</p>
</sec>
<sec>
<title>2.2 Human-scene interaction</title>
<p>Pedestrian motion is not only affected by surrounding pedestrians, but the layout features of the scene also limit the movement space of pedestrians. Therefore, effectively extracting scene information plays a crucial role in trajectory prediction. Some works (Vemula et al., <xref ref-type="bibr" rid="B25">2018</xref>; Huang et al., <xref ref-type="bibr" rid="B9">2019</xref>) use VGGNet to encode a large scene&#x00027;s complete overhead image information. The model can learn any scene information and use the visual attention mechanism to assign important spatial regions to pedestrians. To incorporate scene category information, Yao et al. (<xref ref-type="bibr" rid="B27">2021</xref>) use a semantic segmentation model to process scene pictures. Pixel-level scene category information can be obtained by using semantic segmentation information. However, this method still has ambiguous information and does not know whether pedestrians in this category could move forward. Wang et al. (<xref ref-type="bibr" rid="B26">2022</xref>) proposed a heat map construction method based on historical trajectory statistics and used the GLU module to model scene information continuity.</p>
</sec>
<sec>
<title>2.3 Human intention prediction</title>
<p>Pedestrians have subjective intentions to guide themselves to reach their expected goals. Recently, some researchers have begun to research the endpoint prediction of pedestrians. Mangalam et al. (<xref ref-type="bibr" rid="B14">2020</xref>) used the CVAE module to predict the endpoint information and then predicted the complete trajectory. Different from the previous model, Lerner et al. (<xref ref-type="bibr" rid="B12">2007</xref>) used the bidirectional trajectory fitting method to predict the complete trajectory in the stage of generating the complete trajectory. Zhao et al. (<xref ref-type="bibr" rid="B31">2021</xref>) propose to set up multiple candidate endpoints in the region where pedestrians are likely to reach and score different candidate endpoints based on pedestrian characteristics. Gu et al. (<xref ref-type="bibr" rid="B5">2021</xref>) improved TNT (Zhao et al., <xref ref-type="bibr" rid="B31">2021</xref>) and proposed a trajectory prediction method without pre-defining candidate targets. It dramatically improves the performance of target estimation without relying on heuristic predefined target quality. Unlike previous work that only modeled a single long-term objective, Robicquet et al. (<xref ref-type="bibr" rid="B20">2016</xref>) proposed a step-wise objective-driven network for trajectory prediction that evaluates and uses the goal at multiple time scales.</p>
</sec>
</sec>
<sec id="s3">
<title>3 Method</title>
<p>In this section, we introduce the structure of our DTDNet model, as shown in <xref ref-type="fig" rid="F2">Figure 2</xref>. At first, we present the construction of multi-precision data. Then we discuss the three sub-networks of DTDNet: the motion pattern encoding module, multi-precision pedestrian intention analysis module and trajectory decoding module.</p>
<fig id="F2" position="float">
<label>Figure 2</label>
<caption><p>Overview of our proposed DTDNet. Taking fine-precision, coarse-precision pedestrian coordinate and local scene information as input, the DTDNet consists of three parts: a motion pattern encoding module based on pedestrian historical trajectories (blue), a dynamic multi-task intent analysis module based on multi-precision feature input (orange), and a multi-modal trajectory decoding module based on the CVAE framework (green and brown). Green part in the CVAE module is used only in the training stage.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-18-1346374-g0002.tif"/>
</fig>
<sec>
<title>3.1 Formulations</title>
<p>We assume that there are <italic>N</italic> pedestrians in the scene <italic>I</italic>, the position coordinates of pedestrian <italic>i</italic> at time step <italic>t</italic> is denoted as <inline-formula><mml:math id="M1"><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:math></inline-formula>. Our model uses historical trajectories <inline-formula><mml:math id="M2"><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>P</mml:mi></mml:mstyle><mml:mrow><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>i</mml:mi></mml:mstyle><mml:mo>&#x0005F;</mml:mo><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>h</mml:mi></mml:mstyle></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x0007B;</mml:mo><mml:msubsup><mml:mi>P</mml:mi><mml:mi>i</mml:mi><mml:mi>t</mml:mi></mml:msubsup><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mo stretchy='false'>[</mml:mo><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>]</mml:mo><mml:mo>&#x0007D;</mml:mo></mml:mrow></mml:math></inline-formula> to predict the future locations <inline-formula><mml:math id="M3"><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi><mml:mstyle 
class="text"><mml:mtext>_</mml:mtext></mml:mstyle><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mover accent="true"><mml:mrow><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo>^</mml:mo></mml:mover><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>[</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo>]</mml:mo></mml:mrow></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:math></inline-formula> and minimize the distance between prediction and future trajectory <inline-formula><mml:math id="M4"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mstyle class="text"><mml:mtext class="textrm" mathvariant="normal">i</mml:mtext></mml:mstyle><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>N</mml:mi></mml:mrow></mml:munderover><mml:mo>&#x02225;</mml:mo><mml:msubsup><mml:mrow><mml:mover 
accent="true"><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>-</mml:mo><mml:msubsup><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:msub><mml:mrow><mml:mo>&#x02225;</mml:mo></mml:mrow><mml:mrow><mml:mn>2</mml:mn></mml:mrow></mml:msub></mml:math></inline-formula>.</p>
</sec>
<sec>
<title>3.2 Multi-precision data construction</title>
<p>We get three kinds of data for the model to perform the multi-precision modeling, namely fine-precision coordinates, coarse-precision coordinates, and dynamic local scene information.</p>
<sec>
<title>3.2.1 Coarse precision coordinate generation</title>
<p>A schematic diagram of coarse-precision coordinates is shown on the left in <xref ref-type="fig" rid="F2">Figure 2</xref>, the model divides the global scene into multiple sub-regions. The region coordinates are the input coarse-precision coordinates, which retain the physical information of the scene location and are easy to combine with the scene information.</p>
<p>First, we collect coordinate ranges (<italic>x</italic><sub><italic>min</italic></sub>, <italic>x</italic><sub><italic>max</italic></sub>, <italic>y</italic><sub><italic>min</italic></sub>, <italic>y</italic><sub><italic>max</italic></sub>) of different scenes based on the training data. Following the principle of equal spacing, we get the segmentation space of each region according to the set division resolution <italic>R</italic> &#x0003D; <italic>m</italic>&#x000D7;<italic>n</italic>. Furthermore, we could use the pedestrian&#x00027;s current position <italic>P</italic><sub><italic>i</italic></sub>, the coordinate range of the scene (<italic>x</italic><sub><italic>min</italic></sub>, <italic>x</italic><sub><italic>max</italic></sub>, <italic>y</italic><sub><italic>min</italic></sub>, <italic>y</italic><sub><italic>max</italic></sub>), and the length of the region to calculate the coarse-precision coordinates. By using <xref ref-type="fig" rid="F6">Algorithm 1</xref>, we obtain the pedestrians&#x00027; coarse-precision coordinates <italic>PR</italic>.</p>
<fig id="F6" position="float">
<label>Algorithm 1</label>
<caption><p>Strategy of coarse-precision coordinate generation.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-18-1346374-g0006.tif"/>
</fig>
</sec>
<sec>
<title>3.2.2 Fine precision coordinate generation</title>
<p>After obtaining the coarse-precision coordinates of pedestrians, we perform data pre-processing on both fine-precision coordinates and coarse-precision coordinates. To increase the generation capability of the model, we set the position (<italic>x</italic><sub><italic>T</italic><sub><italic>obs</italic></sub></sub>, <italic>y</italic><sub><italic>T</italic><sub><italic>obs</italic></sub></sub>) of the target pedestrian at the last observation time step as the origin, and convert the absolute position into relative position according to the position of origin.</p>
<p>We adopt the same data pre-processing method as Trajectron&#x0002B;&#x0002B; (Salzmann et al., <xref ref-type="bibr" rid="B21">2020</xref>). In addition to the position coordinates, the input data also uses the first-order derivative and second-order derivative of position to calculate the speed information and acceleration information in both <italic>x</italic> and <italic>y</italic> directions. And we augment the training dataset by rotating all trajectories every 15 degrees around the origin point.</p>
</sec>
<sec>
<title>3.2.3 Dynamic scene information</title>
<p>Most existing methods use semantic segmentation of the scene image to model scene information. Although semantic segmentation information has proved useful in 3D stereo reconstruction and other fields, this information is ambiguous and lacks the interaction semantics between scenes and pedestrians. For example, the lawn beside the road is defined the same as the lawn in the park. However, the lawn in the park is allowed for pedestrians to walk, and the roadside lawn is generally prohibited for pedestrians. The two have the same semantic information, but different social rules.</p>
<p>To solve the ambiguity of pedestrian interaction with semantic segmentation and make the scene information guide pedestrian future movement more accurately, DTDNet uses the method of STHGLU (Wang et al., <xref ref-type="bibr" rid="B26">2022</xref>) to get the probability heatmap of each scene generated from historical trajectory collections. This method could provide the distribution of pedestrian movable area and the corresponding probability information. Coarse-precision coordinates keep the spatial location information of the scene and are combined with the regional information to obtain the local scene information.</p>
<p>Assuming that the coarse precision of the scene is <italic>R</italic> &#x0003D; <italic>m</italic>&#x000D7;<italic>n</italic>, we divide each sub-region with the precision of 9 &#x000D7; 9, and obtain the global scene information with the precision of <italic>R</italic> &#x0003D; 81 &#x000D7; <italic>m</italic>&#x000D7;<italic>n</italic>. At each moment, the model dynamically models the local scene <italic>s</italic> based on the pedestrian coarse-precision coordinate, provides information to guide the pedestrian future movement and avoid the pedestrian moving into the unreasonable area.</p>
</sec>
</sec>
<sec>
<title>3.3 Motion pattern encoding sub-network</title>
<p>As shown in the upper blue part of <xref ref-type="fig" rid="F2">Figure 2</xref>, the backbone of motion pattern encoding module is GRU, which inputs the fine-precision coordinates of pedestrians to model the motion pattern feature of pedestrians.</p>
<p>In <xref ref-type="disp-formula" rid="E1">Equation 1</xref>, we encode three input trajectory data including the position <italic>x</italic><sup><italic>t</italic></sup>, <italic>y</italic><sup><italic>t</italic></sup>, velocity &#x00394;<italic>x</italic><sup><italic>t</italic></sup>, &#x00394;<italic>y</italic><sup><italic>t</italic></sup> and acceleration <italic>ax</italic><sup><italic>t</italic></sup>, <italic>ay</italic><sup><italic>t</italic></sup> to the pedestrian motion hidden representation <italic>e</italic><sup><italic>t</italic></sup>. In addition to the pedestrian motion state <italic>e</italic><sup><italic>t</italic></sup>, as shown in <xref ref-type="disp-formula" rid="E2">Equation 2</xref>, the model includes the pedestrian target intent vector <italic>g</italic><sup><italic>t</italic></sup>. At each moment, the endpoint decoding module uses the MLP as <italic>f</italic><sub><italic>goal</italic></sub> to map the output of GRU to the endpoint coordinates <inline-formula><mml:math id="M5"><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> of the pedestrian, as shown in <xref ref-type="disp-formula" rid="E3">Equation 3</xref>. The goal prediction is trained with <italic>Loss</italic><sub><italic>des</italic></sub>, which is the distance between the real and the predicted goal, as shown in <xref ref-type="disp-formula" rid="E4">Equation 4</xref>. Generation of the target intention vector <italic>g</italic><sup><italic>t</italic></sup> from <italic>h</italic><sup><italic>t</italic></sup> will be introduced in detail in Section 3.4.2.</p>
<disp-formula id="E1"><label>(1)</label><mml:math id="M6"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mo>&#x00394;</mml:mo><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mo>&#x00394;</mml:mo><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>a</mml:mi><mml:mi>x</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>a</mml:mi><mml:mi>y</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E2"><label>(2)</label><mml:math id="M7"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:mi>G</mml:mi><mml:mi>R</mml:mi><mml:mi>U</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>i</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>G</mml:mi><mml:mi>R</mml:mi><mml:mi>U</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E3"><label>(3)</label><mml:math id="M8"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mi>o</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi><mml:mi>o</mml:mi><mml:mi>a</mml:mi><mml:mi>l</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E4"><label>(4)</label><mml:math id="M9"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mrow><mml:mi>g</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>3.4 Dynamic pedestrian target prediction</title>
<sec>
<title>3.4.1 Multi-precision pedestrian intention analysis sub-network</title>
<p>In the model, the output <italic>h</italic><sup><italic>t</italic></sup> is used to predict the pedestrian target coordinates at each time step; however, using the mean square error as the loss cannot guarantee complete convergence. In order to model the pedestrian&#x00027;s target intention and achieve a better convergence effect, we design a pedestrian dynamic intent prediction sub-network to update the pedestrian&#x00027;s intent dynamically.</p>
<p>The model input of the sub-network consists of three parts: the fine-precision coordinate <italic>p</italic><sub><italic>f</italic></sub>, the coarse-precision coordinate <italic>p</italic><sub><italic>c</italic></sub>, and the scene information <italic>s</italic>. As in <xref ref-type="disp-formula" rid="E1">Equation 1</xref>, a multi-layer perceptron encodes the fine-precision coordinate <italic>p</italic><sub><italic>f</italic></sub> and the coarse-precision coordinate <italic>p</italic><sub><italic>c</italic></sub> to obtain the embeddings <italic>e</italic><sub><italic>f</italic></sub> and <italic>e</italic><sub><italic>c</italic></sub>, respectively. As shown in <xref ref-type="disp-formula" rid="E5">Equation 5</xref>, the model uses a convolutional neural network (CNN) to encode the local scene information <italic>s</italic><sup><italic>t</italic></sup> to obtain <inline-formula><mml:math id="M10"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>.</p>
<disp-formula id="E5"><label>(5)</label><mml:math id="M11"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>C</mml:mi><mml:mi>N</mml:mi><mml:mi>N</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi><mml:mi>n</mml:mi><mml:mi>n</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>In order to model the time series features and fuse them with the modeling information of the main network, we also use a GRU to model the sequence of the three kinds of information input to the sub-network. As shown in <xref ref-type="disp-formula" rid="E6">Equation 6</xref>, the input of the GRU model of the sub-network contains three kinds of information, <inline-formula><mml:math id="M12"><mml:msubsup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, <inline-formula><mml:math id="M13"><mml:msubsup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> and <inline-formula><mml:math id="M14"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>; the output <inline-formula><mml:math id="M15"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> is the intent embedding predicted by the sub-network at time <italic>t</italic>, and <italic>W</italic><sub><italic>GRUsub</italic></sub> denotes the trainable parameters.</p>
<disp-formula id="E6"><label>(6)</label><mml:math id="M16"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mi>G</mml:mi><mml:mi>R</mml:mi><mml:mi>U</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:munder></mml:mstyle><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>-</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mstyle displaystyle="true"><mml:munder class="msub"><mml:mrow><mml:mi>G</mml:mi><mml:mi>R</mml:mi><mml:mi>U</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:munder></mml:mstyle></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
</sec>
<sec>
<title>3.4.2 Multi-precision pedestrian intention analysis sub-tasks</title>
<p>To extract the pedestrian intention feature, in addition to predicting the fine-precision coordinates of the target, DTDNet proposes two additional sub-tasks to model the pedestrian intent information, namely predicting the coarse-precision endpoint region and scoring the pedestrian intent destination region.</p>
<p>The first sub-task is shown in <xref ref-type="disp-formula" rid="E7">Equation 7</xref>. The model uses the MLP <italic>f</italic><sub><italic>f</italic></sub> to map the pedestrian motion intention embedding <inline-formula><mml:math id="M17"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> to predict the fine-precision coordinates of the pedestrian intention, where <italic>W</italic><sub><italic>f</italic></sub> are trainable parameters.</p>
<disp-formula id="E7"><label>(7)</label><mml:math id="M18"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>f</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The second sub-task is shown in <xref ref-type="disp-formula" rid="E8">Equation 8</xref>. The model uses the MLP <italic>f</italic><sub><italic>c</italic></sub> to map the pedestrian motion intention vector <inline-formula><mml:math id="M19"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> to predict the coarse-precision coordinates of the pedestrian&#x00027;s endpoint, where <italic>W</italic><sub><italic>c</italic></sub> are the model update parameters.</p>
<disp-formula id="E8"><label>(8)</label><mml:math id="M20"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>c</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>The third sub-task is to estimate the likelihood of all sub-regions. First, the model uses the MLP <italic>f</italic><sub><italic>score</italic></sub> to map <inline-formula><mml:math id="M21"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, where <italic>W</italic><sub><italic>score</italic></sub> are the model update parameters. It then uses the Softmax function to score the <italic>R</italic> &#x0003D; <italic>m</italic>&#x000D7;<italic>n</italic> sub-regions in the scene, as shown in <xref ref-type="disp-formula" rid="E9">Equation 9</xref>. Because there is only one ground truth region, we set the score of the true region to 1 and the scores of other regions to 0.</p>
<disp-formula id="E9"><label>(9)</label><mml:math id="M22"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>As introduced above, the loss function of the sub-network consists of three parts, as shown in <xref ref-type="disp-formula" rid="E10">Equation 10</xref>, where <inline-formula><mml:math id="M23"><mml:mover accent="true"><mml:mrow><mml:mi>p</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:math></inline-formula> is the endpoint coordinate predicted by the model, <italic>p</italic> is the actual endpoint coordinate, <italic>score</italic> is the region scoring result, <italic>label</italic> is the actual region scoring label, and <italic>L</italic><sub><italic>CE</italic></sub> is the cross-entropy function.</p>
<disp-formula id="E10"><label>(10)</label><mml:math id="M24"><mml:mrow><mml:mtable columnalign='right'><mml:mtr columnalign='right'><mml:mtd columnalign='right'><mml:mrow><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>R</mml:mi><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>p</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>f</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>f</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow><mml:mo>+</mml:mo><mml:mi>R</mml:mi><mml:mi>M</mml:mi><mml:mi>S</mml:mi><mml:mi>E</mml:mi><mml:mrow><mml:mo>(</mml:mo><mml:mrow><mml:msub><mml:mover accent='true'><mml:mi>p</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>c</mml:mi></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>c</mml:mi></mml:msub></mml:mrow><mml:mo>)</mml:mo></mml:mrow></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='right'><mml:mtd columnalign='right'><mml:mrow><mml:mo>+</mml:mo><mml:msub><mml:mi>L</mml:mi><mml:mrow><mml:mi>C</mml:mi><mml:mi>E</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mo>,</mml:mo><mml:mi>l</mml:mi><mml:mi>a</mml:mi><mml:mi>b</mml:mi><mml:mi>e</mml:mi><mml:mi>l</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></disp-formula>
<p>However, since the current sub-network and the main network are decoupled, the main network cannot use the sub-network&#x00027;s loss function to assist in the model update. In order to use the back-propagation of the model to update the two networks synchronously, we design two network fusion schemes to couple the two parts of the network.</p>
<p>The first method is to fuse the motion state of the main network with the important scene information selected by the sub-network. The sub-network of the model scores the importance of <italic>m</italic>&#x000D7;<italic>n</italic> sub-regions at each moment and selects the Top K with the highest scores. The target sub-region is used as the key region, and the CNN shown in <xref ref-type="disp-formula" rid="E5">Equation 5</xref> encodes the selected K regions, respectively.</p>
<disp-formula id="E11"><label>(11)</label><mml:math id="M25"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mstyle displaystyle="true"><mml:munderover accentunder="false" accent="false"><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>j</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:munderover></mml:mstyle><mml:mi>s</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>r</mml:mi><mml:msub><mml:mrow><mml:mi>e</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000D7;</mml:mo><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>j</mml:mi></mml:mrow></mml:msubsup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>After encoding the K regions, the model uses <xref ref-type="disp-formula" rid="E11">Equation 11</xref> to fuse the K pieces of scene information to obtain the crucial regional information that pedestrians need to consider. Finally, the multi-head attention mechanism and a residual connection are used to combine the two networks to get the target intention vector <italic>g</italic><sup><italic>t</italic></sup>.</p>
<disp-formula id="E12"><label>(12)</label><mml:math id="M26"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>S</mml:mi><mml:mi>o</mml:mi><mml:mi>f</mml:mi><mml:mi>t</mml:mi><mml:mi>m</mml:mi><mml:mi>a</mml:mi><mml:mi>x</mml:mi><mml:mrow><mml:mo stretchy="true">(</mml:mo><mml:mrow><mml:mfrac><mml:mrow><mml:mo>&#x0003C;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>Q</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>K</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>&#x0003E;</mml:mo></mml:mrow><mml:mrow><mml:msqrt><mml:mrow><mml:mi>D</mml:mi></mml:mrow></mml:msqrt></mml:mrow></mml:mfrac></mml:mrow><mml:mo stretchy="true">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E13"><label>(13)</label><mml:math id="M27"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mi>g</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mo>&#x02211;</mml:mo></mml:mrow><mml:mrow><mml:mi>r</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mrow><mml:mo>{</mml:mo><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>.</mml:mo><mml:mo>,</mml:mo><mml:mi>p</mml:mi></mml:mrow><mml:mo>}</mml:mo></mml:mrow></mml:mrow></mml:msub><mml:msub><mml:mrow><mml:mi>s</mml:mi></mml:mrow><mml:mrow><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mo>&#x000B7;</mml:mo><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>V</mml:mi></mml:mrow></mml:msub><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mo>,</mml:mo><mml:mi>r</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow><mml:mo>&#x0002B;</mml:mo><mml:msup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Where &#x0003C; &#x000B7;, &#x000B7;&#x0003E; is the inner product operator, <italic>r</italic>&#x02208;{1, &#x02026;, <italic>p</italic>} indexes the attention heads, <italic>W</italic><sub><italic>Q</italic></sub>, <italic>W</italic><sub><italic>K</italic></sub> and <italic>W</italic><sub><italic>V</italic></sub> are trainable parameters, <italic>h</italic><sup><italic>t</italic></sup> is the output of the motion encoding network GRU at time step <italic>t</italic>, <italic>D</italic> is the embedding dimension of <italic>h</italic><sup><italic>t</italic></sup>, <italic>p</italic> is the number of heads in the multi-head attention mechanism, <italic>s</italic><sub><italic>r</italic></sub> is the attention score, and <italic>g</italic><sup><italic>t</italic></sup> is the target intent embedding.</p>
<p>The fusion method introduced in <xref ref-type="fig" rid="F6">Algorithm 1</xref> directly combines K important scene information, which may introduce excessively artificially set rule information. It is difficult to determine the optimal value of parameter K. Therefore, we attempt to directly fuse the output <inline-formula><mml:math id="M28"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula> of the sub-network with the GRU output <italic>h</italic><sup><italic>t</italic></sup> of the main network using the attention mechanism introduced in <xref ref-type="disp-formula" rid="E12">Equations 12</xref>, <xref ref-type="disp-formula" rid="E13">13</xref>.</p>
</sec>
</sec>
<sec>
<title>3.5 Trajectory decoding sub-network</title>
<p>This sub-network utilizes a CVAE-based framework to generate multi-modal trajectories. The CVAE framework is composed of an encoding module and a decoding module. The encoding network is further divided into a recognition distribution network <italic>q</italic><sub>&#x003C8;</sub>(<italic>z</italic>|<bold>P<sub>h</sub></bold>, <bold>P<sub>f</sub></bold>) and a prior distribution network <italic>p</italic><sub>&#x003B8;</sub>(<italic>z</italic>|<bold>P<sub>h</sub></bold>), where the future ground truth trajectory is given as <inline-formula><mml:math id="M29"><mml:mrow><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>P</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>f</mml:mi></mml:mstyle></mml:msub><mml:mo>=</mml:mo><mml:mo>&#x0007B;</mml:mo><mml:msup><mml:mi>P</mml:mi><mml:mi>t</mml:mi></mml:msup><mml:mo>,</mml:mo><mml:mi>t</mml:mi><mml:mo>&#x02208;</mml:mo><mml:mo stretchy='false'>[</mml:mo><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub><mml:mo stretchy='false'>]</mml:mo><mml:mo>&#x0007D;</mml:mo></mml:mrow></mml:math></inline-formula>.</p>
<p>As shown in <xref ref-type="disp-formula" rid="E14">Equation 14</xref>, the model encodes the pedestrian historical and future motion features, generates the mean &#x003BC; and variance &#x003C3; of the corresponding Gaussian distribution, and samples the high-dimensional latent variable <italic>z</italic> from the Gaussian distribution <italic>N</italic>(&#x003BC;, &#x003C3;). It then combines the sampled high-dimensional latent variable <italic>z</italic> with the GRU output <italic>h</italic><sup><italic>t</italic></sup> to obtain the hidden state <inline-formula><mml:math id="M30"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup></mml:math></inline-formula>, and iterates the hidden state at each time step, as shown in <xref ref-type="disp-formula" rid="E15">Equations 15</xref>, <xref ref-type="disp-formula" rid="E16">16</xref>. Finally, it uses the decoding module (<xref ref-type="disp-formula" rid="E17">Equation 17</xref>) to predict the complete future trajectory.</p>
<disp-formula id="E14"><label>(14)</label><mml:math id="M31"><mml:mrow><mml:mi>&#x003BC;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>&#x003C8;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>z</mml:mi><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>P</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>h</mml:mi></mml:mstyle></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>P</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>f</mml:mi></mml:mstyle></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:mi>z</mml:mi><mml:mo>~</mml:mo><mml:mi>N</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:mi>&#x003BC;</mml:mi><mml:mo>,</mml:mo><mml:mi>&#x003C3;</mml:mi><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E15"><label>(15)</label><mml:math id="M32"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>l</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msup><mml:mo>&#x02295;</mml:mo><mml:mi>z</mml:mi><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>m</mml:mi><mml:mi>l</mml:mi><mml:mi>p</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E16"><label>(16)</label><mml:math id="M33"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>=</mml:mo><mml:mi>D</mml:mi><mml:mo>-</mml:mo><mml:mi>G</mml:mi><mml:mi>R</mml:mi><mml:mi>U</mml:mi><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msubsup><mml:mo>,</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi><mml:mi>d</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<disp-formula id="E17"><label>(17)</label><mml:math id="M34"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>,</mml:mo><mml:msup><mml:mrow><mml:mi>&#x00177;</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msup><mml:mo>=</mml:mo><mml:msub><mml:mrow><mml:mi>f</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:msub><mml:mrow><mml:mo stretchy="false">(</mml:mo><mml:mrow><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:mi>t</mml:mi><mml:mo>&#x0002B;</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msubsup><mml:mo>;</mml:mo><mml:msub><mml:mrow><mml:mi>W</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi><mml:mi>o</mml:mi><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>r</mml:mi></mml:mrow></mml:msub></mml:mrow><mml:mo stretchy="false">)</mml:mo></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<p>Where <italic>q</italic><sub>&#x003C8;</sub>, <italic>f</italic><sub><italic>mlp</italic></sub>, <italic>f</italic><sub><italic>pred</italic></sub>, <italic>f</italic><sub><italic>decoder</italic></sub> are implemented as MLPs, and &#x02295; represents the concatenation operation. <inline-formula><mml:math id="M35"><mml:msubsup><mml:mrow><mml:mi>h</mml:mi></mml:mrow><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>c</mml:mi></mml:mrow><mml:mrow><mml:msub><mml:mrow><mml:mi>T</mml:mi></mml:mrow><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:msubsup></mml:math></inline-formula> represents the initial embedding of the decoder GRU (D-GRU), <italic>h</italic><sup><italic>T</italic><sub><italic>obs</italic></sub></sup> is the motion information of the pedestrian at time <italic>T</italic><sub><italic>obs</italic></sub>, and <italic>z</italic> represents the latent variable generated by the CVAE framework; <inline-formula><mml:math id="M36"><mml:msup><mml:mrow><mml:mover accent="true"><mml:mrow><mml:mi>x</mml:mi></mml:mrow><mml:mo>^</mml:mo></mml:mover></mml:mrow><mml:mrow><mml:mi>t</mml:mi></mml:mrow></mml:msup></mml:math></inline-formula>, &#x00177;<sup><italic>t</italic></sup> represent the pedestrian position predicted by the model at time step <italic>t</italic>.</p>
<p>In the testing phase, the latent variable <italic>z</italic> is directly sampled from <italic>p</italic><sub>&#x003B8;</sub>(<italic>z</italic>|<bold>P<sub>h</sub></bold>), and the recognition distribution is not calculated. We use the <italic>KL</italic> divergence to make sure that the prior distribution matches the recognition distribution in the training stage, as shown in <xref ref-type="disp-formula" rid="E18">Equation 18</xref>. Finally, the model is trained end-to-end with the loss <italic>Loss</italic><sub><italic>variety</italic></sub>, which is composed of the KL-divergence, the sub-tasks loss, the goal prediction loss, and the distance between the best prediction and the future trajectory, as shown in <xref ref-type="disp-formula" rid="E19">Equation 19</xref>.</p>
<disp-formula id="E18"><label>(18)</label><mml:math id="M37"><mml:mrow><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mi>D</mml:mi><mml:mo stretchy='false'>(</mml:mo><mml:msub><mml:mi>q</mml:mi><mml:mi>&#x003C8;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>z</mml:mi><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>P</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>h</mml:mi></mml:mstyle></mml:msub><mml:mo>,</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>P</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>f</mml:mi></mml:mstyle></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo>,</mml:mo><mml:msub><mml:mi>p</mml:mi><mml:mi>&#x003B8;</mml:mi></mml:msub><mml:mo stretchy='false'>(</mml:mo><mml:mi>z</mml:mi><mml:mo>&#x0007C;</mml:mo><mml:msub><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>P</mml:mi></mml:mstyle><mml:mstyle mathvariant='bold' mathsize='normal'><mml:mi>h</mml:mi></mml:mstyle></mml:msub><mml:mo stretchy='false'>)</mml:mo><mml:mo stretchy='false'>)</mml:mo></mml:mrow></mml:math></disp-formula>
<disp-formula id="E19"><label>(19)</label><mml:math id="M38"><mml:mrow><mml:mtable columnalign='right'><mml:mtr columnalign='right'><mml:mtd columnalign='right'><mml:mrow><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>v</mml:mi><mml:mi>a</mml:mi><mml:mi>r</mml:mi><mml:mi>i</mml:mi><mml:mi>e</mml:mi><mml:mi>t</mml:mi><mml:mi>y</mml:mi></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:munder><mml:mrow><mml:mi>m</mml:mi><mml:mi>i</mml:mi><mml:mi>n</mml:mi></mml:mrow><mml:mi>k</mml:mi></mml:munder><mml:mstyle displaystyle='true'><mml:munderover><mml:mo>&#x02211;</mml:mo><mml:mrow><mml:mi>t</mml:mi><mml:mo>=</mml:mo><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>o</mml:mi><mml:mi>b</mml:mi><mml:mi>s</mml:mi><mml:mo>+</mml:mo><mml:mn>1</mml:mn></mml:mrow></mml:msub></mml:mrow><mml:mrow><mml:msub><mml:mi>T</mml:mi><mml:mrow><mml:mi>p</mml:mi><mml:mi>r</mml:mi><mml:mi>e</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:munderover><mml:mrow><mml:msub><mml:mrow><mml:mrow><mml:mo>&#x02016;</mml:mo><mml:mrow><mml:msubsup><mml:mover accent='true'><mml:mi>p</mml:mi><mml:mo>&#x0005E;</mml:mo></mml:mover><mml:mi>k</mml:mi><mml:mi>t</mml:mi></mml:msubsup><mml:mo>&#x02212;</mml:mo><mml:msup><mml:mi>p</mml:mi><mml:mi>t</mml:mi></mml:msup></mml:mrow><mml:mo>&#x02016;</mml:mo></mml:mrow></mml:mrow><mml:mn>1</mml:mn></mml:msub></mml:mrow></mml:mstyle><mml:mo>+</mml:mo><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>d</mml:mi><mml:mi>e</mml:mi><mml:mi>s</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mtd></mml:mtr><mml:mtr columnalign='right'><mml:mtd 
columnalign='right'><mml:mrow><mml:mo>+</mml:mo><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>K</mml:mi><mml:mi>L</mml:mi><mml:mi>D</mml:mi></mml:mrow></mml:msub><mml:mo>+</mml:mo><mml:mi>L</mml:mi><mml:mi>o</mml:mi><mml:mi>s</mml:mi><mml:msub><mml:mi>s</mml:mi><mml:mrow><mml:mi>s</mml:mi><mml:mi>u</mml:mi><mml:mi>b</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mtd></mml:mtr></mml:mtable></mml:mrow></mml:math></disp-formula>
</sec>
</sec>
<sec id="s4">
<title>4 Experiments and results</title>
<p>Datasets: We evaluate the performance of our model and report results on two real-world public datasets: ETH-UCY Dataset (Pellegrini et al., <xref ref-type="bibr" rid="B18">2009</xref>; Dendorfer et al., <xref ref-type="bibr" rid="B4">2021</xref>) and Stanford Drone Dataset (Shi et al., <xref ref-type="bibr" rid="B22">2021</xref>). <bold>ETH-UCY</bold> contains five subsets: ETH, HOTEL, UNIV, ZARA1, ZARA2. It contains 1,536 pedestrians and introduces interactions like group interactions, collision avoidance. We follow the experimental settings in Trajectron&#x0002B;&#x0002B; (Yu et al., <xref ref-type="bibr" rid="B28">2020</xref>), which convert the data to the world coordinate system and split them into 8 s segments (20 time steps). We use historical 3.2 s (eight time steps) to predict the future 4.8 s (12 time steps). <bold>Stanford Drone Dataset</bold> contains 20 scenes. We use the data released by NMMP (Tao et al., <xref ref-type="bibr" rid="B24">2020</xref>), whose coordinates of trajectories are provided in pixels, and the experimental settings are the same as ETH-UCY. For the ETH-UCY and Stanford Drone Dataset, we use the leave-one-out evaluation strategy to test different models.</p>
<p>Implementation details: We train our models with Adam optimizer, batch size 64, learning rate 0.0001 on a single NVIDIA Tesla T4 GPU. In coarse-precision modeling, we adopt different partitioning strategies. We divide ETH-UCY into 5 &#x000D7; 5 regions, and Stanford Drone Dataset into 9 &#x000D7; 9 regions. The resolution of scene information for each sub-region is 9 &#x000D7; 9. MLP and GRU hidden layer dimension are set to 256. The dimension of latent variable <italic>z</italic> is 64, which is sampled from a CVAE framework generated distribution. The hyper-parameter of variety loss weight is set to 20.</p>
<sec>
<title>4.1 Quantitative evaluation</title>
<p>We compare our method with eight state-of-the-art methods: PMP-NMMP, Social-STGCNN, STAR, PECNet, Trajectron&#x0002B;&#x0002B;, MG-GAN, SGCN, and Agentformer. The results, evaluated with the ADE and FDE metrics, are shown in <xref ref-type="table" rid="T1">Table 1</xref>. The results indicate that our method significantly outperforms all the competing methods on the ETH and UCY datasets. Our method outperforms Agentformer (Yuan et al., <xref ref-type="bibr" rid="B29">2021</xref>) by 17.4% on the ADE metric, and on the FDE metric, our method outperforms Agentformer by 7.7%.</p>
<table-wrap position="float" id="T1">
<label>Table 1</label>
<caption><p>Quantitative results of all the previous state-of-the-art methods and our model on ETH-UCY.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>ETH</bold></th>
<th valign="top" align="center"><bold>HOTEL</bold></th>
<th valign="top" align="center"><bold>UNIV</bold></th>
<th valign="top" align="center"><bold>ZARA1</bold></th>
<th valign="top" align="center"><bold>ZARA2</bold></th>
<th valign="top" align="center"><bold>AVG</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">PMP-NMMP (Tao et al., <xref ref-type="bibr" rid="B24">2020</xref>)</td>
<td valign="top" align="center">0.61/1.08</td>
<td valign="top" align="center">0.33/0.63</td>
<td valign="top" align="center">0.52/1.11</td>
<td valign="top" align="center">0.32/0.66</td>
<td valign="top" align="center">0.29/0.61</td>
<td valign="top" align="center">0.41/0.82</td>
</tr>
<tr>
<td valign="top" align="left">Social-STGCNN (Hu et al., <xref ref-type="bibr" rid="B8">2020</xref>)</td>
<td valign="top" align="center">0.64/1.11</td>
<td valign="top" align="center">0.49/0.85</td>
<td valign="top" align="center">0.44/0.79</td>
<td valign="top" align="center">0.34/0.48</td>
<td valign="top" align="center">0.30/0.48</td>
<td valign="top" align="center">0.44/0.75</td>
</tr>
<tr>
<td valign="top" align="left"><italic>STAR</italic> (Yuan et al., <xref ref-type="bibr" rid="B29">2021</xref>)</td>
<td valign="top" align="center"><bold>0.36/0.65</bold></td>
<td valign="top" align="center">0.17/0.36</td>
<td valign="top" align="center">0.31/0.62</td>
<td valign="top" align="center">0.26/0.55</td>
<td valign="top" align="center">0.22/0.46</td>
<td valign="top" align="center">0.26/0.53</td>
</tr>
<tr>
<td valign="top" align="left"><italic>PECNet</italic> (Mangalam et al., <xref ref-type="bibr" rid="B14">2020</xref>)</td>
<td valign="top" align="center">0.54/0.87</td>
<td valign="top" align="center">0.18/0.24</td>
<td valign="top" align="center">0.35/0.60</td>
<td valign="top" align="center">0.22/0.39</td>
<td valign="top" align="center">0.17/0.30</td>
<td valign="top" align="center">0.29/0.48</td>
</tr>
<tr>
<td valign="top" align="left">Trajectron&#x0002B;&#x0002B; (Yu et al., <xref ref-type="bibr" rid="B28">2020</xref>)</td>
<td valign="top" align="center">0.43/0.86</td>
<td valign="top" align="center"><bold>0.12/0.19</bold></td>
<td valign="top" align="center">0.22/<bold>0.43</bold></td>
<td valign="top" align="center">0.17/0.32</td>
<td valign="top" align="center">0.12/0.25</td>
<td valign="top" align="center">0.21/0.41</td>
</tr>
<tr>
<td valign="top" align="left">MG-GAN (Dendorfer et al., <xref ref-type="bibr" rid="B4">2021</xref>)</td>
<td valign="top" align="center">0.47/0.91</td>
<td valign="top" align="center">0.14/0.24</td>
<td valign="top" align="center">0.54/1.07</td>
<td valign="top" align="center">0.36/0.73</td>
<td valign="top" align="center">0.29/0.60</td>
<td valign="top" align="center">0.36/0.71</td>
</tr>
<tr>
<td valign="top" align="left"><italic>SGCN</italic> (Shi et al., <xref ref-type="bibr" rid="B22">2021</xref>)</td>
<td valign="top" align="center">0.63/1.03</td>
<td valign="top" align="center">0.32/0.55</td>
<td valign="top" align="center">0.37/0.70</td>
<td valign="top" align="center">0.29/0.53</td>
<td valign="top" align="center">0.25/0.45</td>
<td valign="top" align="center">0.37/0.65</td>
</tr>
<tr>
<td valign="top" align="left"><italic>Agentformer</italic> (Yuan et al., <xref ref-type="bibr" rid="B29">2021</xref>)</td>
<td valign="top" align="center">0.45/0.75</td>
<td valign="top" align="center">0.14/0.22</td>
<td valign="top" align="center">0.25/0.45</td>
<td valign="top" align="center">0.18/0.30</td>
<td valign="top" align="center">0.14/0.24</td>
<td valign="top" align="center">0.23/0.39</td>
</tr>
<tr>
<td valign="top" align="left">DTDNet (No sub-tasks)</td>
<td valign="top" align="center">0.38/0.69</td>
<td valign="top" align="center">0.13/0.24</td>
<td valign="top" align="center">0.23/0.47</td>
<td valign="top" align="center">0.13/0.27</td>
<td valign="top" align="center">0.12/0.24</td>
<td valign="top" align="center">0.20/0.38</td>
</tr>
<tr>
<td valign="top" align="left">DTDNet (Ours)</td>
<td valign="top" align="center">0.37/0.67</td>
<td valign="top" align="center">0.13/0.23</td>
<td valign="top" align="center"><bold>0.21</bold>/0.44</td>
<td valign="top" align="center"><bold>0.13/0.26</bold></td>
<td valign="top" align="center"><bold>0.12/0.23</bold></td>
<td valign="top" align="center"><bold>0.19/0.36</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>We calculate the metrics for <italic>T</italic><sub><italic>obs</italic></sub> = 8 (3.2 s) and <italic>T</italic><sub><italic>pre</italic></sub> = 12 (4.8 s) (best of 20 samples). The bold value indicates the best result.</p>
</table-wrap-foot>
</table-wrap>
<p>To compare the results of deterministic sampling, we compare our method with three previous models, namely STGAT, STAR, and Trajectron&#x0002B;&#x0002B;. The experimental results are shown in <xref ref-type="table" rid="T2">Table 2</xref>. Although our method is comparable to Trajectron&#x0002B;&#x0002B; in the ADE metric, it is superior to Trajectron&#x0002B;&#x0002B; by 12.6% in FDE, which shows that the intent prediction module plays a role and that pedestrians&#x00027; intent coordinates can be predicted more accurately.</p>
<table-wrap position="float" id="T2">
<label>Table 2</label>
<caption><p>Quantitative results of all the previous state-of-the-art methods and our model on ETH-UCY.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>ETH</bold></th>
<th valign="top" align="center"><bold>HOTEL</bold></th>
<th valign="top" align="center"><bold>UNIV</bold></th>
<th valign="top" align="center"><bold>ZARA1</bold></th>
<th valign="top" align="center"><bold>ZARA2</bold></th>
<th valign="top" align="center"><bold>AVG</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><italic>STGAT</italic> (Mohamed et al., <xref ref-type="bibr" rid="B15">2020</xref>)</td>
<td valign="top" align="center">0.88/1.66</td>
<td valign="top" align="center">0.56/1.15</td>
<td valign="top" align="center">0.51/1.13</td>
<td valign="top" align="center">0.41/0.91</td>
<td valign="top" align="center">0.31/0.68</td>
<td valign="top" align="center">0.51/1.11</td>
</tr>
<tr>
<td valign="top" align="left"><italic>STAR</italic> (Yuan et al., <xref ref-type="bibr" rid="B29">2021</xref>)</td>
<td valign="top" align="center"><bold>0.56/1.11</bold></td>
<td valign="top" align="center">0.26/0.50</td>
<td valign="top" align="center">0.52/1.15</td>
<td valign="top" align="center">0.41/0.90</td>
<td valign="top" align="center">0.31/0.71</td>
<td valign="top" align="center">0.41/0.87</td>
</tr>
<tr>
<td valign="top" align="left">Trajectron&#x0002B;&#x0002B; (Yu et al., <xref ref-type="bibr" rid="B28">2020</xref>)</td>
<td valign="top" align="center">0.71/1.68</td>
<td valign="top" align="center"><bold>0.22/0.46</bold></td>
<td valign="top" align="center"><bold>0.41</bold>/1.07</td>
<td valign="top" align="center">0.30/0.77</td>
<td valign="top" align="center"><bold>0.23</bold>/0.59</td>
<td valign="top" align="center">0.37/0.95</td>
</tr>
<tr>
<td valign="top" align="left">DTDNet (Ours)</td>
<td valign="top" align="center">0.63/1.42</td>
<td valign="top" align="center">0.25/0.51</td>
<td valign="top" align="center">0.43/<bold>1.01</bold></td>
<td valign="top" align="center"><bold>0.26/0.63</bold></td>
<td valign="top" align="center">0.24/<bold>0.57</bold></td>
<td valign="top" align="center"><bold>0.36/0.83</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>We calculate the metrics for <italic>T</italic><sub><italic>obs</italic></sub> = 8 (3.2 s) and <italic>T</italic><sub><italic>pre</italic></sub> = 12 (4.8 s) (one sample). The bold value indicates the best result.</p>
</table-wrap-foot>
</table-wrap>
<p><xref ref-type="table" rid="T3">Table 3</xref> shows the experimental results on the Stanford Drone Dataset. The scenes of the Stanford Drone Dataset are rich and varied, and our model performs better than all previous works on this dataset. We outperform the best-performing Trajectron&#x0002B;&#x0002B; model on the ADE metric by 7.1%, and on the FDE metric, our method outperforms the PECNet model by 3.1%. This indicates that our model transfers better across different scenes.</p>
<table-wrap position="float" id="T3">
<label>Table 3</label>
<caption><p>Quantitative comparison on Stanford Drone Dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>ADE</bold></th>
<th valign="top" align="center"><bold>FDE</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">Sophie (Vemula et al., <xref ref-type="bibr" rid="B25">2018</xref>)</td>
<td valign="top" align="center">16.3</td>
<td valign="top" align="center">29.4</td>
</tr>
<tr>
<td valign="top" align="left">PMP-NMMP (Tao et al., <xref ref-type="bibr" rid="B24">2020</xref>)</td>
<td valign="top" align="center">14.7</td>
<td valign="top" align="center">26.7</td>
</tr>
<tr>
<td valign="top" align="left">STGAT (Mohamed et al., <xref ref-type="bibr" rid="B15">2020</xref>)</td>
<td valign="top" align="center">14.2</td>
<td valign="top" align="center">26.7</td>
</tr>
<tr>
<td valign="top" align="left">MG-GAN (Dendorfer et al., <xref ref-type="bibr" rid="B4">2021</xref>)</td>
<td valign="top" align="center">13.6</td>
<td valign="top" align="center">25.8</td>
</tr>
<tr>
<td valign="top" align="left">Trajectron&#x0002B;&#x0002B; (Yu et al., <xref ref-type="bibr" rid="B28">2020</xref>)</td>
<td valign="top" align="center">9.9</td>
<td valign="top" align="center">16.8</td>
</tr>
<tr>
<td valign="top" align="left"><italic>PECNet</italic> (Mangalam et al., <xref ref-type="bibr" rid="B14">2020</xref>)</td>
<td valign="top" align="center">10.0</td>
<td valign="top" align="center">15.9</td>
</tr>
<tr>
<td valign="top" align="left">DTDNet (Ours)</td>
<td valign="top" align="center"><bold>9.2</bold></td>
<td valign="top" align="center"><bold>15.4</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Given previous 3.2 s, predicting future 4.8 s. ADE/FDE is reported in pixels (20 samples). The bold value indicates the best result.</p>
</table-wrap-foot>
</table-wrap>
</sec>
<sec>
<title>4.2 Ablation study</title>
<p>To verify the role of the auxiliary loss function in the sub-tasks, we designed an ablation experiment on the ETH-UCY dataset, reported in the last two rows of <xref ref-type="table" rid="T1">Table 1</xref>. The ablation model still retains local scene information and coarse-precision coordinates but does not add the loss function for auxiliary sub-task updates. Compared with the ablation model, the whole model improves the ADE and FDE metrics by 5.0 and 5.6%, respectively.</p>
<p>To evaluate the promotion effect of the three sub-tasks on pedestrian intent prediction, as shown in <xref ref-type="table" rid="T4">Table 4</xref>, we designed four ablation models on the SDD dataset for comparative experiments: (1) replacing the CVAE module with Gaussian noise sampling, (2) without the sub-task of scene scoring, (3) without the coarse-precision prediction sub-task, (4) without the fine-precision prediction sub-task. The results show that the fine-precision prediction task is still the most effective sub-task, affecting the trajectory prediction results most significantly. The coarse-precision prediction and scene scoring tasks also improve the trajectory prediction results. Our model does not take any pedestrian interaction information into consideration, which shows that using only pedestrian motion features and scene information can achieve state-of-the-art results.</p>
<table-wrap position="float" id="T4">
<label>Table 4</label>
<caption><p>Ablation study of DTDNet structure on Stanford Drone Dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Method</bold></th>
<th valign="top" align="center"><bold>ADE</bold></th>
<th valign="top" align="center"><bold>FDE</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left"><italic>No CVAE module</italic></td>
<td valign="top" align="center">9.7</td>
<td valign="top" align="center">16.4</td>
</tr>
<tr>
<td valign="top" align="left"><italic>No scene scoring module</italic></td>
<td valign="top" align="center">9.4</td>
<td valign="top" align="center">15.8</td>
</tr>
<tr>
<td valign="top" align="left"><italic>No coarse precision loss function</italic></td>
<td valign="top" align="center">9.5</td>
<td valign="top" align="center">15.9</td>
</tr>
<tr>
<td valign="top" align="left"><italic>No fine precision loss function</italic></td>
<td valign="top" align="center">9.6</td>
<td valign="top" align="center">16.1</td>
</tr>
<tr>
<td valign="top" align="left">DTDNet (Ours)</td>
<td valign="top" align="center"><bold>9.2</bold></td>
<td valign="top" align="center"><bold>15.4</bold></td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>Given previous 3.2 s, predicting future 4.8 s. ADE/FDE is reported in pixels (choose the best from 20 samples). The bold value indicates the best result.</p>
</table-wrap-foot>
</table-wrap>
<p>To evaluate the effectiveness of the sub-tasks and choose an appropriate region division accuracy, we conduct experiments in <xref ref-type="table" rid="T5">Tables 5</xref>, <xref ref-type="table" rid="T6">6</xref>. In <xref ref-type="table" rid="T5">Table 5</xref>, we conduct experiments with different coarse precision settings on the SDD dataset, and the ADE/FDE results show that the 9 &#x000D7; 9 precision division results are better than other precision settings. In <xref ref-type="table" rid="T6">Table 6</xref>, we evaluated the recall for the important region scoring sub-tasks at time step <italic>T</italic><sub>8</sub> and compared the effects of different region division accuracy and different recall numbers. <italic>TP</italic> is the number of target regions recalled by the model, <italic>P</italic> is the number of samples in the test experiment, each sample has only one target region, and <italic>P</italic><sub><italic>recall</italic></sub> is the recall rate, as shown in <xref ref-type="disp-formula" rid="E20">Equation 20</xref>.</p>
<disp-formula id="E20"><label>(20)</label><mml:math id="M39"><mml:mtable class="eqnarray" columnalign="left"><mml:mtr><mml:mtd><mml:msub><mml:mrow><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mstyle class="mbox"><mml:mtext>recall</mml:mtext></mml:mstyle></mml:mrow></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mrow><mml:mi>T</mml:mi><mml:mi>P</mml:mi></mml:mrow><mml:mrow><mml:mi>P</mml:mi></mml:mrow></mml:mfrac><mml:mo>&#x000D7;</mml:mo><mml:mn>100</mml:mn><mml:mi>%</mml:mi></mml:mtd></mml:mtr></mml:mtable></mml:math></disp-formula>
<table-wrap position="float" id="T5">
<label>Table 5</label>
<caption><p>Ablation study of different coarse precisions on Stanford Drone Dataset (ADE/FDE is reported).</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold>Precision</bold></th>
<th valign="top" align="center"><bold>ADE</bold></th>
<th valign="top" align="center"><bold>FDE</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">5 &#x000D7; 5</td>
<td valign="top" align="center">9.4</td>
<td valign="top" align="center">15.8</td>
</tr>
<tr>
<td valign="top" align="left">9 &#x000D7; 9</td>
<td valign="top" align="center"><bold>9.2</bold></td>
<td valign="top" align="center"><bold>15.4</bold></td>
</tr>
<tr>
<td valign="top" align="left">15 &#x000D7; 15</td>
<td valign="top" align="center">9.3</td>
<td valign="top" align="center">15.5</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold value indicates the best result.</p>
</table-wrap-foot>
</table-wrap>
<table-wrap position="float" id="T6">
<label>Table 6</label>
<caption><p>Relationship between recall rate <italic>P</italic> and recall number <italic>k</italic> under different precisions on Stanford Drone Dataset.</p></caption>
<table frame="box" rules="all">
<thead>
<tr style="background-color:#919498;color:#ffffff">
<th valign="top" align="left"><bold><italic>Precision</italic></bold></th>
<th valign="top" align="center"><bold>1</bold></th>
<th valign="top" align="center"><bold>2</bold></th>
<th valign="top" align="center"><bold>3</bold></th>
<th valign="top" align="center"><bold>4</bold></th>
<th valign="top" align="center"><bold>5</bold></th>
<th valign="top" align="center"><bold>6</bold></th>
</tr>
</thead>
<tbody>
<tr>
<td valign="top" align="left">5 &#x000D7; 5</td>
<td valign="top" align="center">61.8%</td>
<td valign="top" align="center">84.6%</td>
<td valign="top" align="center">91.4%</td>
<td valign="top" align="center">96.9%</td>
<td valign="top" align="center">98.3%</td>
<td valign="top" align="center">99.1%</td>
</tr>
<tr>
<td valign="top" align="left">9 &#x000D7; 9</td>
<td valign="top" align="center"><bold>68.6%</bold></td>
<td valign="top" align="center"><bold>89.3%</bold></td>
<td valign="top" align="center"><bold>95.1%</bold></td>
<td valign="top" align="center"><bold>98.3%</bold></td>
<td valign="top" align="center"><bold>99.1%</bold></td>
<td valign="top" align="center"><bold>99.6%</bold></td>
</tr>
<tr>
<td valign="top" align="left">15 &#x000D7; 15</td>
<td valign="top" align="center">67.2%</td>
<td valign="top" align="center">88.1%</td>
<td valign="top" align="center">94.2%</td>
<td valign="top" align="center">98.2%</td>
<td valign="top" align="center">99.0%</td>
<td valign="top" align="center">99.4%</td>
</tr></tbody>
</table>
<table-wrap-foot>
<p>The bold value indicates the best result.</p>
</table-wrap-foot>
</table-wrap>
<p><xref ref-type="table" rid="T6">Table 6</xref> shows that when the model recalls only the top-1 scored region, the recall rate of the target region is already more than 60%. When the recall number is 6, the recall rate of the target region is close to 100%. The regional scoring task can identify important areas and predict the target region of pedestrians with better accuracy. <xref ref-type="table" rid="T6">Table 6</xref> also shows that the recall rate of the model at the 9 &#x000D7; 9 precision is better than at the 5 &#x000D7; 5 or 15 &#x000D7; 15 precision. This result is consistent with the results in <xref ref-type="table" rid="T5">Table 5</xref>, so we set the coarse precision size to 9 &#x000D7; 9 on datasets with larger scenes.</p>
</sec>
<sec>
<title>4.3 Qualitative evaluation</title>
<sec>
<title>4.3.1 Visualization of the DTDNet and ground truth</title>
<p>We select two motion modes for display: group motion and collision-avoidance motion. In <xref ref-type="fig" rid="F3">Figures 3A</xref>, <xref ref-type="fig" rid="F3">B</xref>, multiple groups of pedestrians are moving in the same direction, and the results predicted by our model almost completely fit the actual red trajectories. In <xref ref-type="fig" rid="F3">Figures 3C</xref>, <xref ref-type="fig" rid="F3">D</xref>, the pedestrian motion trajectory avoids collision with surrounding pedestrians and obstacles. Our model predicts the pedestrian&#x00027;s turning motion intention and effectively predicts the pedestrian&#x00027;s offset angle, thereby avoiding collisions with vehicles and passing pedestrians.</p>
<fig id="F3" position="float">
<label>Figure 3</label>
<caption><p><bold>(A&#x02013;D)</bold> Qualitative analysis of DTDNet. For a better view, only part of the pedestrians in the scene is presented. The illustration scenes are selected from ZARA1. Observed trajectories are shown as solid lines, and the predicted trajectories are shown as dashed lines. The red line represents the true trajectory.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-18-1346374-g0003.tif"/>
</fig>
</sec>
<sec>
<title>4.3.2 Visualization of the trajectory distribution</title>
<p>As shown in <xref ref-type="fig" rid="F4">Figure 4</xref>, we compare our model (DTDNet) with Social-STGCNN in four different scenarios selected from the ETH, HOTEL, ZARA1 and ZARA2 datasets. The dashed line represents the observed trajectory, the solid line represents the ground truth of the prediction, and the color density is the predicted trajectory distribution. <xref ref-type="fig" rid="F4">Figure 4A</xref> shows that the future trajectories of the two pedestrians above are slightly shifted downward; the DTDNet model predicts the same trajectory distribution, but Social-STGCNN predicts that the pedestrians are still going straight. As shown in <xref ref-type="fig" rid="F4">Figure 4B</xref>, compared with Social-STGCNN, DTDNet can predict the pedestrian&#x00027;s speed and the pedestrian&#x00027;s endpoint more accurately, so it can cover the true trajectory of the pedestrian. We could even predict multiple distribution trends in cases where there may be many likely future trajectories, and our generation framework does not have a mode collapse problem like other methods. As shown in <xref ref-type="fig" rid="F4">Figure 4C</xref>, taking the green trajectory in the figure as an example, DTDNet not only predicts the movement of turning upward, but also predicts the trend of downward turning. However, the prediction effect of the model also has certain shortcomings. As shown in <xref ref-type="fig" rid="F4">Figure 4D</xref>, when pedestrians make a sudden turn in the prediction time region, existing methods cannot predict the turning trend successfully. In the future, we will try to introduce interactive information between dynamic obstacles in the predicting period to explore this problem.</p>
<fig id="F4" position="float">
<label>Figure 4</label>
<caption><p><bold>(A&#x02013;D)</bold> Qualitative analysis of DTDNet and Social-STGCNN. Upper ones are from DTDNet, lower ones are from Social-STGCNN.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-18-1346374-g0004.tif"/>
</fig>
</sec>
<sec>
<title>4.3.3 Visualization of intention prediction</title>
<p>To exhibit the dynamic prediction of pedestrian intent coordinates, we select a scene from the Stanford Drone Dataset and visualize the dynamic pedestrian intent and regional score predicted by the model in <xref ref-type="fig" rid="F5">Figure 5</xref>. The red star represents the future target endpoint, and the yellow star represents the predicted target coordinate at different time steps; the color of each sub-region represents the magnitude of the scene importance score, and the red region represents the high score. In <xref ref-type="fig" rid="F5">Figure 5</xref>, we show the results at four time steps of pedestrian movement and divide the scene into 81 sub-regions according to the precision of 9 &#x000D7; 9. The model dynamically predicts pedestrian intent coordinates and the importance score of the scene. As the pedestrian moves, the target coordinate of the yellow star predicted by the model gradually approaches the real target. The importance score of the region near the actual location gradually increases. The color of the visualization gradually turns red; for example, the region where the red star is located changes from yellow at time <italic>T</italic><sub>1</sub> in <xref ref-type="fig" rid="F5">Figure 5A</xref> to red at time <italic>T</italic><sub>8</sub> in <xref ref-type="fig" rid="F5">Figure 5D</xref>. The number of the red regions near the finish area also increases significantly.</p>
<fig id="F5" position="float">
<label>Figure 5</label>
<caption><p>Visualization of dynamic intent prediction by DTDNet. Red star is the future endpoint, and yellow star represents the predicted target. Color of each sub-region represents the scene importance score. Red is the highest, green is the middle and blue is the lowest score. <bold>(A)</bold> T1. <bold>(B)</bold> T3. <bold>(C)</bold> T5. <bold>(D)</bold> T8.</p></caption>
<graphic mimetype="image" mime-subtype="tiff" xlink:href="fnins-18-1346374-g0005.tif"/>
</fig>
</sec>
</sec>
</sec>
<sec sec-type="conclusions" id="s5">
<title>5 Conclusion</title>
<p>In this work, we propose DTDNet, a Dynamic Target Driven Network for pedestrian trajectory prediction. Different from previous models that predict a fixed endpoint, DTDNet is designed to model the intention of a pedestrian dynamically with a hidden representation. This hidden representation can jointly represent mixed information about intention. We also introduce a multi-precision data representation method and three sub-tasks to analyze pedestrians&#x00027; motion intentions from features at different precisions. The three sub-tasks are shown to help ensure that the hidden representation converges and is useful for the intention representation at each time step. Our proposed model is superior to the baseline models in quantitative metrics on two publicly available datasets. Qualitative experiments show that our model can predict pedestrian intention accurately and dynamically. In the future, research should consider the potential effects of bringing in related subtasks to help the network&#x00027;s hidden representation of the pedestrian converge better and add more supervision to the feature. Furthermore, the dynamic modeling of intentions at each timestep, along with predictions, could benefit from a more complicated network architecture that incorporates the modeling of complex interactions among moving objects within the scene to distill the information involved.</p>
</sec>
<sec sec-type="data-availability" id="s6">
<title>Data availability statement</title>
<p>Publicly available datasets were analyzed in this study. This data can be found at: <ext-link ext-link-type="uri" xlink:href="https://github.com/StanfordASL/Trajectron-plus-plus/tree/master/experiments/pedestrians/raw">https://github.com/StanfordASL/Trajectron-plus-plus/tree/master/experiments/pedestrians/raw</ext-link>.</p>
</sec>
<sec sec-type="author-contributions" id="s7">
<title>Author contributions</title>
<p>SL: Conceptualization, Methodology, Supervision, Writing &#x02013; review &#x00026; editing. JS: Conceptualization, Methodology, Software, Writing &#x02013; original draft. PY: Methodology, Validation, Visualization, Writing &#x02013; review &#x00026; editing. YZ: Methodology, Validation, Visualization, Writing &#x02013; review &#x00026; editing, Software. TM: Conceptualization, Methodology, Project administration, Supervision, Writing &#x02013; review &#x00026; editing. ZW: Project administration, Supervision, Writing &#x02013; review &#x00026; editing.</p>
</sec>
</body>
<back>
<sec sec-type="funding-information" id="s8">
<title>Funding</title>
<p>The author(s) declare financial support was received for the research, authorship, and/or publication of this article. This work was supported in part by the Major Program of National Natural Science Foundation of China under Grant 91938301, in part by the National Key Research and Development Program of China under Grant 2020YFB1710400, in part by the Youth Program of National Natural Science Foundation of China under Grant 62002345, and in part by the Innovation Program of Institute of Computing Technology Chinese Academy of Sciences under Grant E261070.</p>
</sec>
<sec sec-type="COI-statement" id="conf1">
<title>Conflict of interest</title>
<p>The authors declare that the research was conducted in the absence of any commercial or financial relationships that could be construed as a potential conflict of interest.</p>
</sec>
<sec sec-type="disclaimer" id="s9">
<title>Publisher&#x00027;s note</title>
<p>All claims expressed in this article are solely those of the authors and do not necessarily represent those of their affiliated organizations, or those of the publisher, the editors and the reviewers. Any product that may be evaluated in this article, or claim that may be made by its manufacturer, is not guaranteed or endorsed by the publisher.</p>
</sec>
<ref-list>
<title>References</title>
<ref id="B1">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Alahi</surname> <given-names>A.</given-names></name> <name><surname>Goel</surname> <given-names>K.</given-names></name> <name><surname>Ramanathan</surname> <given-names>V.</given-names></name> <name><surname>Robicquet</surname> <given-names>A.</given-names></name> <name><surname>Fei-Fei</surname> <given-names>L.</given-names></name> <name><surname>Savarese</surname> <given-names>S.</given-names></name> <etal/></person-group>. (<year>2016</year>). <article-title>&#x0201C;Social lstm: human trajectory prediction in crowded spaces,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Las Vegas, NV</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>961</fpage>&#x02013;<lpage>971</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2016.110</pub-id><pub-id pub-id-type="pmid">38381633</pub-id></citation></ref>
<ref id="B2">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Bennewitz</surname> <given-names>M.</given-names></name> <name><surname>Burgard</surname> <given-names>W.</given-names></name> <name><surname>Cielniak</surname> <given-names>G.</given-names></name> <name><surname>Thrun</surname> <given-names>S.</given-names></name></person-group> (<year>2005</year>). <article-title>Learning motion patterns of people for compliant robot motion</article-title>. <source>Int. J. Robot. Res</source>. <volume>24</volume>, <fpage>31</fpage>&#x02013;<lpage>48</lpage>. <pub-id pub-id-type="doi">10.1177/0278364904048962</pub-id></citation>
</ref>
<ref id="B3">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Chandra</surname> <given-names>R.</given-names></name> <name><surname>Guan</surname> <given-names>T.</given-names></name> <name><surname>Panuganti</surname> <given-names>S.</given-names></name> <name><surname>Mittal</surname> <given-names>T.</given-names></name> <name><surname>Bhattacharya</surname> <given-names>U.</given-names></name> <name><surname>Bera</surname> <given-names>A.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>Forecasting trajectory and behavior of road-agents using spectral clustering in graph-lstms</article-title>. <source>IEEE Robot. Autom. Lett</source>. <volume>5</volume>, <fpage>4882</fpage>&#x02013;<lpage>4890</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2020.3004794</pub-id></citation>
</ref>
<ref id="B4">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Dendorfer</surname> <given-names>P.</given-names></name> <name><surname>Elflein</surname> <given-names>S.</given-names></name> <name><surname>Leal-Taix&#x000E9;</surname> <given-names>L.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;MG-GAN: a multi-generator model preventing out-of-distribution samples in pedestrian trajectory prediction,&#x0201D;</article-title> in <source>2021 IEEE/CVF International Conference on Computer Vision (ICCV)</source> (<publisher-loc>Montreal, QC</publisher-loc>), <fpage>13138</fpage>&#x02013;<lpage>13147</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.01291</pub-id></citation>
</ref>
<ref id="B5">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Gu</surname> <given-names>J.</given-names></name> <name><surname>Sun</surname> <given-names>C.</given-names></name> <name><surname>Zhao</surname> <given-names>H.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Densetnt: end-to-end trajectory prediction from dense goal sets,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Montreal, QC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>15303</fpage>&#x02013;<lpage>15312</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.01502</pub-id></citation>
</ref>
<ref id="B6">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Gupta</surname> <given-names>A.</given-names></name> <name><surname>Johnson</surname> <given-names>J.</given-names></name> <name><surname>Fei-Fei</surname> <given-names>L.</given-names></name> <name><surname>Savarese</surname> <given-names>S.</given-names></name> <name><surname>Alahi</surname> <given-names>A.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Social Gan: socially acceptable trajectories with generative adversarial networks,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Salt Lake City, UT</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>2255</fpage>&#x02013;<lpage>2264</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00240</pub-id><pub-id pub-id-type="pmid">38400437</pub-id></citation></ref>
<ref id="B7">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Helbing</surname> <given-names>D.</given-names></name> <name><surname>Molnar</surname> <given-names>P.</given-names></name></person-group> (<year>1995</year>). <article-title>Social force model for pedestrian dynamics</article-title>. <source>Phys. Rev. E</source> <volume>51</volume>:<fpage>4282</fpage>. <pub-id pub-id-type="doi">10.1103/PhysRevE.51.4282</pub-id><pub-id pub-id-type="pmid">9963139</pub-id></citation></ref>
<ref id="B8">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Hu</surname> <given-names>Y.</given-names></name> <name><surname>Chen</surname> <given-names>S.</given-names></name> <name><surname>Zhang</surname> <given-names>Y.</given-names></name> <name><surname>Gu</surname> <given-names>X.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Collaborative motion prediction via neural motion message passing,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6319</fpage>&#x02013;<lpage>6328</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.00635</pub-id></citation>
</ref>
<ref id="B9">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Y.</given-names></name> <name><surname>Bi</surname> <given-names>H.</given-names></name> <name><surname>Li</surname> <given-names>Z.</given-names></name> <name><surname>Mao</surname> <given-names>T.</given-names></name> <name><surname>Wang</surname> <given-names>Z.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;STGAT: modeling spatial-temporal interactions for human trajectory prediction,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Seoul</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6272</fpage>&#x02013;<lpage>6281</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV.2019.00637</pub-id></citation>
</ref>
<ref id="B10">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Huang</surname> <given-names>Z.</given-names></name> <name><surname>Hasan</surname> <given-names>A.</given-names></name> <name><surname>Shin</surname> <given-names>K.</given-names></name> <name><surname>Li</surname> <given-names>R.</given-names></name> <name><surname>Driggs-Campbell</surname> <given-names>K.</given-names></name></person-group> (<year>2021</year>). <article-title>Long-term pedestrian trajectory prediction using mutable intention filter and warp lstm</article-title>. <source>IEEE Robot. Autom. Lett</source>. <volume>6</volume>, <fpage>542</fpage>&#x02013;<lpage>549</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2020.3047731</pub-id></citation>
</ref>
<ref id="B11">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Kosaraju</surname> <given-names>V.</given-names></name> <name><surname>Sadeghian</surname> <given-names>A.</given-names></name> <name><surname>Mart&#x000ED;n-Mart&#x000ED;n</surname> <given-names>R.</given-names></name> <name><surname>Reid</surname> <given-names>I.</given-names></name> <name><surname>Rezatofighi</surname> <given-names>H.</given-names></name> <name><surname>Savarese</surname> <given-names>S.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;Social-BiGAT: multimodal trajectory forecasting using bicycle-GAN and graph attention networks,&#x0201D;</article-title> in <source>Proceedings of the 33rd International Conference on Neural Information Processing Systems</source> (<publisher-loc>Red Hook, NY</publisher-loc>: <publisher-name>Curran Associates Inc.</publisher-name>).</citation>
</ref>
<ref id="B12">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Lerner</surname> <given-names>A.</given-names></name> <name><surname>Chrysanthou</surname> <given-names>Y.</given-names></name> <name><surname>Lischinski</surname> <given-names>D.</given-names></name></person-group> (<year>2007</year>). <article-title>&#x0201C;Crowds by example,&#x0201D;</article-title> in <source>Computer graphics forum, Vol. 26</source> (<publisher-loc>Hoboken, NJ</publisher-loc>: <publisher-name>Wiley Online Library</publisher-name>), <fpage>655</fpage>&#x02013;<lpage>664</lpage>. <pub-id pub-id-type="doi">10.1111/j.1467-8659.2007.01089.x</pub-id></citation>
</ref>
<ref id="B13">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Ma</surname> <given-names>Y.</given-names></name> <name><surname>Zhu</surname> <given-names>X.</given-names></name> <name><surname>Zhang</surname> <given-names>S.</given-names></name> <name><surname>Yang</surname> <given-names>R.</given-names></name> <name><surname>Wang</surname> <given-names>W.</given-names></name> <name><surname>Manocha</surname> <given-names>D.</given-names></name> <etal/></person-group>. (<year>2019</year>). <article-title>Trafficpredict: trajectory prediction for heterogeneous traffic-agents</article-title>. <source>Proc. AAAI Conf. Artif. Intell</source>. <volume>33</volume>, <fpage>6120</fpage>&#x02013;<lpage>6127</lpage>. <pub-id pub-id-type="doi">10.1609/aaai.v33i01.33016120</pub-id></citation>
</ref>
<ref id="B14">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Mangalam</surname> <given-names>K.</given-names></name> <name><surname>Girase</surname> <given-names>H.</given-names></name> <name><surname>Agarwal</surname> <given-names>S.</given-names></name> <name><surname>Lee</surname> <given-names>K.-H.</given-names></name> <name><surname>Adeli</surname> <given-names>E.</given-names></name> <name><surname>Malik</surname> <given-names>J.</given-names></name> <etal/></person-group>. (<year>2020</year>). <article-title>&#x0201C;It is not the journey but the destination: endpoint conditioned trajectory prediction,&#x0201D;</article-title> in <source>Computer Vision-ECCV 2020: 16th European Conference, Glasgow, UK, August 23&#x02013;28, 2020, Proceedings, Part II 16</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>759</fpage>&#x02013;<lpage>776</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-58536-5_45</pub-id></citation>
</ref>
<ref id="B15">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Mohamed</surname> <given-names>A.</given-names></name> <name><surname>Qian</surname> <given-names>K.</given-names></name> <name><surname>Elhoseiny</surname> <given-names>M.</given-names></name> <name><surname>Claudel</surname> <given-names>C.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Social-STGCNN: a social spatio-temporal graph convolutional neural network for human trajectory prediction,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Seattle, WA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>14424</fpage>&#x02013;<lpage>14432</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR42600.2020.01443</pub-id></citation>
</ref>
<ref id="B16">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Oh</surname> <given-names>S.</given-names></name> <name><surname>Hoogs</surname> <given-names>A.</given-names></name> <name><surname>Perera</surname> <given-names>A.</given-names></name> <name><surname>Cuntoor</surname> <given-names>N.</given-names></name> <name><surname>Chen</surname> <given-names>C.-C.</given-names></name> <name><surname>Lee</surname> <given-names>J. T.</given-names></name> <etal/></person-group>. (<year>2011</year>). <article-title>&#x0201C;A large-scale benchmark dataset for event recognition in surveillance video,&#x0201D;</article-title> in <source>CVPR 2011</source> (<publisher-loc>Colorado Springs, CO</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>3153</fpage>&#x02013;<lpage>3160</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2011.5995586</pub-id></citation>
</ref>
<ref id="B17">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Oliveira</surname> <given-names>D. D.</given-names></name> <name><surname>Rampinelli</surname> <given-names>M.</given-names></name> <name><surname>Tozatto</surname> <given-names>G. Z.</given-names></name> <name><surname>Andre&#x000E3;o</surname> <given-names>R. V.</given-names></name> <name><surname>M&#x000FC;ller</surname> <given-names>S. M.</given-names></name></person-group> (<year>2021</year>). <article-title>Forecasting vehicular traffic flow using MLP and LSTM</article-title>. <source>Neural Comput. Appl</source>. <volume>33</volume>, <fpage>17245</fpage>&#x02013;<lpage>17256</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-021-06315-w</pub-id></citation>
</ref>
<ref id="B18">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Pellegrini</surname> <given-names>S.</given-names></name> <name><surname>Ess</surname> <given-names>A.</given-names></name> <name><surname>Schindler</surname> <given-names>K.</given-names></name> <name><surname>Van Gool</surname> <given-names>L.</given-names></name></person-group> (<year>2009</year>). <article-title>&#x0201C;You&#x00027;ll never walk alone: modeling social behavior for multi-target tracking,&#x0201D;</article-title> in <source>2009 IEEE 12th International Conference on Computer Vision</source> (<publisher-loc>Kyoto</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>261</fpage>&#x02013;<lpage>268</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV.2009.5459260</pub-id></citation>
</ref>
<ref id="B19">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Rasouli</surname> <given-names>A.</given-names></name> <name><surname>Kotseruba</surname> <given-names>I.</given-names></name> <name><surname>Kunic</surname> <given-names>T.</given-names></name> <name><surname>Tsotsos</surname> <given-names>J. K.</given-names></name></person-group> (<year>2019</year>). <article-title>&#x0201C;PIE: a large-scale dataset and models for pedestrian intention estimation and trajectory prediction,&#x0201D;</article-title> in <source>2019 IEEE/CVF International Conference on Computer Vision (ICCV)</source> (<publisher-loc>Seoul</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6261</fpage>&#x02013;<lpage>6270</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV.2019.00636</pub-id></citation>
</ref>
<ref id="B20">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Robicquet</surname> <given-names>A.</given-names></name> <name><surname>Sadeghian</surname> <given-names>A.</given-names></name> <name><surname>Alahi</surname> <given-names>A.</given-names></name> <name><surname>Savarese</surname> <given-names>S.</given-names></name></person-group> (<year>2016</year>). <article-title>&#x0201C;Learning social etiquette: human trajectory understanding in crowded scenes,&#x0201D;</article-title> in <source>Computer Vision-ECCV 2016: 14th, European Conference, Amsterdam, The Netherlands, October 11&#x02013;14, 2016, Proceedings, Part VIII 14</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>549</fpage>&#x02013;<lpage>565</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-319-46484-8_33</pub-id></citation>
</ref>
<ref id="B21">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Salzmann</surname> <given-names>T.</given-names></name> <name><surname>Ivanovic</surname> <given-names>B.</given-names></name> <name><surname>Chakravarty</surname> <given-names>P.</given-names></name> <name><surname>Pavone</surname> <given-names>M.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Trajectron&#x0002B;&#x0002B;: dynamically-feasible trajectory forecasting with heterogeneous data,&#x0201D;</article-title> in <source>Computer Vision-ECCV 2020: 16th, European Conference, Glasgow, UK, August 23&#x02013;28, 2020, Proceedings, Part XVIII 16</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>683</fpage>&#x02013;<lpage>700</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-58523-5_40</pub-id></citation>
</ref>
<ref id="B22">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Shi</surname> <given-names>L.</given-names></name> <name><surname>Wang</surname> <given-names>L.</given-names></name> <name><surname>Long</surname> <given-names>C.</given-names></name> <name><surname>Zhou</surname> <given-names>S.</given-names></name> <name><surname>Zhou</surname> <given-names>M.</given-names></name> <name><surname>Niu</surname> <given-names>Z.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;SGCN: sparse graph convolution network for pedestrian trajectory prediction,&#x0201D;</article-title> in <source>2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)</source> (<publisher-loc>Nashville, TN</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>8990</fpage>&#x02013;<lpage>8999</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR46437.2021.00888</pub-id><pub-id pub-id-type="pmid">37028327</pub-id></citation></ref>
<ref id="B23">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Sultani</surname> <given-names>W.</given-names></name> <name><surname>Chen</surname> <given-names>C.</given-names></name> <name><surname>Shah</surname> <given-names>M.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Real-world anomaly detection in surveillance videos,&#x0201D;</article-title> in <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source> (<publisher-loc>Salt Lake City, UT</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>6479</fpage>&#x02013;<lpage>6488</lpage>. <pub-id pub-id-type="doi">10.1109/CVPR.2018.00678</pub-id></citation>
</ref>
<ref id="B24">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Tao</surname> <given-names>C.</given-names></name> <name><surname>Jiang</surname> <given-names>Q.</given-names></name> <name><surname>Duan</surname> <given-names>L.</given-names></name> <name><surname>Luo</surname> <given-names>P.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Dynamic and static context-aware lstm for multi-agent motion prediction,&#x0201D;</article-title> in <source>European Conference on Computer Vision</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>547</fpage>&#x02013;<lpage>563</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-58589-1_33</pub-id></citation>
</ref>
<ref id="B25">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Vemula</surname> <given-names>A.</given-names></name> <name><surname>Muelling</surname> <given-names>K.</given-names></name> <name><surname>Oh</surname> <given-names>J.</given-names></name></person-group> (<year>2018</year>). <article-title>&#x0201C;Social attention: modeling attention in human crowds,&#x0201D;</article-title> in <source>2018 IEEE International Conference on Robotics and Automation (ICRA)</source> (<publisher-loc>Brisbane, QLD</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>4601</fpage>&#x02013;<lpage>4607</lpage>. <pub-id pub-id-type="doi">10.1109/ICRA.2018.8460504</pub-id></citation>
</ref>
<ref id="B26">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Wang</surname> <given-names>C.</given-names></name> <name><surname>Wang</surname> <given-names>Y.</given-names></name> <name><surname>Xu</surname> <given-names>M.</given-names></name> <name><surname>Crandall</surname> <given-names>D. J.</given-names></name></person-group> (<year>2022</year>). <article-title>Stepwise goal-driven networks for trajectory prediction</article-title>. <source>IEEE Robot. Autom. Lett</source>. <volume>7</volume>, <fpage>2716</fpage>&#x02013;<lpage>2723</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2022.3145090</pub-id></citation>
</ref>
<ref id="B27">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Yao</surname> <given-names>Y.</given-names></name> <name><surname>Atkins</surname> <given-names>E.</given-names></name> <name><surname>Johnson-Roberson</surname> <given-names>M.</given-names></name> <name><surname>Vasudevan</surname> <given-names>R.</given-names></name> <name><surname>Du</surname> <given-names>X.</given-names></name></person-group> (<year>2021</year>). <article-title>Bitrap: bi-directional pedestrian trajectory prediction with multi-modal goal estimation</article-title>. <source>IEEE Robot. Autom. Lett</source>. <volume>6</volume>, <fpage>1463</fpage>&#x02013;<lpage>1470</lpage>. <pub-id pub-id-type="doi">10.1109/LRA.2021.3056339</pub-id></citation>
</ref>
<ref id="B28">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yu</surname> <given-names>C.</given-names></name> <name><surname>Ma</surname> <given-names>X.</given-names></name> <name><surname>Ren</surname> <given-names>J.</given-names></name> <name><surname>Zhao</surname> <given-names>H.</given-names></name> <name><surname>Yi</surname> <given-names>S.</given-names></name></person-group> (<year>2020</year>). <article-title>&#x0201C;Spatio-temporal graph transformer networks for pedestrian trajectory prediction,&#x0201D;</article-title> in <source>Computer Vision-ECCV 2020: 16th, European Conference, Glasgow, UK, August 23&#x02013;28, 2020, Proceedings, Part XII 16</source> (<publisher-loc>Cham</publisher-loc>: <publisher-name>Springer</publisher-name>), <fpage>507</fpage>&#x02013;<lpage>523</lpage>. <pub-id pub-id-type="doi">10.1007/978-3-030-58610-2_30</pub-id></citation>
</ref>
<ref id="B29">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Yuan</surname> <given-names>Y.</given-names></name> <name><surname>Weng</surname> <given-names>X.</given-names></name> <name><surname>Ou</surname> <given-names>Y.</given-names></name> <name><surname>Kitani</surname> <given-names>K. M.</given-names></name></person-group> (<year>2021</year>). <article-title>&#x0201C;Agentformer: agent-aware transformers for socio-temporal multi-agent forecasting,&#x0201D;</article-title> in <source>Proceedings of the IEEE/CVF International Conference on Computer Vision</source> (<publisher-loc>Montreal, QC</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>9813</fpage>&#x02013;<lpage>9823</lpage>. <pub-id pub-id-type="doi">10.1109/ICCV48922.2021.00967</pub-id></citation>
</ref>
<ref id="B30">
<citation citation-type="journal"><person-group person-group-type="author"><name><surname>Zhang</surname> <given-names>X.</given-names></name> <name><surname>Xu</surname> <given-names>Y.</given-names></name> <name><surname>Shao</surname> <given-names>Y.</given-names></name></person-group> (<year>2022</year>). <article-title>Forecasting traffic flow with spatial-temporal convolutional graph attention networks</article-title>. <source>Neural Comput. Appl</source>. <volume>34</volume>, <fpage>15457</fpage>&#x02013;<lpage>15479</lpage>. <pub-id pub-id-type="doi">10.1007/s00521-022-07235-z</pub-id></citation>
</ref>
<ref id="B31">
<citation citation-type="book"><person-group person-group-type="author"><name><surname>Zhao</surname> <given-names>H.</given-names></name> <name><surname>Gao</surname> <given-names>J.</given-names></name> <name><surname>Lan</surname> <given-names>T.</given-names></name> <name><surname>Sun</surname> <given-names>C.</given-names></name> <name><surname>Sapp</surname> <given-names>B.</given-names></name> <name><surname>Varadarajan</surname> <given-names>B.</given-names></name> <etal/></person-group>. (<year>2021</year>). <article-title>&#x0201C;TNT: target-driven trajectory prediction,&#x0201D;</article-title> in <source>Proceedings of the 2020 Conference on Robot Learning</source>, eds J. Kober, F. Ramos, and C. Tomlin (<publisher-loc>Cambridge, MA</publisher-loc>: <publisher-name>IEEE</publisher-name>), <fpage>895</fpage>&#x02013;<lpage>904</lpage>.</citation>
</ref>
</ref-list>
</back>
</article>