% Template for PLoS
% Version 3.5 March 2018
%
% % % % % % % % % % % % % % % % % % % % % %
%
% -- IMPORTANT NOTE
%
% This template contains comments intended
% to minimize problems and delays during our production
% process. Please follow the template instructions
% whenever possible.
%
% % % % % % % % % % % % % % % % % % % % % % %
%
% Once your paper is accepted for publication,
% PLEASE REMOVE ALL TRACKED CHANGES in this file
% and leave only the final text of your manuscript.
% PLOS recommends the use of latexdiff to track changes during review, as this will help to maintain a clean tex file.
% Visit https://www.ctan.org/pkg/latexdiff?lang=en for info or contact us at [email protected].
%
%
% There are no restrictions on package use within the LaTeX files except that
% no packages listed in the template may be deleted.
%
% Please do not include colors or graphics in the text.
%
% The manuscript LaTeX source should be contained within a single file (do not use \input, \externaldocument, or similar commands).
%
% % % % % % % % % % % % % % % % % % % % % % %
%
% -- FIGURES AND TABLES
%
% Please include tables/figure captions directly after the paragraph where they are first cited in the text.
%
% DO NOT INCLUDE GRAPHICS IN YOUR MANUSCRIPT
% - Figures should be uploaded separately from your manuscript file.
% - Figures generated using LaTeX should be extracted and removed from the PDF before submission.
% - Figures containing multiple panels/subfigures must be combined into one image file before submission.
% For figure citations, please use "Fig" instead of "Figure".
% See http://journals.plos.org/plosone/s/figures for PLOS figure guidelines.
%
% Tables should be cell-based and may not contain:
% - spacing/line breaks within cells to alter layout or alignment
% - do not nest tabular environments (no tabular environments within tabular environments)
% - no graphics or colored text (cell background color/shading OK)
% See http://journals.plos.org/plosone/s/tables for table guidelines.
%
% For tables that exceed the width of the text column, use the adjustwidth environment as illustrated in the example table in text below.
%
% % % % % % % % % % % % % % % % % % % % % % % %
%
% -- EQUATIONS, MATH SYMBOLS, SUBSCRIPTS, AND SUPERSCRIPTS
%
% IMPORTANT
% Below are a few tips to help format your equations and other special characters according to our specifications. For more tips to help reduce the possibility of formatting errors during conversion, please see our LaTeX guidelines at http://journals.plos.org/plosone/s/latex
%
% For inline equations, please be sure to include all portions of an equation in the math environment. For example, x$^2$ is incorrect; this should be formatted as $x^2$ (or $\mathrm{x}^2$ if the romanized font is desired).
%
% Do not include text that is not math in the math environment. For example, CO2 should be written as CO\textsubscript{2} instead of CO$_2$.
%
% Please add line breaks to long display equations when possible in order to fit size of the column.
%
% For inline equations, please do not include punctuation (commas, etc) within the math environment unless this is part of the equation.
%
% When adding superscript or subscripts outside of brackets/braces, please group using {}. For example, change "[U(D,E,\gamma)]^2" to "{[U(D,E,\gamma)]}^2".
%
% Do not use \cal for caligraphic font. Instead, use \mathcal{}
%
% % % % % % % % % % % % % % % % % % % % % % % %
%
% Please contact [email protected] with any questions.
%
% % % % % % % % % % % % % % % % % % % % % % % %
\documentclass[10pt,letterpaper]{article}
\usepackage[top=0.85in,left=2.75in,footskip=0.75in]{geometry}
% amsmath and amssymb packages, useful for mathematical formulas and symbols
\usepackage{amsmath,amssymb}
% Use adjustwidth environment to exceed column width (see example table in text)
\usepackage{changepage}
% Use Unicode characters when possible
\usepackage[utf8x]{inputenc}
% textcomp package and marvosym package for additional characters
\usepackage{textcomp,marvosym}
% cite package, to clean up citations in the main text. Do not remove.
\usepackage{cite}
% Use nameref to cite supporting information files (see Supporting Information section for more info)
\usepackage{nameref,hyperref}
% line numbers
\usepackage[right]{lineno}
% ligatures disabled
\usepackage{microtype}
\DisableLigatures[f]{encoding = *, family = * }
% color can be used to apply background shading to table cells only
\usepackage[table]{xcolor}
% array package and thick rules for tables
\usepackage{array}
% create "+" rule type for thick vertical lines
\newcolumntype{+}{!{\vrule width 2pt}}
% create \thickcline for thick horizontal lines of variable length
\newlength\savedwidth
\newcommand\thickcline[1]{%
\noalign{\global\savedwidth\arrayrulewidth\global\arrayrulewidth 2pt}%
\cline{#1}%
\noalign{\vskip\arrayrulewidth}%
\noalign{\global\arrayrulewidth\savedwidth}%
}
% \thickhline command for thick horizontal lines that span the table
\newcommand\thickhline{\noalign{\global\savedwidth\arrayrulewidth\global\arrayrulewidth 2pt}%
\hline
\noalign{\global\arrayrulewidth\savedwidth}}
% Remove comment for double spacing
%\usepackage{setspace}
%\doublespacing
% Text layout
\raggedright
\setlength{\parindent}{0.5cm}
\textwidth 5.25in
\textheight 8.75in
% Bold the 'Figure #' in the caption and separate it from the title/caption with a period
% Captions will be left justified
\usepackage[aboveskip=1pt,labelfont=bf,labelsep=period,justification=raggedright,singlelinecheck=off]{caption}
\renewcommand{\figurename}{Fig}
% Use the PLoS provided BiBTeX style
\bibliographystyle{plos2015}
% Remove brackets from numbering in List of References
\makeatletter
\renewcommand{\@biblabel}[1]{\quad#1.}
\makeatother
% Header and Footer with logo
\usepackage{lastpage,fancyhdr,graphicx}
\usepackage{epstopdf}
%\pagestyle{myheadings}
\pagestyle{fancy}
\fancyhf{}
%\setlength{\headheight}{27.023pt}
%\lhead{\includegraphics[width=2.0in]{PLOS-submission.eps}}
\rfoot{\thepage/\pageref{LastPage}}
\renewcommand{\headrulewidth}{0pt}
\renewcommand{\footrule}{\hrule height 2pt \vspace{2mm}}
\fancyheadoffset[L]{2.25in}
\fancyfootoffset[L]{2.25in}
\lfoot{\today}
%% Include all macros below
\newcommand{\lorem}{{\bf LOREM}}
\newcommand{\ipsum}{{\bf IPSUM}}
%% END MACROS SECTION
\begin{document}
\vspace*{0.2in}
% Title must be 250 characters or less.
\begin{flushleft}
{\Large
\textbf\newline{Title of submission to PLOS journals} % Please use "sentence case" for title and headings (capitalize only the first word in a title (or heading), the first word in a subtitle (or subheading), and any proper nouns).
}
\newline
% Insert author names, affiliations and corresponding author email (do not include titles, positions, or degrees).
\\
Name1 Surname\textsuperscript{1,2\Yinyang},
Name2 Surname\textsuperscript{2\Yinyang},
Name3 Surname\textsuperscript{2,3\textcurrency},
Name4 Surname\textsuperscript{2},
Name5 Surname\textsuperscript{2\ddag},
Name6 Surname\textsuperscript{2\ddag},
Name7 Surname\textsuperscript{1,2,3*},
with the Lorem Ipsum Consortium\textsuperscript{\textpilcrow}
\\
\bigskip
\textbf{1} Affiliation Dept/Program/Center, Institution Name, City, State, Country
\\
\textbf{2} Affiliation Dept/Program/Center, Institution Name, City, State, Country
\\
\textbf{3} Affiliation Dept/Program/Center, Institution Name, City, State, Country
\\
\bigskip
% Insert additional author notes using the symbols described below. Insert symbol callouts after author names as necessary.
%
% Remove or comment out the author notes below if they aren't used.
%
% Primary Equal Contribution Note
\Yinyang These authors contributed equally to this work.
% Additional Equal Contribution Note
% Also use this double-dagger symbol for special authorship notes, such as senior authorship.
\ddag These authors also contributed equally to this work.
% Current address notes
\textcurrency Current Address: Dept/Program/Center, Institution Name, City, State, Country % change symbol to "\textcurrency a" if more than one current address note
% \textcurrency b Insert second current address
% \textcurrency c Insert third current address
% Deceased author note
\dag Deceased
% Group/Consortium Author Note
\textpilcrow Membership list can be found in the Acknowledgments section.
% Use the asterisk to denote corresponding authorship and provide email address in note below.
\end{flushleft}
% Please keep the abstract below 300 words
\section*{Abstract}
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur eget porta erat. Morbi consectetur est vel gravida pretium. Suspendisse ut dui eu ante cursus gravida non sed sem. Nullam sapien tellus, commodo id velit id, eleifend volutpat quam. Phasellus mauris velit, dapibus finibus elementum vel, pulvinar non tellus. Nunc pellentesque pretium diam, quis maximus dolor faucibus id. Nunc convallis sodales ante, ut ullamcorper est egestas vitae. Nam sit amet enim ultrices, ultrices elit pulvinar, volutpat risus.
% Please keep the Author Summary between 150 and 200 words
% Use first person. PLOS ONE authors please skip this step.
% Author Summary not valid for PLOS ONE submissions.
\section*{Author summary}
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Curabitur eget porta erat. Morbi consectetur est vel gravida pretium. Suspendisse ut dui eu ante cursus gravida non sed sem. Nullam sapien tellus, commodo id velit id, eleifend volutpat quam. Phasellus mauris velit, dapibus finibus elementum vel, pulvinar non tellus. Nunc pellentesque pretium diam, quis maximus dolor faucibus id. Nunc convallis sodales ante, ut ullamcorper est egestas vitae. Nam sit amet enim ultrices, ultrices elit pulvinar, volutpat risus.
\linenumbers
% Use "Eq" instead of "Equation" for equation citations.
\section*{Introduction}
Lorem ipsum dolor sit~\cite{bib1} amet, consectetur adipiscing elit. Curabitur eget porta erat. Morbi consectetur est vel gravida pretium. Suspendisse ut dui eu ante cursus gravida non sed sem. Nullam Eq~(\ref{eq:schemeP}) sapien tellus, commodo id velit id, eleifend volutpat quam. Phasellus mauris velit, dapibus finibus elementum vel, pulvinar non tellus. Nunc pellentesque pretium diam, quis maximus dolor faucibus id.~\cite{bib2} Nunc convallis sodales ante, ut ullamcorper est egestas vitae. Nam sit amet enim ultrices, ultrices elit pulvinar, volutpat risus.
\begin{eqnarray}
\label{eq:schemeP}
\mathrm{P_Y} = \underbrace{H(Y_n) - H(Y_n|\mathbf{V}^{Y}_{n})}_{S_Y} + \underbrace{H(Y_n|\mathbf{V}^{Y}_{n})- H(Y_n|\mathbf{V}^{X,Y}_{n})}_{T_{X\rightarrow Y}},
\end{eqnarray}
\section*{Materials and methods}
\subsection*{Etiam eget sapien nibh}
% For figure citations, please use "Fig" instead of "Figure".
Nulla mi mi, Fig~\ref{fig1} venenatis sed ipsum varius, volutpat euismod diam. Proin rutrum vel massa non gravida. Quisque tempor sem et dignissim rutrum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi at justo vitae nulla elementum commodo eu id massa. In vitae diam ac augue semper tincidunt eu ut eros. Fusce fringilla erat porttitor lectus cursus, \nameref{S1_Video} vel sagittis arcu lobortis. Aliquam in enim semper, aliquam massa id, cursus neque. Praesent faucibus semper libero.
% Place figure captions after the first paragraph in which they are cited.
\begin{figure}[!h]
\caption{{\bf Bold the figure title.}
Figure caption text here, please use this space for the figure panel descriptions instead of using subfigure commands. A: Lorem ipsum dolor sit amet. B: Consectetur adipiscing elit.}
\label{fig1}
\end{figure}
% Results and Discussion can be combined.
\section*{Results}
Nulla mi mi, venenatis sed ipsum varius, Table~\ref{table1} volutpat euismod diam. Proin rutrum vel massa non gravida. Quisque tempor sem et dignissim rutrum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi at justo vitae nulla elementum commodo eu id massa. In vitae diam ac augue semper tincidunt eu ut eros. Fusce fringilla erat porttitor lectus cursus, vel sagittis arcu lobortis. Aliquam in enim semper, aliquam massa id, cursus neque. Praesent faucibus semper libero.
% Place tables after the first paragraph in which they are cited.
\begin{table}[!ht]
\begin{adjustwidth}{-2.25in}{0in} % Comment out/remove adjustwidth environment if table fits in text column.
\centering
\caption{
{\bf Table caption Nulla mi mi, venenatis sed ipsum varius, volutpat euismod diam.}}
\begin{tabular}{|l+l|l|l|l|l|l|l|}
\hline
\multicolumn{4}{|l|}{\bf Heading1} & \multicolumn{4}{|l|}{\bf Heading2}\\ \thickhline
$cell1 row1$ & cell2 row 1 & cell3 row 1 & cell4 row 1 & cell5 row 1 & cell6 row 1 & cell7 row 1 & cell8 row 1\\ \hline
$cell1 row2$ & cell2 row 2 & cell3 row 2 & cell4 row 2 & cell5 row 2 & cell6 row 2 & cell7 row 2 & cell8 row 2\\ \hline
$cell1 row3$ & cell2 row 3 & cell3 row 3 & cell4 row 3 & cell5 row 3 & cell6 row 3 & cell7 row 3 & cell8 row 3\\ \hline
\end{tabular}
\begin{flushleft} Table notes Phasellus venenatis, tortor nec vestibulum mattis, massa tortor interdum felis, nec pellentesque metus tortor nec nisl. Ut ornare mauris tellus, vel dapibus arcu suscipit sed.
\end{flushleft}
\label{table1}
\end{adjustwidth}
\end{table}
%PLOS does not support heading levels beyond the 3rd (no 4th level headings).
\subsection*{\lorem\ and \ipsum\ nunc blandit a tortor}
\subsubsection*{3rd level heading}
Maecenas convallis mauris sit amet sem ultrices gravida. Etiam eget sapien nibh. Sed ac ipsum eget enim egestas ullamcorper nec euismod ligula. Curabitur fringilla pulvinar lectus consectetur pellentesque. Quisque augue sem, tincidunt sit amet feugiat eget, ullamcorper sed velit. Sed non aliquet felis. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris commodo justo ac dui pretium imperdiet. Sed suscipit iaculis mi at feugiat.
\begin{enumerate}
\item{react}
\item{diffuse free particles}
\item{increment time by dt and go to 1}
\end{enumerate}
\subsection*{Sed ac quam id nisi malesuada congue}
Nulla mi mi, venenatis sed ipsum varius, volutpat euismod diam. Proin rutrum vel massa non gravida. Quisque tempor sem et dignissim rutrum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi at justo vitae nulla elementum commodo eu id massa. In vitae diam ac augue semper tincidunt eu ut eros. Fusce fringilla erat porttitor lectus cursus, vel sagittis arcu lobortis. Aliquam in enim semper, aliquam massa id, cursus neque. Praesent faucibus semper libero.
\begin{itemize}
\item First bulleted item.
\item Second bulleted item.
\item Third bulleted item.
\end{itemize}
\section*{Discussion}
Nulla mi mi, venenatis sed ipsum varius, Table~\ref{table1} volutpat euismod diam. Proin rutrum vel massa non gravida. Quisque tempor sem et dignissim rutrum. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Morbi at justo vitae nulla elementum commodo eu id massa. In vitae diam ac augue semper tincidunt eu ut eros. Fusce fringilla erat porttitor lectus cursus, vel sagittis arcu lobortis. Aliquam in enim semper, aliquam massa id, cursus neque. Praesent faucibus semper libero~\cite{bib3}.
\section*{Conclusion}
CO\textsubscript{2} Maecenas convallis mauris sit amet sem ultrices gravida. Etiam eget sapien nibh. Sed ac ipsum eget enim egestas ullamcorper nec euismod ligula. Curabitur fringilla pulvinar lectus consectetur pellentesque. Quisque augue sem, tincidunt sit amet feugiat eget, ullamcorper sed velit.
Sed non aliquet felis. Lorem ipsum dolor sit amet, consectetur adipiscing elit. Mauris commodo justo ac dui pretium imperdiet. Sed suscipit iaculis mi at feugiat. Ut neque ipsum, luctus id lacus ut, laoreet scelerisque urna. Phasellus venenatis, tortor nec vestibulum mattis, massa tortor interdum felis, nec pellentesque metus tortor nec nisl. Ut ornare mauris tellus, vel dapibus arcu suscipit sed. Nam condimentum sem eget mollis euismod. Nullam dui urna, gravida venenatis dui et, tincidunt sodales ex. Nunc est dui, sodales sed mauris nec, auctor sagittis leo. Aliquam tincidunt, ex in facilisis elementum, libero lectus luctus est, non vulputate nisl augue at dolor. For more information, see \nameref{S1_Appendix}.
\section*{Supporting information}
% Include only the SI item label in the paragraph heading. Use the \nameref{label} command to cite SI items in the text.
\paragraph*{S1 Fig.}
\label{S1_Fig}
{\bf Bold the title sentence.} Add descriptive text after the title of the item (optional).
\paragraph*{S2 Fig.}
\label{S2_Fig}
{\bf Lorem ipsum.} Maecenas convallis mauris sit amet sem ultrices gravida. Etiam eget sapien nibh. Sed ac ipsum eget enim egestas ullamcorper nec euismod ligula. Curabitur fringilla pulvinar lectus consectetur pellentesque.
\paragraph*{S1 File.}
\label{S1_File}
{\bf Lorem ipsum.} Maecenas convallis mauris sit amet sem ultrices gravida. Etiam eget sapien nibh. Sed ac ipsum eget enim egestas ullamcorper nec euismod ligula. Curabitur fringilla pulvinar lectus consectetur pellentesque.
\paragraph*{S1 Video.}
\label{S1_Video}
{\bf Lorem ipsum.} Maecenas convallis mauris sit amet sem ultrices gravida. Etiam eget sapien nibh. Sed ac ipsum eget enim egestas ullamcorper nec euismod ligula. Curabitur fringilla pulvinar lectus consectetur pellentesque.
\paragraph*{S1 Appendix.}
\label{S1_Appendix}
{\bf Lorem ipsum.} Maecenas convallis mauris sit amet sem ultrices gravida. Etiam eget sapien nibh. Sed ac ipsum eget enim egestas ullamcorper nec euismod ligula. Curabitur fringilla pulvinar lectus consectetur pellentesque.
\paragraph*{S1 Table.}
\label{S1_Table}
{\bf Lorem ipsum.} Maecenas convallis mauris sit amet sem ultrices gravida. Etiam eget sapien nibh. Sed ac ipsum eget enim egestas ullamcorper nec euismod ligula. Curabitur fringilla pulvinar lectus consectetur pellentesque.
\section*{Acknowledgments}
Cras egestas velit mauris, eu mollis turpis pellentesque sit amet. Interdum et malesuada fames ac ante ipsum primis in faucibus. Nam id pretium nisi. Sed ac quam id nisi malesuada congue. Sed interdum aliquet augue, at pellentesque quam rhoncus vitae.
\nolinenumbers
% Either type in your references using
% \begin{thebibliography}{}
% \bibitem{}
% Text
% \end{thebibliography}
%
% or
%
% Compile your BiBTeX database using our plos2015.bst
% style file and paste the contents of your .bbl file
% here. See http://journals.plos.org/plosone/s/latex for
% step-by-step instructions.
%
\begin{thebibliography}{10}
\bibitem{bib1}
Conant GC, Wolfe KH.
\newblock {{T}urning a hobby into a job: how duplicated genes find new
functions}.
\newblock Nat Rev Genet. 2008 Dec;9(12):938--950.
\bibitem{bib2}
Ohno S.
\newblock Evolution by gene duplication.
\newblock London: George Alien \& Unwin Ltd. Berlin, Heidelberg and New York:
Springer-Verlag.; 1970.
\bibitem{bib3}
Magwire MM, Bayer F, Webster CL, Cao C, Jiggins FM.
\newblock {{S}uccessive increases in the resistance of {D}rosophila to viral
infection through a transposon insertion followed by a {D}uplication}.
\newblock PLoS Genet. 2011 Oct;7(10):e1002337.
\end{thebibliography}
\end{document}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%2345678901234567890123456789012345678901234567890123456789012345678901234567890
% 1 2 3 4 5 6 7 8
\documentclass[letterpaper, 10 pt, conference]{ieeeconf} % Comment this line out
% if you need a4paper
%\documentclass[a4paper, 10pt, conference]{ieeeconf} % Use this line for a4
% paper
\IEEEoverridecommandlockouts % This command is only
% needed if you want to
% use the \thanks command
\overrideIEEEmargins
% See the \addtolength command later in the file to balance the column lengths
% on the last page of the document
% The following packages can be found on http://www.ctan.org
%\usepackage{graphics} % for pdf, bitmapped graphics files
%\usepackage{epsfig} % for postscript graphics files
\usepackage{color}
\usepackage{float}
\usepackage{comment}
\definecolor{rosegold}{rgb}{0.72, 0.43, 0.47}
%\usepackage{mathptmx} % assumes new font selection scheme installed
%\usepackage{times} % assumes new font selection scheme installed
%\usepackage{amsmath} % assumes amsmath package installed
%\usepackage{amssymb} % assumes amsmath package installed
\usepackage[english]{babel}
\usepackage[utf8x]{inputenc}
\usepackage[T1]{fontenc}
\usepackage{placeins}
\usepackage{multirow}
\usepackage{graphicx} % include this line if your document contains figures
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{eurosym}
%\usepackage{subcaption}
%\usepackage{caption}
\usepackage{color}
\newcommand{\kXX}[1]{\color{blue} XX #1 XX \color{black}}
\setlength{\textfloatsep}{3pt}% Remove \textfloatsep
\usepackage{subcaption}
\DeclareMathOperator*{\argminA}{arg\,min}
\DeclareMathOperator*{\lSupervised}{l}
\usepackage[linesnumbered,ruled]{algorithm2e}
% http://ctan.org/pkg/graphicx
\title{\LARGE \bf
Sample Efficient Interactive End-to-End Deep Learning for Self-Driving Cars with Selective Multi-Class Safe Dataset Aggregation
}
%\author{ \parbox{3 in}{\centering Huibert Kwakernaak*
% \thanks{*Use the $\backslash$thanks command to put information here}\\
% Faculty of Electrical Engineering, Mathematics and Computer Science\\
% University of Twente\\
% 7500 AE Enschede, The Netherlands\\
% {\tt\small [email protected]}}
% \hspace*{ 0.5 in}
% \parbox{3 in}{ \centering Pradeep Misra**
% \thanks{**The footnote marks may be inserted manually}\\
% Department of Electrical Engineering \\
% Wright State University\\
% Dayton, OH 45435, USA\\
% {\tt\small [email protected]}}
%}
\author{Yunus Bicer$^{1}$, Ali Alizadeh$^{2}$, Nazim Kemal Ure$^{3}$, Ahmetcan Erdogan$^{4}$ and Orkun Kizilirmak$^{4}$ % <-this % stops a space
\thanks{*This work is supported by AVL Turkey and Scientific and Technological Research Council of Turkey under the grant agreement TEYDEB 1515 / 5169901}% <-this % stops a space
\thanks{$^{1}$Y. Bicer is with Faculty of Aeronautics and Astronautics, Aerospace Engineering,
Istanbul Technical University, Turkey
{\tt\small biceryu at itu.edu.tr}}%
\thanks{$^{2}$A. Alizadeh is with Faculty of Mechatronics Engineering, Istanbul Technical University, Turkey {\tt\small Alizadeha at itu.edu.tr}}%
\thanks{$^{3}$N.K. Ure is with Faculty of Aeronautics and Astronautics, Department of Aeronautical Engineering, Istanbul Technical University, Turkey
{\tt\small ure at itu.edu.tr}}%
\thanks{$^{4}$A. Erdogan and O. Kizilirmak are with AVL Turkey, Istanbul, Turkey
{\tt\small ahmetcan.erdogan, orkun.kizilirmak at avl.com}}%
}
\begin{document}
\maketitle
\thispagestyle{empty}
\pagestyle{empty}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{abstract}
The objective of this paper is to develop a sample-efficient end-to-end deep learning method for self-driving cars, where we attempt to increase the value of the information extracted from each call to the expert driver's policy through careful analysis. End-to-end imitation learning is a popular method for computing self-driving car policies. The standard approach relies on collecting pairs of inputs (camera images) and outputs (steering angle, etc.) from an expert policy and fitting a deep neural network to this data to learn the driving policy. Although this approach has had some successful demonstrations in the past, learning a good policy might require many samples from the expert driver, which can be resource-consuming. In this work, we develop a novel framework based on the Safe Dataset Aggregation (SafeDAgger) approach, where the trajectories executed by the current learned policy are automatically segmented into different classes, and the algorithm identifies the trajectory segments/classes with weak performance at each step. Once the trajectory segments with weak performance are identified, the sampling algorithm focuses on calling the expert policy only on these segments, which improves the convergence rate. The presented simulation results show that the proposed approach can yield significantly better performance than the standard SafeDAgger algorithm while using the same number of samples from the expert.
\end{abstract}
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\section{INTRODUCTION}
Recent years have seen significant advances in self-driving car technologies, mainly due to several breakthroughs in the area of deep learning. In particular, the use of vision-based methods to generate driving policies has been of interest to a vast body of researchers, resulting in a variety of learning and control architectures that can be roughly classified into classical and end-to-end methods. Conventional methods approach the problem of autonomous driving in three stages: perception, path planning, and control \cite{leonard2008}. In the perception stage, feature extraction and image processing techniques such as color enhancement, edge detection, etc. are applied to image data to detect lane markings. In path planning, the reference and current paths of the car are determined based on the features identified in perception. In the control stage, control actions for the vehicle such as steering, speed, etc. are calculated from the reference and current paths with an appropriate control algorithm. The performance of classical methods depends heavily on the performance of the perception stage, which can be sub-optimal because of the manually defined features and rules in this stage \cite{c1}. The sequential structure of classical methods may also lead to a lack of robustness against errors, as an error in feature extraction can result in an inaccurate final decision.
On the other hand, end-to-end learning methods learn a function from samples obtained from an expert driving policy. The learned function can generate the control inputs directly from the vision data, combining the three layers of the classical control sequence into a single step. By far the most popular approach for representing the mapping from images to controls in end-to-end driving is neural networks (NN). ALVINN by Pomerleau \cite{c2} is one of the initial works in this area, using a feedforward neural network that maps frames of the front-facing camera to steering input. Researchers from Nvidia utilized convolutional neural networks (CNN) \cite{c3} to automate the feature extraction process and predict the steering input. An FCN-LSTM architecture \cite{c4} was proposed to increase learning performance with scene segmentation. In \cite{c5}, a visual attention model is used to highlight essential regions of frames for better prediction. Although steering input prediction in an end-to-end manner is a well-studied problem in the literature, the steering input alone is not sufficient for fully autonomous driving. In \cite{c6}, a CNN-LSTM network is proposed to predict the speed and steering inputs synchronously.
Pure end-to-end learning policies are limited to the demonstrated performance, and although the training and validation loss on the data collected from the expert might be low, errors accumulated during execution of the learned driving policy might lead to poor performance in the long run. This performance loss arises partly because the learned driving policy is likely to observe states that do not belong to the distribution of the original expert demonstration data. The DAgger algorithm \cite{c7} addresses this issue by iteratively collecting training data from both expert and trained policies. The main idea behind DAgger is to actively obtain more samples from the expert to improve the learned policy. Even though DAgger achieves better driving performance, it might end up obtaining many samples from the expert, which can be time- and resource-consuming in many real-life scenarios. The SafeDAgger algorithm \cite{c8}, an extension of DAgger, attempts to minimize the number of calls to the expert by predicting the unsafe trajectories of the learned driving policy and calling the expert only in such cases. Another extension of DAgger, EnsembleDAgger \cite{c9}, predicts the variance of the decisions by using multiple models and takes it as an additional safety criterion, similar to SafeDAgger.
In this paper, we propose a novel framework, named Selective SafeDAgger, which is sample-efficient compared to the SafeDAgger algorithm (the state-of-the-art data aggregation method). The proposed algorithm classifies the trajectories executed by the learned policy into safe segments and multiple classes of unsafe segments. After the prediction, the model focuses on obtaining expert policy samples primarily from the identified unsafe segment classes. Our main contribution is an imitation learning algorithm that collects the most promising samples from the expert policy, enabling it to outperform the SafeDAgger method while limited to the same number of calls to the expert.
This paper is organized as follows. Section II provides the details of the methodology. The experimental setup is described in Section III, followed by a discussion of the results in Section IV and conclusions in Section V.
\section{METHODOLOGY}
In this section, the driving policies, the network architecture, and the devised algorithm are explained in detail.
\subsection{Driving Policies}
We begin by defining the terms used to describe driving policies in detail.
The set of states $S$ for the car in this paper is an environment model, and $s \in S$ is one state of the car in that environment. The observation of state $s$ is defined as $\phi(s) \in \Phi(S)$, where $\Phi(S)$ is the observation set over all states. $a(s)\in A(S)$ is the driving action at observation $\phi(s)$, where $A(S)$ is the set of all possible actions.
A set of driving policies $\Pi$ is defined as in Eq. (\ref{eq:drivingPloicy}).\begin{equation} \label{eq:drivingPloicy}
\Pi: \Phi(S) \rightarrow A(S)
\end{equation}where $\Pi$ is a mapping from state observations $\phi(s)$ to driving actions $a(s)$ such as steering, throttle, brake, etc.
Two distinct driving policies are defined throughout the paper. The first is an expert policy $\pi^*\in\Pi$ that drives the car with a reasonable performance that we want to imitate. The expert policy in an autonomous driving scenario is usually chosen as the actions of a human driver. Variants of the DAgger algorithm, however, suffer from a mislabeling problem in the case of a human driver, since drivers receive no feedback from their actions and can give incorrect reactions to the given states. To overcome the mislabeling problem, we use a rule-based controller, which contains speed and steering controllers, as the expert policy in this paper. The second is a primary policy $\pi_0\in\Pi$ that is trained to drive the car. This policy is sub-optimal relative to the expert policy, since it is trained on a subset of the observation set $\Phi(S)$.
Training a primary policy to mimic an expert policy is called imitation learning or learning by demonstration. One of the most common methods for imitation learning is based on supervised learning techniques. The loss function for supervised learning is defined as in Eq. (\ref{eq:loss}) \cite{c8}. \begin{equation}
\label{eq:loss}
l_{supervised}(\pi,\pi^{*},D_0) = \frac{1}{N}\sum_{i=1}^{N} ||\pi(\phi(s_i)) - \pi^{*}(\phi(s_i))||^2
\end{equation}where $l_{supervised}$ denotes the mean squared $l^2$-norm of the difference between the trained and expert policy actions.
The primary policy is then defined as the policy that minimizes this loss function, as in Eq. (\ref{eq:primary}).\begin{equation}
\label{eq:primary}
\pi_{0} = \argminA_{\pi} l_{supervised}(\pi, \pi^{*}, D_{0})
\end{equation}
Minimizing the loss function can be challenging, since the relation between image frames and driving actions is known to be highly nonlinear. We therefore use a deep neural network architecture to find an optimal solution for the primary policy.
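As a minimal illustrative sketch (not the implementation used in this work), the supervised loss above can be written as follows, assuming the policy and expert actions are stored as NumPy arrays of shape $(N, \text{action dimension})$:
\begin{verbatim}
import numpy as np

def l_supervised(pi_actions, expert_actions):
    # Mean squared l2-norm between the trained policy's actions
    # and the expert's actions over the N dataset samples.
    diff = pi_actions - expert_actions        # shape (N, action_dim)
    return np.mean(np.sum(diff ** 2, axis=1))
\end{verbatim}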
\subsection{Network Architecture}
Earlier works in end-to-end learning for self-driving cars focus on computing only the steering angle from a single image or a sequence of images. A longitudinal control component is required to reach a higher level of autonomy in the end-to-end framework. In this work, we utilize the multi-task model proposed in \cite{c6} as our baseline, which is capable of generating both longitudinal and lateral control inputs for the car. In addition, we utilize a speed controller rather than classical throttle/brake commands for longitudinal control. The steering action is predicted through convolution layers from the raw image inputs taken from the cameras located in front of the vehicle, and the speed is predicted from a sequence of speed profiles through a Long Short-Term Memory (LSTM) layer. There exists a single-direction coupling between the longitudinal controller (speed controller) and the lateral steering actions. In particular, the speed of the vehicle has a significant impact on the prediction model, since entering a turn at low speed presents different dynamics for the lateral controller than a high-speed maneuver. Moreover, the straight trajectory dominates all other trajectory types (e.g., turn left, turn right); therefore, the trained network will be biased toward the straight path. To recover from this issue, we define various trajectory types covering all major maneuvers, such as straight, turn left, and turn right, in both low- and high-speed scenarios, so that the devised model also learns the less frequent maneuvers.
The model architecture is shown in Fig.~\ref{fig:Architecture}. It takes the current observation and the past speed profile and returns the steering action, the speed action, and the class of the trajectory segment. The multi-task network predicts the steering angle through a visual encoder using a stack of convolution and fully-connected layers. In the first two convolution layers (Conv1 and Conv2), a large kernel size is adopted to better capture the environment features, which is suitable for the front-view camera. The inputs and kernels of each convolution layer are denoted by ``$\# channels @ input\,height \times input\,width$'' and ``$kernel\,height \times kernel\,width \times \# channels$'', and each fully-connected layer is denoted by ``$FC - \text{number of neurons}$''. The speed and trajectory class are predicted from a concatenation of the visual encoder and feedback speed features. The speed features are extracted by an LSTM layer followed by fully-connected layers. ReLU (Rectified Linear Unit) is used as the activation function for all layers. Mean absolute error is the loss function for both the speed and steering angle predictions, as regression problems; the cross-entropy loss applies to the trajectory classifier, as a classification problem.
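For concreteness, the overall wiring of such a multi-task network can be sketched in Keras as below; the filter counts, kernel sizes, layer widths, and sequence length are illustrative assumptions rather than the exact values of Fig.~\ref{fig:Architecture}:
\begin{verbatim}
import tensorflow as tf
from tensorflow.keras import layers, Model

# Visual encoder: large kernels in the first two convolutions.
img_in = layers.Input(shape=(59, 255, 3), name="image")
x = layers.Conv2D(24, (12, 12), strides=2, activation="relu")(img_in)
x = layers.Conv2D(36, (8, 8), strides=2, activation="relu")(x)
x = layers.Conv2D(48, (5, 5), strides=2, activation="relu")(x)
visual = layers.Dense(100, activation="relu")(layers.Flatten()(x))

# Speed encoder: LSTM over the past speed profile.
spd_in = layers.Input(shape=(10, 1), name="past_speeds")
s = layers.Dense(50, activation="relu")(layers.LSTM(32)(spd_in))

merged = layers.Concatenate()([visual, s])
steering = layers.Dense(1, name="steering")(visual)      # lateral head
speed = layers.Dense(1, name="speed")(merged)            # longitudinal head
traj = layers.Dense(7, activation="softmax",
                    name="trajectory_class")(merged)     # 7 classes

model = Model([img_in, spd_in], [steering, speed, traj])
model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=1e-5),
              loss={"steering": "mae", "speed": "mae",
                    "trajectory_class": "categorical_crossentropy"})
\end{verbatim}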
\begin{figure}[]
\centering
\includegraphics[width=0.8\columnwidth,trim={0 0.46cm 0 0.33cm},clip]{architecture.png}
\caption{Sample-efficient Selective SafeDAgger model}
\label{fig:Architecture}
\end{figure}
The multi-class classifier highlighted in Fig.~\ref{fig:Architecture} extends the SafeDAgger method to the novel algorithm devised in this paper. The trajectory classes are defined as follows:\begin{equation} \label{eq:multi-class}
c_{s}(\pi,\phi(s)) = \begin{cases}
1,&\text{Safe Trajectories} \\
2,& \text{Unsafe Low-Speed Left(LL)} \\
3,&\text{Unsafe High-Speed Left(HL)} \\
4,&\text{Unsafe Low-Speed Right(LR)} \\
5,&\text{Unsafe High-Speed Right(HR)} \\
6,&\text{Unsafe Low-Speed Straight(LS)} \\
7,&\text{Unsafe High-Speed Straight(HS)} \\
\end{cases}
\end{equation}Low and high speeds combined with left, straight, and right maneuvers cover almost all unsafe trajectories. The same combinations are also applicable to safe trajectories, but since the expert policy need not be called on safe trajectories, we define only one class for them.
The multi-class classifier takes the partial observation of the state $\phi(s)$, which contains the visual perception and the past speed profile, and returns a label indicating in which part of the trajectory the policy is likely to deviate from the expert policy $\pi^{*}$.
The labels for training the model were generated through one-hot encoding, defined by sequential decisions: first, it was decided whether the policy is safe by measuring its distance from the expert policy with the $l^2$-norm metric of Eq. (\ref{eq:l2_norm_metric}). \begin{equation}
\label{eq:l2_norm_metric}
c_{s}(\pi, \phi(s)) = \begin{cases}
0, & ||\pi(\phi(s)) -\pi^{*}(\phi(s))|| > \tau_{safe} \\
1,&\text{otherwise}\end{cases}
\end{equation}where $\tau_{safe}$ is a predefined threshold that can be chosen arbitrarily. Furthermore, to distinguish between low-speed and high-speed turn trajectories, a steering threshold $\tau_{turn}$ and speed thresholds for the turn maneuver, $\tau_{speed, turn}$, and the straight trajectory, $\tau_{speed, straight}$, are defined heuristically based on the response of the car dynamics in these trajectories. The threshold values used in this work are listed in Table \ref{table:thresholds}.
\vspace{-3mm}
\begin{table}[h]
{\setlength{\tabcolsep}{14pt}
\caption{Threshold Values in Labeling Process}
\begin{center}
\vspace{-3mm}
\begin{tabular}{cc}
\hline\hline
Parameter & Threshold value \\
\hline
$\tau_{safe}$ & $0.5$ \\
$\tau_{turn}$ & $0.25^{\circ}$\\
$\tau_{speed, turn}$ & $10 \, \, m/s$ \\
$\tau_{speed, straight}$ & $13.75 \, \, m/s$ \\
\hline
\end{tabular}
\end{center}
\label{table:thresholds}}
\vspace{-4mm}
\end{table}\noindent where $\tau_{safe}$ of $0.5$ corresponds to $0.25^{\circ}$ for the steering angle and $1 \, m/s$ for the speed difference between the network prediction and the expert policy output.
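A minimal sketch of this labeling logic follows; the sign convention for left/right steering and the use of the expert action to decide the maneuver type are assumptions of the sketch:
\begin{verbatim}
import numpy as np

TAU_SAFE = 0.5              # l2-norm safety threshold
TAU_TURN = 0.25             # deg, steering threshold
TAU_SPD_TURN = 10.0         # m/s
TAU_SPD_STRAIGHT = 13.75    # m/s

def trajectory_class(policy_action, expert_action):
    # Actions are [steering, speed] pairs; class 1 is safe,
    # classes 2-7 are the unsafe maneuver types defined above.
    if np.linalg.norm(np.asarray(policy_action) -
                      np.asarray(expert_action)) <= TAU_SAFE:
        return 1                                   # safe
    steer, speed = expert_action
    if steer < -TAU_TURN:                          # left turn (assumed sign)
        return 2 if speed <= TAU_SPD_TURN else 3   # LL / HL
    if steer > TAU_TURN:                           # right turn
        return 4 if speed <= TAU_SPD_TURN else 5   # LR / HR
    return 6 if speed <= TAU_SPD_STRAIGHT else 7   # LS / HS
\end{verbatim}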
\subsection{Selective SafeDAgger Algorithm}
\begin{algorithm}
\SetKwInOut{Input}{Input}
\SetKwInOut{Output}{Output}
Collect $D_{0}$ using $\pi^{*}$ \label{D_0} \\
$\pi_{0} = \argminA_{\pi} l_{supervised}(\pi, \pi^{*}, D_{0})$ \\
\For{i = 1:N}{
\label{begin}
\textcolor{blue}{{$c_{i} \leftarrow $ Define unsafe classes over $D_{0}$} \\ $D^{'} \leftarrow [\,]$ \\
\While{$k \le T$}{\label{begin2}
$\phi_k \leftarrow \phi(s)$\\
$c_{\phi_k} \leftarrow$ classifier output of $\pi_i(\phi_k)$ \\
\eIf{$c_{\phi_k} \in c_i$}{
use $ \pi^*(\phi_k)$\\
$D^{'} \leftarrow D^{'} \cup \{\phi_k\}$\\
$k = k+1$
}
{
use $\pi_i(\phi_k)$\\
}
}\label{end2}}
$D_{i} = D_{i-1} \cup D^{'}$ \\
$\pi_{i+1} = \argminA_\pi \, l_{supervised}(\pi, \pi^{*}, D_{i})$ \\
\label{end}
}
\textbf{return} best $\pi_i$ over validation set
\caption{Selective SafeDAgger: \quad \small \textcolor{blue}{Blue} font distinguishes the differences between Selective SafeDAgger and SafeDAgger}
\label{Algorithm1}
\end{algorithm}
Algorithm \ref{Algorithm1} describes the proposed method in detail; it takes the expert policy $\pi^{*}$ as input and returns $\pi_i$ as output. The primary dataset $D_0$ is collected using $\pi^*$ and is then utilized to train a primary policy $\pi_0$ by supervised learning. With $\pi_0$ at hand, $c_i$, the unsafe classes of $D_0$ for the trained policy $\pi_i$, are determined. An observation $\phi_k$ taken from the environment $\phi(s)$ is evaluated by $\pi_i$ to find its class $c_{\phi_k}$. If $c_{\phi_k}$ is an element of $c_i$, $\pi^{*}$ takes over control of the car and $\phi_k$ is appended to $D^{'}$. Otherwise, $\pi_i$ continues to command the car until it encounters an unsafe class. As depicted in lines \ref{begin2}-\ref{end2}, the algorithm continues to append data to $D^{'}$ for $T$ iterations. The appended dataset $D^{'}$ is aggregated into $D_{i-1}$ to create $D_i$, and $\pi_{i+1}$ is trained on $D_i$. This loop is repeated $N$ times, as shown in lines \ref{begin}-\ref{end}. In the end, the algorithm returns the best $\pi_i$ over the validation set.
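The loop of Algorithm \ref{Algorithm1} can be sketched in Python as follows; the \texttt{train}, \texttt{weak\_classes\_of}, \texttt{expert}, and \texttt{env} helpers are assumed interfaces supplied by the caller, not part of the implementation described here:
\begin{verbatim}
def selective_safedagger(expert, train, weak_classes_of, env,
                         D0, N=10, T=320):
    D = list(D0)                           # primary dataset D_0
    policy = train(D)                      # pi_0
    for i in range(N):
        weak = weak_classes_of(policy, D)  # c_i: unsafe classes
        D_new, k = [], 0
        while k < T:
            obs = env.observe()            # phi_k
            action, traj_class = policy(obs)
            if traj_class in weak:
                expert_action = expert(obs)
                env.step(expert_action)    # expert takes over
                D_new.append((obs, expert_action))
                k += 1
            else:
                env.step(action)           # pi_i keeps driving
        D = D + D_new                      # D_i = D_{i-1} U D'
        policy = train(D)                  # pi_{i+1}
    return policy  # in practice, the best pi_i on a validation set
\end{verbatim}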
\section{EXPERIMENTS}
\subsection{System Setup}
\subsubsection{Simulator}
AirSim, used in this work, is an Unreal Engine plugin-based simulator for drones and cars, established by Microsoft to create a platform for AI studies to develop and test new deep learning, computer vision, and reinforcement learning algorithms for autonomous vehicles with photo-realistic graphics in simulations \cite{c10}. It has built-in APIs for interfacing with the Python programming language. Furthermore, the engine editor allows creating custom environments or scenarios.
The road track for the training process of the algorithm is devised to capture all scenarios defined in this work. The geometry of the custom-created training track is shown in Fig.~\ref{fig:track_visualization}, in which all the trajectory classes are illustrated.
\begin{figure}[h]
\centering
\includegraphics[width=0.9\columnwidth,trim={0 1.3cm 0 2.3cm},clip]{track_visualization.png}
\caption{Train set track}
\label{fig:track_visualization}
\end{figure}
The representative power of the training set can be increased by collecting data from unseen observations. For that reason, two additional cameras were added next to the front-facing camera at an angle of $\gamma$ to imitate turning-like maneuvers \cite{c3}. The AirSim APIs provide ground truth labels for the front-facing camera frames, but the ground truth labels for the left and right cameras must be adjusted with a threshold as in Eq. (\ref{eq:3cam_eqn}).
\begin{equation} \label{eq:3cam_eqn}
\begin{bmatrix}
L_{l} \\
L_{r}
\end{bmatrix} =
\begin{bmatrix}
L_{c_{steering}} +\gamma &L_{c_{speed}} - p_{speed} \\
L_{c_{steering}} - \gamma & L_{c_{speed}} - p_{speed}
\end{bmatrix}
\end{equation}where $L_{l}$, $L_{r}$, $L_{c_{steering}}$ and $L_{c_{speed}}$ refer to the ground truth for the left and right cameras and the center-camera steering and speed actions, respectively. In the turning case, the ground truth speed of the vehicle is adjusted by a parameter $p_{speed}$, which is chosen heuristically as $4$ $m/s$.
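A short sketch of this label adjustment follows; the numerical value of $\gamma$ is an assumption of the sketch, since the text fixes only $p_{speed}$:
\begin{verbatim}
GAMMA = 0.25    # hypothetical steering correction for side cameras
P_SPEED = 4.0   # m/s, speed adjustment chosen heuristically in text

def side_camera_labels(steer_center, speed_center):
    # Shift the center-camera ground truth for the left/right
    # cameras and reduce the target speed, as in the equation above.
    left = (steer_center + GAMMA, speed_center - P_SPEED)
    right = (steer_center - GAMMA, speed_center - P_SPEED)
    return left, right
\end{verbatim}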
%\begin{figure}[]
%\centering
%\fbox{\includegraphics[width=0.3\columnwidth]{3Camera.png}}
%\caption{3 camera view with an $\alpha$ angle }
%\label{fig:3Cam}
%\end{figure}
%\noindent where $L_{l}$ and $L_{r}$ refer to the ground truth for the left and right cameras. In addition, ground truth for center camera represented by $L_{c_{steering}}$ and $L_{c_{speed}} $ for steering and speed commands. In turning case, ground truth speed of the vehicle adjusted by a parameter $p_{speed}$ which is chosen as $4$ $m/s$ heuristically.
\subsubsection{Data Preprocessing}
A couple of techniques were utilized in the preprocessing phase. The raw input image was down-sampled to a size of $144 \times 256 \times 3$ (RGB), and a Region of Interest (ROI) of size $59 \times 255$ was defined to cover almost the entire road and ignore the features above the horizon, which reduces the computational cost. Moreover, to improve the convergence rate and robustness of the neural network model, the processed image was normalized to the range [0,1] and augmented by randomly changing the brightness of the image with a scale of 0.4. The normalization was done by dividing all image pixels by 255.
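A sketch of this pipeline is given below; the vertical offset of the ROI crop and the exact form of the brightness jitter are assumptions:
\begin{verbatim}
import numpy as np
import cv2

def preprocess(raw_img, rng=np.random):
    img = cv2.resize(raw_img, (256, 144))     # down-sample to 144x256x3
    roi = img[85:144, 0:255]                  # 59x255 crop below horizon
    roi = roi.astype(np.float32) / 255.0      # normalize to [0, 1]
    roi = roi * (1.0 + rng.uniform(-0.4, 0.4))  # brightness, scale 0.4
    return np.clip(roi, 0.0, 1.0)
\end{verbatim}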
% \begin{figure}[]
% \centering
% %\includegraphics[width=120pt]{ROI.png}
% \includegraphics[width=0.8\columnwidth]{ROI.png}
% \caption{Region of Interest}
% \label{fig:ROI}
% \end{figure}
\subsubsection{Expert Policy }
To automate the data collection part of the algorithm, a rule-based expert policy is defined, as shown in Fig.~\ref{fig:expert}.
\begin{figure}[h]
\centering
\includegraphics[width=0.75\columnwidth]{Expert.png}
\caption{Expert policy}
\label{fig:expert}
\end{figure}
For the steering action, $T_1$ is the tangent line to the road spline at the position of the car, and $P_{1}$ is the point on the road spline at distance $l_{ref}$ along the spline from that position. The tangent line to the road spline at $P_{1}$ is $T_2$. The angle $\alpha$ between $T_1$ and $T_2$ is the expert steering action, as depicted in Eq. (\ref{eq:SteeringCommand}).\begin{equation}
\label{eq:SteeringCommand}
a_{steering}= \alpha = \arccos{\left(\frac{ T_1\cdot T_2}{\left\lVert T_1\right\rVert \left\lVert T_2\right\rVert}\right)}
\end{equation}
For the speed action, $P_2$ is the point on the road spline at distance $l_{P_2}$ from the position of the car along the road spline, as depicted in Eq. (\ref{eq:RefDist}). \begin{equation}
\label{eq:RefDist}
l_{P_2} = l_{ref} V_{current} k_{steering}
\end{equation}
\noindent where $V_{current}$ is the current speed and $k_{steering}$ is a fine-tuned constant. The tangent line to the road spline at $P_2$ is $T_3$.
\begin{figure*}
\centering
\includegraphics[width=0.8\linewidth,trim={0 0.52cm 0 0.5cm},clip]{multiClass_convergance.png}
%\includegraphics[width=\columnwidth]{multiClass_convergance.png}
\caption{Convergence rate of the proposed model, showing its improvement as the number of dataset aggregation iterations increases.}
\label{fig:3-iteration_convergence}
\vspace{-4mm}
\end{figure*}
The expert speed action is defined by Eq. (\ref{eq:SpeedCommand}). \begin{equation}
\label{eq:SpeedCommand}
a_{speed} = V_{cruise}-\beta k_{speed}
\end{equation}
\noindent where $V_{cruise}$ is a pre-defined cruise speed, $k_{speed}$ is a fine-tuned gain, and $\beta$ is the angle between $T_1$ and $T_3$.
For our implementation, the parameters are chosen as $l_{ref} = 1$ m, $k_{steering} = 5 $, $V_{cruise}=13.8$ m/s and $k_{speed}=10$.
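Under these definitions, the expert policy reduces to a few lines; the tangent vectors are assumed to be supplied by the simulator's road-spline query, and unit handling is an assumption of the sketch:
\begin{verbatim}
import numpy as np

L_REF, K_STEERING = 1.0, 5.0     # m, lookahead gain
V_CRUISE, K_SPEED = 13.8, 10.0   # m/s, speed gain

def angle_between(t1, t2):
    # Angle between two tangent vectors (steering equation above).
    c = np.dot(t1, t2) / (np.linalg.norm(t1) * np.linalg.norm(t2))
    return np.arccos(np.clip(c, -1.0, 1.0))

def lookahead_distance(v_current):
    # l_P2 = l_ref * V_current * k_steering: where T3 is sampled.
    return L_REF * v_current * K_STEERING

def expert_action(T1, T2, T3):
    a_steering = angle_between(T1, T2)     # alpha
    beta = angle_between(T1, T3)
    a_speed = V_CRUISE - beta * K_SPEED    # speed equation above
    return a_steering, a_speed
\end{verbatim}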
\subsection{Training}
For the training of the primary policy $\pi_0$, the dataset $D_0$, which contains 2800 images, was collected using the expert policy $\pi^*$. The Nesterov Adam optimizer (Nadam) was used for training the network, with an initial learning rate of $10^{-5}$ and a momentum of 0.99. Training continued for ten epochs with a batch size of 32.
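With the hypothetical Keras model sketched in the previous section, this training setup would correspond to:
\begin{verbatim}
# images, speed_histories and the three label arrays are assumed to
# come from the expert-driven data collection (dataset D_0).
model.fit([images, speed_histories],
          {"steering": y_steer, "speed": y_speed,
           "trajectory_class": y_class},
          epochs=10, batch_size=32)
\end{verbatim}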
The trained primary policy $\pi_0$ is tested on the pre-collected dataset to classify trajectories and calculate the $l^2$-norm of each sample in the dataset. The weakness of the network over trajectory segments is quantified by a coefficient of weakness, defined as in Eq. (\ref{eq:class_weak}). \begin{equation}
\label{eq:class_weak}
c_{i} = \dfrac{N_{L2_i}}{N_{i}} \times \mu_{L2,i}
\end{equation}
\noindent where $\mu$ and $\sigma$ are the mean and standard deviation of the $l^2$-norm of class$_i$, $N_{L2_i}$ is the number of samples in class$_i$ whose $l^2$-norm falls in the region one $\sigma$ away from the mean $\mu$, and $N_{i}$ is the total number of samples in class$_i$.
Once the weakness coefficients are calculated, the trajectory classes are sorted according to their weakness coefficients, and the two most dominant unsafe classes are chosen for data aggregation, as shown in Table \ref{table:coeff}. Additionally, classes with a mean $l^2$-norm lower than 1 are selected as allowable classes.
As depicted in Table \ref{table:coeff}, the weakness coefficients for the classes $LS$ and $HS$ are quite low, and these classes are never chosen as weak classes. The initial dataset for the training is biased toward the $LS$ and $HS$ classes, and the $l^2$-norms in those classes are low, which leads to low weakness coefficients. Moreover, the training track does not contain many samples from class $LR$, so the weakness coefficient for class $LR$ is also low.
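A sketch of this selection step follows; reading ``one $\sigma$ away from the mean'' as the upper tail of the norm distribution is an assumption of the sketch:
\begin{verbatim}
import numpy as np

def weakness_coefficients(norms_by_class):
    # norms_by_class: dict class_id -> array of per-sample l2-norms.
    coeffs = {}
    for c, norms in norms_by_class.items():
        mu, sigma = norms.mean(), norms.std()
        n_l2 = np.sum(norms > mu + sigma)      # N_{L2_i} (tail assumed)
        coeffs[c] = (n_l2 / len(norms)) * mu   # coefficient of weakness
    return coeffs

def pick_weak_classes(coeffs, k=2):
    # The two most dominant unsafe classes are used for aggregation.
    return sorted(coeffs, key=coeffs.get, reverse=True)[:k]
\end{verbatim}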
\begin{table}[h]
{\setlength{\tabcolsep}{7pt}
\caption{Coefficient of weakness for each class}
\begin{center}
\vspace{-3mm}
\begin{tabular}{ccccccc}
\hline\hline
\# Iter. & $LL$ & $HL$ & $LR$ & $HR$ & $LS$ & $HS$ \\ \hline
1 & 0.004 & \textbf{0.321} & 0.019 & \textbf{0.694} & 0.002 & 0.010 \\
2 & \textbf{0.505} & 0.122 & 0.037 & \textbf{0.278} & 0.001 & 0.023 \\
3 & \textbf{0.635} & 0.264 & 0.028 & \textbf{0.607} & 0.001 & 0.062 \\
4 & \textbf{0.751} & 0.515 & 0.046 & \textbf{0.646} & 0.001 & 0.010 \\
5 & 0.018 & \textbf{0.678} & 0.034 & \textbf{0.755} & 0.001 & 0.010 \\
6 & 0.009 & \textbf{0.752} & 0.039 & \textbf{0.849} & 0.000 & 0.006 \\
7 & 0.717 & \textbf{0.790} & 0.038 & \textbf{0.780} & 0.001 & 0.004 \\
8 & 0.028 & \textbf{0.787} & 0.017 & \textbf{0.794} & 0.001 & 0.006 \\
9 & \textbf{0.670} & 0.634 & 0.011 & \textbf{0.713} & 0.001 & 0.005 \\
10& 0.012 & \textbf{0.768} & 0.020 & \textbf{0.809} & 0.001 & 0.003 \\
\hline
\end{tabular}
\end{center}
\label{table:coeff}}
\vspace{-3mm}
\end{table}
After determination of the weak and allowable classes, the data aggregation phase begins. In this phase, policy $\pi_i$ drives the car to collect 10 batches of data in the dominant classes. If the policy encounters a dominant class, the expert policy takes control of the vehicle, and the samples taken during that time are labeled and reserved for aggregation. If policy $\pi_i$ encounters an allowable class, which is unsafe, it continues to drive the car. For all other unsafe classes, the expert policy takes control of the vehicle, with a query limit of 10 batches. When the number of queries reaches the limit, data aggregation stops and training starts with the newly aggregated dataset $D_i$.
After the training, $\pi_i$ becomes $\pi_{i+1}$, and the determination of the dominant weak classes on the pre-collected data is repeated to collect relevant data. This process is repeated for 10 iterations. As shown in Fig.~\ref{fig:3-iteration_convergence}, in dataset aggregation iteration 1, a significant fraction of the dataset is unsafe, and as the algorithm proceeds to recover from the most problematic cases, the model error converges. The progress of this process can be seen from iteration 1 to 10.
\section{RESULTS}
\begin{figure}
\centering
\vspace{-2mm}
\begin{subfigure}[b]{0.5\textwidth}
\centering
\includegraphics[width=0.80\linewidth,trim={0.2cm 0 1.2cm 1.2cm},clip]{Norm.png}
\caption{}
\label{fig:Norm}
\end{subfigure}
\begin{subfigure}[b]{0.5\textwidth}
\centering
\includegraphics[width=0.80\linewidth,trim={0.2cm 0 1.2cm 1cm},clip]{results1200.png}
\caption{}
\label{fig:Norm2}
\end{subfigure}
\caption{(a) Performance of the Selective SafeDAgger algorithm for all classes at each aggregation iteration. (b) $l^2$-norm between predictions and ground truth over 10000 samples at each iteration.}
\end{figure}
% \vspace{-3mm}
In Fig.~\ref{fig:Norm}, we present the performance of Selective SafeDAgger using the $l^2$-norm metric in each class during the training process. In the first iteration, $HR$ and $HL$ are chosen as weak classes, and the data for the new dataset come from those classes by querying the expert policy. In the second iteration, the $l^2$-norms drop for all classes when the aggregated dataset is used. Notice that the performance of the policy for the other classes also increases without querying the expert policy for those classes, which is not the case for SafeDAgger. Sequential decision making is the main idea behind this behavior. In SafeDAgger, when the policy shifts from nominal conditions, the expert policy is called and a new dataset is collected until the safety criterion is met, which leads to unnecessary queries of the expert policy. Selective SafeDAgger, on the other hand, tries to solve the problem from the beginning by finding the problematic classes. Moreover, after the seventh iteration, the norms of all classes drop below the allowable threshold, which means that the resulting dataset covers almost all trajectory classes, as seen in Fig.~\ref{fig:3-iteration_convergence}.
The trained model is tested at each iteration by taking 10000 samples from the environment, and the mean $l^2$-norms are calculated accordingly. Fig.~\ref{fig:Norm2} shows that the Selective SafeDAgger method performs better than the SafeDAgger method in all iterations, even though both methods issue the same number of queries to the expert, as depicted in Table \ref{table:query}.
%\begin{figure}[]
% \centering
% \fbox{\includegraphics[width=0.8\columnwidth,trim={0 0 0 1.2cm},clip]{results1200.png}}
% \caption{$l^2$-Norm of prediction and ground truth over 10000 samples at each iteration.}
% \label{fig:results}
%\end{figure}
\vspace{-2mm}
\begin{table}[h]
{\setlength{\tabcolsep}{5pt}
\caption{Queries to the expert}
\begin{center}
\vspace{-3mm}
\begin{tabular}{cccccccc}
\hline\hline
\multirow{2}{*}{} & \multicolumn{6}{c}{Selective SafeDAgger} & SafeDAgger \\
\cline{2-7} & $LL$ & $HL$ & $LR$ & $HR$ & $LS$ & $HS$ & unsafe \\
\hline
Iteration 1 & 0 & 127 & 38 & 155 & 0 & 0 & 320\\
Iteration 2 & 0 & 44 & 0 & 228 & 0 & 48 & 320\\
Iteration 3 & 19 & 63 & 0 & 238 & 0 & 0 & 320\\
Iteration 4 & 27 & 12 & 0 & 281 & 0 & 0 & 320\\
Iteration 5 & 0 & 165 & 0 & 155 & 0 & 0 & 320\\
Iteration 6 & 31 & 189 & 0 & 100 & 0 & 0 & 320\\
Iteration 7 & 0 & 93 & 0 & 227 & 0 & 0 & 320\\
Iteration 8 & 2 & 162 & 0 & 156 & 2 & 5 & 320\\
Iteration 9 & 83 & 0 & 0 & 237 & 0 & 0 & 320\\
Iteration 10 & 0 & 205 & 0 & 115 & 0 & 0 & 320\\
\cline{2-7}
Total & \multicolumn{6}{c}{3200} & 3200\\
\hline
\end{tabular}
\end{center}
\label{table:query}}
\vspace{-6mm}
\end{table}
\begin{figure}[h]
\centering
\includegraphics[width=0.9\columnwidth]{3tracks.png}
\caption{Geometry of test tracks.}
\label{fig:3test}
% \vspace{-4mm}
\end{figure}
Three unseen test tracks were devised to evaluate the generalization performance of the proposed method; their layouts are illustrated in Fig.~\ref{fig:3test}. The generalization performance of Selective SafeDAgger is depicted in Table \ref{table}, which shows its superiority over the SafeDAgger method. The selectivity of the proposed algorithm identifies the unsafe cases that dominate all other classes, which results in faster convergence of the model error compared to other dataset aggregation methods.
\begin{table}[H]
{\setlength{\tabcolsep}{10pt}
\caption{Mean $l^2$-Norm on Unseen Test Track}
\begin{center}
\vspace{-3mm}
\begin{tabular}{ccc}
\hline\hline
& Selective SafeDAgger & SafeDAgger \\
\hline
1. Test Track & \textbf{0.4794} & 0.5518\\
2. Test Track & \textbf{0.3295} & 0.4986\\
3. Test Track & \textbf{0.3254} & 0.3632\\
\hline
\end{tabular}
\end{center}
\label{table}}
\vspace{-2mm}
\end{table}
\section{CONCLUSIONS}
In this work, we implemented the Selective SafeDAgger algorithm, which is sample-efficient in its selection of data for aggregation. The proposed algorithm evaluates the performance of the trained policy, determines the weakness of the policy over different trajectory classes, and recovers the policy on those specific trajectory classes. Our method outperforms the SafeDAgger algorithm in terms of sample efficiency and convergence rate. Next, we aim to cluster the trajectories with unsupervised neural network techniques to obtain a better representation of the road trajectories.
\addtolength{\textheight}{-0cm} % This command serves to balance the column lengths
% on the last page of the document manually. It shortens
% the text height of the last page by a suitable amount.
% This command does not take effect until the next page
% so it should come on the page before the last. Make
% sure that you do not shorten the text height too much.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%\section*{APPENDIX}
%Appendixes should appear before the acknowledgment.
\section*{ACKNOWLEDGMENT}
This work is supported by the \textbf{Scientific and Technological Research Council of Turkey} (Turkish: \textbf{T\"{U}B\.{I}TAK}) under the grant agreement \textbf{TEYDEB 1515 / 5169901}.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\begin{thebibliography}{99}
\bibitem{leonard2008} J. Leonard, et al., "A Perception-Driven Autonomous Urban Vehicle," Journal of Field Robotics, 25(10):727-748, 2008.
\bibitem{c1} Z. Chen and X. Huang, "End-to-end learning for lane keeping of self-driving cars," 2017 IEEE Intelligent Vehicles Symposium (IV), Los Angeles, CA, 2017, pp. 1856-1860.
\bibitem{c2} D. Pomerleau, ALVINN: An Autonomous Land Vehicle in a Neural Network. NIPS (1988).
\bibitem{c3} M. Bojarski, et al., "End to end learning for self-driving cars", CoRR, vol. abs/1604.07316, 2016.
\bibitem{c4} H. Xu, Y. Gao, F. Yu, and T. Darrell, "End-to-End Learning of Driving Models from Large-Scale Video Datasets," 2017 IEEE Conference on Computer Vision and Pattern Recognition (CVPR), Honolulu, HI, 2017, pp. 3530-3538.
\bibitem{c5} J. Kim and J. Canny, "Interpretable Learning for Self-Driving Cars by Visualizing Causal Attention," 2017 IEEE International Conference on Computer Vision (ICCV), Venice, 2017, pp. 2961-2969.
\bibitem{c6} Z. Yang, Y. Zhang, J. Yu, Junjie Cai, and Jiebo Luo.
End-to-end multi-modal multi-task vehicle control for self-driving cars with visual perception. arXiv preprint arXiv:1801.06734, 2018.
\bibitem{c7} S. Ross, G. Gordon, and A. Bagnell. A reduction of imitation learning and structured prediction to no-regret online learning. Journal of Machine Learning Research, 15:627--635, 2011.
\bibitem{c8} J. Zhang and K. Cho. Query-efficient imitation learning for end-to-end simulated driving. In AAAI, 2017.
\bibitem{c9} K. Menda, et al., EnsembleDAgger: A Bayesian Approach to Safe Imitation Learning. CoRR abs/1807.08364 (2018): n. pag.
\bibitem{c10} S. Shah, D. Dey, C. Lovett, A. Kapoor, "Airsim: High-fidelity visual and physical simulation for autonomous vehicles", Field and Service Robotics, 2017.
%\bibitem{c11} M. Teti, E. Barenholtz, S. Martin, \& W. Hahn, (2018). A Systematic Comparison of Deep Learning Architectures in an Autonomous Vehicle. arXiv preprint arXiv:1803.09386.
%\bibitem{c12} S. Du, H. Guo, and A. Simpson, Self-Driving Car Steering Angle Prediction Based on Image Recognition. (2017).
%\bibitem{c13} F. Codevilla, M. M\"uller, A. L\'opez, V. Koltun, and A. Dosovitskiy. (2017). End-to-end driving via conditional imitation learning. [Online]. Available: https://arxiv.org/abs/1710.02410.
\end{thebibliography}
\end{document}
\documentclass{article}
% if you need to pass options to natbib, use, e.g.:
\PassOptionsToPackage{numbers, compress}{natbib}
% before loading nips_2018
% ready for submission
%\usepackage{nips_2018}
% to compile a preprint version, e.g., for submission to arXiv, add
% add the [preprint] option:
%\usepackage[preprint]{nips_2018}
\usepackage{graphicx}
% to compile a camera-ready version, add the [final] option, e.g.:
\usepackage[final]{nips_2018}
% to avoid loading the natbib package, add option nonatbib:
%\usepackage[nonatbib]{nips_2018}
\usepackage[utf8]{inputenc} % allow utf-8 input
\usepackage[T1]{fontenc} % use 8-bit T1 fonts
\usepackage{hyperref} % hyperlinks
\usepackage{url} % simple URL typesetting
\usepackage{booktabs} % professional-quality tables
\usepackage{amsfonts} % blackboard math symbols
\usepackage{nicefrac} % compact symbols for 1/2, etc.
\usepackage{microtype} % microtypography
\setcitestyle{square}
\title{
Two-stream Convolutional Networks for End-to-end Learning of Self-driving Cars
}
% The \author macro works with any number of authors. There are two
% commands used to separate the names and addresses of multiple
% authors: \And and \AND.
%
% Using \And between authors leaves it to LaTeX to determine where to
% break the lines. Using \AND forces a line break at that point. So,
% if LaTeX puts 3 of 4 authors names on the first line, and the last
% on the second line, try using \AND instead of \And before the third
% author name.
\author{
Nelson Fernandez\textsuperscript{a,\dag}\\
\textsuperscript{a} Renault Automotive\\
%Technocentre Guyancourt, 78280, France \\
\texttt{[email protected]} \\
\textsuperscript{\dag} Previously at Axionable Labs, Paris France\\
%\texttt{[email protected]} \\
%% examples of more authors
%% \And
%% Coauthor \\
%% Affiliation \\
%% Address \\
%% \texttt{email} \\
%% \AND
%% Coauthor \\
%% Affiliation \\
%% Address \\
%% \texttt{email} \\
%% \And
%% Coauthor \\
%% Affiliation \\
%% Address \\
%% \texttt{email} \\
%% \And
%% Coauthor \\
%% Affiliation \\
%% Address \\
%% \texttt{email} \\
}
\newcommand\blfootnote[1]{%
\begingroup
\renewcommand\thefootnote{}\footnote{#1}%
\addtocounter{footnote}{-1}%
\endgroup