% mybib.bib
@book{nielsen_neural_2015,
publisher = {Determination Press},
title = {Neural {Networks} and {Deep} {Learning}},
url = {http://neuralnetworksanddeeplearning.com},
language = {en},
urldate = {2020-12-01},
author = {Nielsen, Michael A.},
year = {2015},
}
@misc{noauthor_jupyter_nodate,
title = {Jupyter},
url = {https://www.jupyter.org},
abstract = {The Jupyter Notebook is a web-based interactive computing platform. The notebook combines live code, equations, narrative text, visualizations, interactive dashboards and other media.},
urldate = {2020-11-29},
}
@misc{noauthor_ipython_nodate,
title = {{IPython}},
url = {https://ipython.org/},
urldate = {2020-11-29},
}
@misc{noauthor_python_nodate,
title = {Python},
url = {https://www.python.org/},
abstract = {The official home of the Python Programming Language},
language = {en},
urldate = {2020-11-29},
}
@misc{noauthor_numpy_nodate,
title = {{NumPy}},
url = {https://numpy.org/},
abstract = {Why NumPy? Powerful n-dimensional arrays. Numerical computing tools. Interoperable. Performant. Open source.},
urldate = {2020-11-29},
}
@misc{noauthor_tensorflow_nodate,
title = {{TensorFlow}},
url = {https://www.tensorflow.org/},
abstract = {An end-to-end open source machine learning platform for everyone. Discover TensorFlow's flexible ecosystem of tools, libraries and community resources.},
language = {en},
urldate = {2020-11-29},
}
@book{aggarwal_neural_2018,
address = {Cham},
title = {Neural networks and deep learning: a textbook},
isbn = {978-3-319-94462-3 978-3-319-94463-0},
shorttitle = {Neural networks and deep learning},
language = {eng},
publisher = {Springer},
author = {Aggarwal, Charu C.},
year = {2018},
}
@article{schulman_proximal_2017,
title = {Proximal {Policy} {Optimization} {Algorithms}},
url = {http://arxiv.org/abs/1707.06347},
abstract = {We propose a new family of policy gradient methods for reinforcement learning, which alternate between sampling data through interaction with the environment, and optimizing a "surrogate" objective function using stochastic gradient ascent. Whereas standard policy gradient methods perform one gradient update per data sample, we propose a novel objective function that enables multiple epochs of minibatch updates. The new methods, which we call proximal policy optimization (PPO), have some of the benefits of trust region policy optimization (TRPO), but they are much simpler to implement, more general, and have better sample complexity (empirically). Our experiments test PPO on a collection of benchmark tasks, including simulated robotic locomotion and Atari game playing, and we show that PPO outperforms other online policy gradient methods, and overall strikes a favorable balance between sample complexity, simplicity, and wall-time.},
urldate = {2020-06-15},
journal = {arXiv:1707.06347 [cs]},
author = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
month = aug,
year = {2017},
keywords = {Computer Science - Machine Learning},
}
@misc{noauthor_going_nodate,
title = {Going {Deeper} {Into} {Reinforcement} {Learning}: {Fundamentals} of {Policy} {Gradients}},
url = {https://danieltakeshi.github.io/2017/03/28/going-deeper-into-reinforcement-learning-fundamentals-of-policy-gradients/},
abstract = {As I stated in my last blog post, I am feverishly trying to read more research papers. One category of papers that seems to be coming up a lot recently are tho...},
urldate = {2020-05-30},
}
@misc{grigsby_overview_2018,
title = {An overview of {Improvements} to deep {Q} learning algorithms},
shorttitle = {Advanced {DQNs}},
url = {https://towardsdatascience.com/advanced-dqns-playing-pac-man-with-deep-reinforcement-learning-3ffbd99e0814},
abstract = {In 2013, DeepMind published the first version of its Deep Q-Network (DQN), a computer program capable of human-level performance on a…},
language = {en},
urldate = {2020-05-30},
author = {Grigsby, Jake},
month = oct,
year = {2018},
howpublished = {Medium},
}
@misc{noauthor_different_2018,
title = {Different {Policy} {Gradient} {Algorithms}},
url = {https://lilianweng.github.io/2018/04/08/policy-gradient-algorithms.html},
abstract = {Abstract: In this post, we are going to look deep into policy gradient, why it works, and many new policy gradient algorithms proposed in recent years: vanilla policy gradient, actor-critic, off-policy actor-critic, A3C, A2C, DPG, DDPG, D4PG, MADDPG, TRPO, PPO, ACER, ACTKR, SAC, TD3 \& SVPG.},
language = {en},
urldate = {2020-05-30},
month = apr,
year = {2018},
howpublished = {Lil'Log},
}
@inproceedings{glorot_understanding_2010,
title = {Understanding the difficulty of training deep feedforward neural networks},
url = {http://proceedings.mlr.press/v9/glorot10a.html},
abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental resul...},
language = {en},
urldate = {2020-05-30},
booktitle = {Proceedings of the {Thirteenth} {International} {Conference} on {Artificial} {Intelligence} and {Statistics}},
author = {Glorot, Xavier and Bengio, Yoshua},
month = mar,
year = {2010},
pages = {249--256},
}
@misc{dellinger_weight_2019,
title = {Weight {Initialization} in {Neural} {Networks}: {A} {Journey} {From} the {Basics} to {Kaiming}},
shorttitle = {Weight {Initialization} in {Neural} {Networks}},
url = {https://towardsdatascience.com/weight-initialization-in-neural-networks-a-journey-from-the-basics-to-kaiming-954fb9b47c79},
abstract = {Exploring the evolution of initializing layer weights in neural networks: from old-school to Xavier, and arriving finally at Kaiming init.},
language = {en},
urldate = {2020-05-30},
author = {Dellinger, James},
month = apr,
year = {2019},
howpublished = {Medium},
}
@misc{noauthor_unity-technologiesml-agents_2020,
title = {Unity-{Technologies}/ml-agents},
copyright = {Apache-2.0},
url = {https://github.com/Unity-Technologies/ml-agents},
abstract = {Unity Machine Learning Agents Toolkit. Contribute to Unity-Technologies/ml-agents development by creating an account on GitHub.},
urldate = {2020-05-30},
publisher = {Unity Technologies},
month = may,
year = {2020},
keywords = {deep-learning, deep-reinforcement-learning, neural-networks, reinforcement-learning, unity, unity3d},
}
@book{goodfellow_deep_2016,
address = {Cambridge, Massachusetts},
series = {Adaptive computation and machine learning},
title = {Deep learning},
isbn = {978-0-262-03561-3},
publisher = {The MIT Press},
author = {Goodfellow, Ian and Bengio, Yoshua and Courville, Aaron},
year = {2016},
keywords = {Machine learning},
}
@book{sutton_reinforcement_2018,
address = {Cambridge, Massachusetts},
edition = {Second edition},
series = {Adaptive computation and machine learning series},
title = {Reinforcement learning: an introduction},
isbn = {978-0-262-03924-6},
shorttitle = {Reinforcement learning},
abstract = {"Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives while interacting with a complex, uncertain environment. In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the field's key ideas and algorithms."–},
publisher = {The MIT Press},
author = {Sutton, Richard S. and Barto, Andrew G.},
year = {2018},
keywords = {Reinforcement learning},
}
@article{wang_dueling_2016,
title = {Dueling {Network} {Architectures} for {Deep} {Reinforcement} {Learning}},
url = {http://arxiv.org/abs/1511.06581},
abstract = {In recent years there have been many successes of using deep representations in reinforcement learning. Still, many of these applications use conventional architectures, such as convolutional networks, LSTMs, or auto-encoders. In this paper, we present a new neural network architecture for model-free reinforcement learning. Our dueling network represents two separate estimators: one for the state value function and one for the state-dependent action advantage function. The main benefit of this factoring is to generalize learning across actions without imposing any change to the underlying reinforcement learning algorithm. Our results show that this architecture leads to better policy evaluation in the presence of many similar-valued actions. Moreover, the dueling architecture enables our RL agent to outperform the state-of-the-art on the Atari 2600 domain.},
urldate = {2020-05-30},
journal = {arXiv:1511.06581 [cs]},
author = {Wang, Ziyu and Schaul, Tom and Hessel, Matteo and van Hasselt, Hado and Lanctot, Marc and de Freitas, Nando},
month = apr,
year = {2016},
keywords = {Computer Science - Machine Learning},
}
@article{fortunato_noisy_2019,
title = {Noisy {Networks} for {Exploration}},
url = {http://arxiv.org/abs/1706.10295},
abstract = {We introduce NoisyNet, a deep reinforcement learning agent with parametric noise added to its weights, and show that the induced stochasticity of the agent's policy can be used to aid efficient exploration. The parameters of the noise are learned with gradient descent along with the remaining network weights. NoisyNet is straightforward to implement and adds little computational overhead. We find that replacing the conventional exploration heuristics for A3C, DQN and dueling agents (entropy reward and \${\textbackslash}epsilon\$-greedy respectively) with NoisyNet yields substantially higher scores for a wide range of Atari games, in some cases advancing the agent from sub to super-human performance.},
urldate = {2020-05-26},
journal = {arXiv:1706.10295 [cs, stat]},
author = {Fortunato, Meire and Azar, Mohammad Gheshlaghi and Piot, Bilal and Menick, Jacob and Osband, Ian and Graves, Alex and Mnih, Vlad and Munos, Remi and Hassabis, Demis and Pietquin, Olivier and Blundell, Charles and Legg, Shane},
month = jul,
year = {2019},
keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
}
@article{arulkumaran_brief_2017,
title = {A {Brief} {Survey} of {Deep} {Reinforcement} {Learning}},
volume = {34},
issn = {1053-5888},
url = {http://arxiv.org/abs/1708.05866},
doi = {10.1109/MSP.2017.2743240},
abstract = {Deep reinforcement learning is poised to revolutionise the field of AI and represents a step towards building autonomous systems with a higher level understanding of the visual world. Currently, deep learning is enabling reinforcement learning to scale to problems that were previously intractable, such as learning to play video games directly from pixels. Deep reinforcement learning algorithms are also applied to robotics, allowing control policies for robots to be learned directly from camera inputs in the real world. In this survey, we begin with an introduction to the general field of reinforcement learning, then progress to the main streams of value-based and policy-based methods. Our survey will cover central algorithms in deep reinforcement learning, including the deep \$Q\$-network, trust region policy optimisation, and asynchronous advantage actor-critic. In parallel, we highlight the unique advantages of deep neural networks, focusing on visual understanding via reinforcement learning. To conclude, we describe several current areas of research within the field.},
number = {6},
urldate = {2020-05-30},
journal = {IEEE Signal Processing Magazine},
author = {Arulkumaran, Kai and Deisenroth, Marc Peter and Brundage, Miles and Bharath, Anil Anthony},
month = nov,
year = {2017},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning},
pages = {26--38},
}
@article{schaul_prioritized_2016,
title = {Prioritized {Experience} {Replay}},
url = {http://arxiv.org/abs/1511.05952},
abstract = {Experience replay lets online reinforcement learning agents remember and reuse experiences from the past. In prior work, experience transitions were uniformly sampled from a replay memory. However, this approach simply replays transitions at the same frequency that they were originally experienced, regardless of their significance. In this paper we develop a framework for prioritizing experience, so as to replay important transitions more frequently, and therefore learn more efficiently. We use prioritized experience replay in Deep Q-Networks (DQN), a reinforcement learning algorithm that achieved human-level performance across many Atari games. DQN with prioritized experience replay achieves a new state-of-the-art, outperforming DQN with uniform replay on 41 out of 49 games.},
urldate = {2020-05-30},
journal = {arXiv:1511.05952 [cs]},
author = {Schaul, Tom and Quan, John and Antonoglou, Ioannis and Silver, David},
month = feb,
year = {2016},
keywords = {Computer Science - Machine Learning},
}
@article{van_hasselt_deep_2015,
title = {Deep {Reinforcement} {Learning} with {Double} {Q}-learning},
url = {http://arxiv.org/abs/1509.06461},
abstract = {The popular Q-learning algorithm is known to overestimate action values under certain conditions. It was not previously known whether, in practice, such overestimations are common, whether they harm performance, and whether they can generally be prevented. In this paper, we answer all these questions affirmatively. In particular, we first show that the recent DQN algorithm, which combines Q-learning with a deep neural network, suffers from substantial overestimations in some games in the Atari 2600 domain. We then show that the idea behind the Double Q-learning algorithm, which was introduced in a tabular setting, can be generalized to work with large-scale function approximation. We propose a specific adaptation to the DQN algorithm and show that the resulting algorithm not only reduces the observed overestimations, as hypothesized, but that this also leads to much better performance on several games.},
urldate = {2020-05-26},
journal = {arXiv:1509.06461 [cs]},
author = {van Hasselt, Hado and Guez, Arthur and Silver, David},
month = dec,
year = {2015},
keywords = {Computer Science - Machine Learning},
}
@article{mnih_human-level_2015,
title = {Human-level control through deep reinforcement learning},
volume = {518},
issn = {0028-0836, 1476-4687},
url = {http://www.nature.com/articles/nature14236},
doi = {10.1038/nature14236},
language = {en},
number = {7540},
urldate = {2020-05-26},
journal = {Nature},
author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Rusu, Andrei A. and Veness, Joel and Bellemare, Marc G. and Graves, Alex and Riedmiller, Martin and Fidjeland, Andreas K. and Ostrovski, Georg and Petersen, Stig and Beattie, Charles and Sadik, Amir and Antonoglou, Ioannis and King, Helen and Kumaran, Dharshan and Wierstra, Daan and Legg, Shane and Hassabis, Demis},
month = feb,
year = {2015},
pages = {529--533},
}
@book{Goodfellow-et-al-2016,
title={Deep Learning},
author={Ian Goodfellow and Yoshua Bengio and Aaron Courville},
publisher={MIT Press},
note={\url{http://www.deeplearningbook.org}},
year={2016}
}
@misc{juliani2020unity,
title={Unity: A General Platform for Intelligent Agents},
author={Arthur Juliani and Vincent-Pierre Berges and Ervin Teng and Andrew Cohen and Jonathan Harper and Chris Elion and Chris Goy and Yuan Gao and Hunter Henry and Marwan Mattar and Danny Lange},
year={2020},
eprint={1809.02627},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@InProceedings{pmlr-v9-glorot10a,
title = {Understanding the difficulty of training deep feedforward neural networks},
author = {Xavier Glorot and Yoshua Bengio},
booktitle = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics},
pages = {249--256},
year = {2010},
editor = {Yee Whye Teh and Mike Titterington},
volume = {9},
series = {Proceedings of Machine Learning Research},
address = {Chia Laguna Resort, Sardinia, Italy},
month = {13--15 May},
publisher = {JMLR Workshop and Conference Proceedings},
pdf = {http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf},
url = {http://proceedings.mlr.press/v9/glorot10a.html},
abstract = {Whereas before 2006 it appears that deep multi-layer neural networks were not successfully trained, since then several algorithms have been shown to successfully train them, with experimental results showing the superiority of deeper vs less deep architectures. All these experimental results were obtained with new initialization or training mechanisms. Our objective here is to understand better why standard gradient descent from random initialization is doing so poorly with deep neural networks, to better understand these recent relative successes and help design better algorithms in the future. We first observe the influence of the non-linear activations functions. We find that the logistic sigmoid activation is unsuited for deep networks with random initialization because of its mean value, which can drive especially the top hidden layer into saturation. Surprisingly, we find that saturated units can move out of saturation by themselves, albeit slowly, and explaining the plateaus sometimes seen when training neural networks. We find that a new non-linearity that saturates less can often be beneficial. Finally, we study how activations and gradients vary across layers and during training, with the idea that training may be more difficult when the singular values of the Jacobian associated with each layer are far from 1. Based on these considerations, we propose a new initialization scheme that brings substantially faster convergence.},
}
@misc{chollet2015keras,
title={Keras},
author={Chollet, Fran\c{c}ois and others},
year={2015},
howpublished={\url{https://keras.io}},
}
@misc{kingma2017adam,
title={Adam: A Method for Stochastic Optimization},
author={Diederik P. Kingma and Jimmy Ba},
year={2017},
eprint={1412.6980},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
@misc{ramachandran2017searching,
title={Searching for Activation Functions},
author={Prajit Ramachandran and Barret Zoph and Quoc V. Le},
year={2017},
eprint={1710.05941},
archivePrefix={arXiv},
primaryClass={cs.NE}
}
@InProceedings{He_2015_ICCV,
author = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
title = {Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification},
booktitle = {Proceedings of the IEEE International Conference on Computer Vision (ICCV)},
month = {December},
year = {2015}
}
@article{binomial_confidence,
author = {Wallis, Sean},
year = {2013},
month = jul,
title = {Binomial Confidence Intervals and Contingency Tests: Mathematical Fundamentals and the Evaluation of Alternative Methods},
volume = {20},
journal = {Journal of Quantitative Linguistics},
doi = {10.1080/09296174.2013.799918}
}