diff --git a/CHANGES.md b/CHANGES.md
index 1e6c8f88..e6910ff7 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -18,9 +18,11 @@
 
 * [#311]: Changed how Flintrock manages its own security groups to reduce the likelihood of hitting any limits on the number of rules per security group.
 * [#329]: Dropped support for Python 3.5 and added automated testing for Python 3.8 and 3.9.
+* [#334]: Flintrock now ensures that `python3` is available on launched clusters and sets that as the default Python that PySpark will use.
 
 [#311]: https://github.com/nchammas/flintrock/pull/311
 [#329]: https://github.com/nchammas/flintrock/pull/329
+[#334]: https://github.com/nchammas/flintrock/pull/334
 
 ## [1.0.0] - 2020-01-11
 
diff --git a/flintrock/core.py b/flintrock/core.py
index 2ce65d5d..d17f11aa 100644
--- a/flintrock/core.py
+++ b/flintrock/core.py
@@ -666,6 +666,17 @@ def setup_node(
     cluster.storage_dirs.root = storage_dirs['root']
     cluster.storage_dirs.ephemeral = storage_dirs['ephemeral']
 
+    # TODO: Move Python and Java setup to new service under services.py.
+    # New service to cover Python/Scala/Java: LanguageRuntimes (name?)
+    ssh_check_output(
+        client=ssh_client,
+        command=(
+            """
+            set -e
+            sudo yum install -y python3
+            """
+        )
+    )
     ensure_java(ssh_client, java_version)
 
     for service in services:
diff --git a/flintrock/templates/spark/conf/spark-env.sh b/flintrock/templates/spark/conf/spark-env.sh
index e63fb10a..46cb4455 100644
--- a/flintrock/templates/spark/conf/spark-env.sh
+++ b/flintrock/templates/spark/conf/spark-env.sh
@@ -20,3 +20,6 @@ export SPARK_PUBLIC_DNS="$(curl --silent http://169.254.169.254/latest/meta-data
 # Need to find a way to do this, since "sudo ulimit..." doesn't fly.
 # Probably need to edit some Linux config file.
 # ulimit -n 1000000
+
+# Should this be made part of a Python service somehow?
+export PYSPARK_PYTHON="python3"