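# Airflow 2.10.1 image extended with OpenJDK 15, Apache Hadoop 3.3.6, and
# Apache Spark 3.5.2.
# Example build command (tag name is illustrative):
#   docker build -t airflow-spark:2.10.1 .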
FROM apache/airflow:2.10.1
# Install Python dependencies as the default airflow user, as the official
# apache/airflow image expects (pip installs should not run as root)
COPY requirements.txt .
RUN pip install -r requirements.txt
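# Switch to root for the system-level installs below (Java, Hadoop, Spark)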
USER root
# Install OpenJDK 15 (required by both Hadoop and Spark)
WORKDIR /opt/java
RUN curl -fSL https://download.java.net/java/GA/jdk15.0.2/0d1cfde4252546c6931946de8db48ee2/7/GPL/openjdk-15.0.2_linux-x64_bin.tar.gz -o openjdk-15.0.2_linux-x64_bin.tar.gz \
&& tar -xzf openjdk-15.0.2_linux-x64_bin.tar.gz \
&& rm openjdk-15.0.2_linux-x64_bin.tar.gz
ENV JAVA_HOME=/opt/java/jdk-15.0.2
ENV PATH="$PATH:$JAVA_HOME/bin"
# Install Apache Hadoop
ARG HADOOP_VERSION=3.3.6
ENV HADOOP_HOME=/opt/hadoop
ENV HADOOP_CONF_DIR=/etc/hadoop
ENV MULTIHOMED_NETWORK=1
ENV USER=root
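# Download Hadoop, verify the release signature, unpack to /opt/hadoop, and
# symlink the bundled config directory to /etc/hadoop (HADOOP_CONF_DIR)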
RUN HADOOP_URL="https://archive.apache.org/dist/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz" \
&& curl -fSL 'https://dist.apache.org/repos/dist/release/hadoop/common/KEYS' | gpg --import - \
&& curl -fSL "$HADOOP_URL" -o /tmp/hadoop.tar.gz \
&& curl -fSL "$HADOOP_URL.asc" -o /tmp/hadoop.tar.gz.asc \
&& gpg --verify /tmp/hadoop.tar.gz.asc /tmp/hadoop.tar.gz \
&& mkdir -p "${HADOOP_HOME}" \
&& tar -xf /tmp/hadoop.tar.gz -C "${HADOOP_HOME}" --strip-components=1 \
&& rm /tmp/hadoop.tar.gz /tmp/hadoop.tar.gz.asc \
&& ln -s "${HADOOP_HOME}/etc/hadoop" /etc/hadoop \
&& mkdir "${HADOOP_HOME}/logs" \
&& mkdir /hadoop-data
ENV PATH="$HADOOP_HOME/bin:$PATH"
# Install Apache Spark
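# Create the spark system user/group; UID 185 matches the official apache/spark images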
ARG spark_uid=185
RUN groupadd --system --gid=${spark_uid} spark && \
useradd --system --uid=${spark_uid} --gid=spark spark
RUN set -ex; \
apt-get update; \
apt-get install -y gnupg2 wget bash tini libc6 libpam-modules krb5-user libnss3 procps net-tools gosu libnss-wrapper; \
mkdir -p /opt/spark/python; \
mkdir -p /opt/spark/examples; \
chmod g+w /opt/spark; \
touch /opt/spark/RELEASE; \
chown -R spark:spark /opt/spark; \
echo "auth required pam_wheel.so use_uid" >> /etc/pam.d/su; \
rm -rf /var/lib/apt/lists/*
# Spark release signing keys: https://downloads.apache.org/spark/KEYS
ENV SPARK_TGZ_URL=https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz \
SPARK_TGZ_ASC_URL=https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz.asc \
GPG_KEY=D76E23B9F11B5BF6864613C4F7051850A0AF904D
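# Download Spark 3.5.2, verify the signature against the release key above, and
# install the standard /opt/spark layout (jars, bin, sbin, examples, pyspark, R)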
RUN set -ex; \
export SPARK_TMP="$(mktemp -d)"; \
cd $SPARK_TMP; \
wget -nv -O spark.tgz "$SPARK_TGZ_URL"; \
wget -nv -O spark.tgz.asc "$SPARK_TGZ_ASC_URL"; \
export GNUPGHOME="$(mktemp -d)"; \
gpg --batch --keyserver hkps://keys.openpgp.org --recv-key "$GPG_KEY" || \
gpg --batch --keyserver hkps://keyserver.ubuntu.com --recv-keys "$GPG_KEY"; \
gpg --batch --verify spark.tgz.asc spark.tgz; \
gpgconf --kill all; \
rm -rf "$GNUPGHOME" spark.tgz.asc; \
\
tar -xf spark.tgz --strip-components=1; \
chown -R spark:spark .; \
mv jars /opt/spark/; \
mv bin /opt/spark/; \
mv sbin /opt/spark/; \
mv kubernetes/dockerfiles/spark/decom.sh /opt/; \
mv examples /opt/spark/; \
mv kubernetes/tests /opt/spark/; \
mv data /opt/spark/; \
mv python/pyspark /opt/spark/python/pyspark/; \
mv python/lib /opt/spark/python/lib/; \
mv R /opt/spark/; \
chmod a+x /opt/decom.sh; \
cd ..; \
rm -rf "$SPARK_TMP";
ENV SPARK_HOME=/opt/spark
WORKDIR /opt/spark/jars
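# Spark 3.5.2 bundles hadoop-client 3.3.4; replace those jars with 3.3.6 to
# match the Hadoop version installed above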
RUN rm hadoop-client-api-3.3.4.jar \
&& rm hadoop-client-runtime-3.3.4.jar \
&& curl -fSL https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-api/3.3.6/hadoop-client-api-3.3.6.jar -o hadoop-client-api-3.3.6.jar \
&& curl -fSL https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-client-runtime/3.3.6/hadoop-client-runtime-3.3.6.jar -o hadoop-client-runtime-3.3.6.jar
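# Return to the Airflow working directory and drop back to the airflow user (UID 50000)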
WORKDIR /opt/airflow
USER 50000