forked from academyofdata/clusterdock
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget2hdfs.sh
28 lines (25 loc) · 1.46 KB
/
get2hdfs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
#!/usr/bin/env bash
# get2hdfs.sh — download the sample MovieLens-style csv files and load them
# into HDFS under /input/<dataset>/, creating /data and /metadata as
# world-writable scratch directories along the way.
#
# Requires: wget, an hdfs client on PATH, and a running HDFS namenode.
set -euo pipefail

#for some reason not always the name resolution is good in these containers, so add the google DNS before running this, just in case
#echo "nameserver 8.8.8.8" >> /etc/resolv.conf

# All hdfs commands below run as the superuser so we can create and chmod
# top-level directories; export once instead of prefixing every command.
export HADOOP_USER_NAME=hdfs

# Temporary directory for the downloaded csv files.
# -p makes the script safe to re-run if the directory already exists.
data_dir=/tmp/data
mkdir -p -- "$data_dir"
cd -- "$data_dir" || exit 1   # explicit check: downloading into the wrong cwd would be silent

# Run the remote script that downloads all the csv files into the cwd.
# NOTE(review): piping a remote script into bash is a trust decision —
# kept as in the original, but set -o pipefail now aborts on a failed fetch.
wget -qO- https://raw.githubusercontent.com/academyofdata/clusterdock/master/getrawdata.sh | bash -s

# Set up the HDFS directory layout. -p creates parents and tolerates
# pre-existing directories, so re-runs do not fail here.
hdfs dfs -mkdir -p /data /metadata
hdfs dfs -chmod a+w /data /metadata
hdfs dfs -mkdir -p /input/movies /input/users /input/ratings /input/ratings-all

# Load each csv into its matching HDFS directory.
# -f overwrites an existing copy so the script is idempotent.
for dataset in movies users ratings ratings-all; do
  hdfs dfs -put -f -- "$data_dir/$dataset.csv" "/input/$dataset/$dataset.csv"
done

# Give everyone unrestricted write access to the loaded data.
hdfs dfs -chmod -R a+w /input