#!/bin/bash
# Run this by typing `bash spark-install.sh`
echo "DOWNLOADING SPARK"
# Specify your shell config file
# Aliases will be appended to this file
SHELL_PROFILE="$HOME/.bashrc"
# Set the install location, $HOME is set by default
SPARK_INSTALL_LOCATION=$HOME
# Specify the URL to download Spark from
SPARK_URL=http://apache.mirrors.tds.net/spark/spark-2.1.0/spark-2.1.0-bin-hadoop2.7.tgz
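# Note: Apache mirrors typically drop older releases; if the mirror above no
# longer serves 2.1.0, try the Apache archive (https://archive.apache.org/dist/spark/)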
# The Spark folder name should be the same as the name of the file being downloaded as specified in the SPARK_URL
SPARK_FOLDER_NAME=spark-2.1.0-bin-hadoop2.7.tgz
# Find the proper md5 hash from the Apache site
SPARK_MD5=50e73f255f9bde50789ad5bd657c7a71
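# For example, the published checksum can usually be fetched alongside the
# release (the archive path below is an assumption based on the usual Apache layout):
#   curl -s https://archive.apache.org/dist/spark/spark-2.1.0/spark-2.1.0-bin-hadoop2.7.tgz.md5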
# Print Disclaimer prior to running script
echo "DISCLAIMER: This is an automated script for installing Spark but you should feel responsible for what you're doing!"
echo "This script will install Spark to your home directory, modify your PATH, and add environment variables to your SHELL config file"
read -r -p "Proceed? [y/N] " response
if [[ ! $response =~ ^([yY][eE][sS]|[yY])$ ]]
then
echo "Aborting..."
exit 1
fi
# Verify that $SHELL_PROFILE is pointing to the proper file
read -r -p "Is $SHELL_PROFILE your shell profile? [y/N] " response
if [[ $response =~ ^([yY][eE][sS]|[yY])$ ]]
then
echo "All relevent aliases will be added to this file; THIS IS IMPORTANT!"
read -r -p "To verify, please type in the name of your shell profile (just the file name): " response
if [[ ! $response == $(basename $SHELL_PROFILE) ]]
then
echo "What you typed doesn't match $(basename $SHELL_PROFILE)!"
echo "Please double check what shell profile you are using and alter spark-install accordingly!"
exit 1
fi
else
echo "Please alter the spark-install.sh script to specify the correct file"
exit 1
fi
# Create scripts folder for storing jupyspark.sh and localsparksubmit.sh
read -r -p "Would you like to create a scripts folder in your Home directory? [y/N] " response
if [[ $response =~ ^([yY][eE][sS]|[yY])$ ]]; then
if [[ ! -d $HOME/scripts ]]
then
mkdir $HOME/scripts
echo "export PATH=\$PATH:$HOME/scripts" >> $SHELL_PROFILE
else
echo "scripts folder already exists! Verify this folder has been added to your PATH"
fi
else
if [[ ! -d $HOME/scripts ]]
then
echo "Installing without installing jupyspark.sh and localsparksubmit.sh"
fi
fi
if [[ -d $HOME/scripts ]]
then
# Create jupyspark.sh script in your scripts folder
read -r -p "Would you like to create the jupyspark.sh script for launching a local jupyter spark server? [y/N] " response
if [[ $response =~ ^([yY][eE][sS]|[yY])$ ]]; then
echo "#!/bin/bash
export PYSPARK_DRIVER_PYTHON=jupyter
export PYSPARK_DRIVER_PYTHON_OPTS=\"notebook --NotebookApp.open_browser=True --NotebookApp.ip='localhost' --NotebookApp.port=8888\"
\${SPARK_HOME}/bin/pyspark \
--master local[4] \
--executor-memory 1G \
--driver-memory 1G \
--conf spark.sql.warehouse.dir=\"file:///tmp/spark-warehouse\" \
--packages com.databricks:spark-csv_2.11:1.5.0 \
--packages com.amazonaws:aws-java-sdk-pom:1.10.34 \
--packages org.apache.hadoop:hadoop-aws:2.7.3" > $HOME/scripts/jupyspark.sh
chmod +x $HOME/scripts/jupyspark.sh
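# With $HOME/scripts on your PATH (added above), the server can then be launched with:
#   jupyspark.sh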
fi
# Create localsparksubmit.sh script in your scripts folder
read -r -p "Would you like to create the localsparksubmit.sh script for submittiing local python scripts through spark-submit? [y/N] " response
if [[ $response =~ ^([yY][eE][sS]|[yY])$ ]]; then
echo "#!/bin/bash
\${SPARK_HOME}/bin/spark-submit \
--master local[4] \
--executor-memory 1G \
--driver-memory 1G \
--conf spark.sql.warehouse.dir=\"file:///tmp/spark-warehouse\" \
--packages com.databricks:spark-csv_2.11:1.5.0 \
--packages com.amazonaws:aws-java-sdk-pom:1.10.34 \
--packages org.apache.hadoop:hadoop-aws:2.7.3 \
\$1" > $HOME/scripts/localsparksubmit.sh
chmod +x $HOME/scripts/localsparksubmit.sh
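# Example usage (the script path is illustrative):
#   localsparksubmit.sh path/to/your_script.py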
fi
fi
# Check to see if JDK is installed
if ! javac -version &> /dev/null
then
# Install JDK
if [[ $(uname -s) = "Darwin" ]]
then
echo "Downloading JDK..."
brew install Caskroom/cask/java
elif [[ $(uname -s) = "Linux" ]]
then
echo "Downloading JDK..."
sudo add-apt-repository ppa:webupd8team/java
sudo apt-key adv --keyserver hkp://keyserver.ubuntu.com:80 --recv-keys EEA14886
sudo apt-get update
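# Note: oracle-java8-installer prompts you to accept Oracle's license during install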
sudo apt-get install oracle-java8-installer
fi
fi
SUCCESSFUL_SPARK_INSTALL=0
SPARK_INSTALL_TRY=0
if [[ $(uname -s) = "Darwin" ]]
then
echo -e "\n\tDetected Mac OS X as the Operating System\n"
while [ $SUCCESSFUL_SPARK_INSTALL -eq 0 ]
do
curl $SPARK_URL > $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME
# Check MD5 Hash
if [[ $(openssl md5 $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME | sed -e "s/^.* //") == "$SPARK_MD5" ]]
then
# Unzip
tar -xzf $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME -C $SPARK_INSTALL_LOCATION
# Remove the compressed file
rm $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME
# Install py4j
pip install py4j
SUCCESSFUL_SPARK_INSTALL=1
else
echo 'ERROR: Spark MD5 Hash does not match'
echo "$(openssl md5 $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME | sed -e "s/^.* //") != $SPARK_MD5"
if [ $SPARK_INSTALL_TRY -lt 3 ]
then
echo -e '\nTrying Spark Install Again...\n'
SPARK_INSTALL_TRY=$((SPARK_INSTALL_TRY+1))
echo "Retry attempt $SPARK_INSTALL_TRY of 3"
else
echo -e '\nSPARK INSTALL FAILED\n'
echo -e 'Check the MD5 Hash and run again'
exit 1
fi
fi
done
elif [[ $(uname -s) = "Linux" ]]
then
echo -e "\n\tDetected Linux as the Operating System\n"
while [ $SUCCESSFUL_SPARK_INSTALL -eq 0 ]
do
curl $SPARK_URL > $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME
# Check MD5 Hash
if [[ $(md5sum $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME | sed -e "s/ .*$//") == "$SPARK_MD5" ]]
then
# Unzip
tar -xzf $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME -C $SPARK_INSTALL_LOCATION
# Remove the compressed file
rm $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME
# Install py4j
pip install py4j
SUCCESSFUL_SPARK_INSTALL=1
else
echo 'ERROR: Spark MD5 Hash does not match'
echo "$(md5sum $SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME | sed -e "s/ .*$//") != $SPARK_MD5"
if [ $SPARK_INSTALL_TRY -lt 3 ]
then
echo -e '\nTrying Spark Install Again...\n'
SPARK_INSTALL_TRY=$((SPARK_INSTALL_TRY+1))
echo "Retry attempt $SPARK_INSTALL_TRY of 3"
else
echo -e '\nSPARK INSTALL FAILED\n'
echo -e 'Check the MD5 Hash and run again'
exit 1
fi
fi
done
else
echo "Unable to detect Operating System"
exit 1
fi
# Remove extension from spark folder name
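# e.g. spark-2.1.0-bin-hadoop2.7.tgz -> spark-2.1.0-bin-hadoop2.7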
SPARK_FOLDER_NAME=$(echo $SPARK_FOLDER_NAME | sed -e "s/\.tgz$//")
echo "
# Spark variables
export SPARK_HOME=\"$SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME\"
export PYTHONPATH=\"$SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME/python/:\$PYTHONPATH\"
# Spark 2
export PYSPARK_DRIVER_PYTHON=ipython
export PATH=\$SPARK_HOME/bin:\$PATH
alias pyspark=\"$SPARK_INSTALL_LOCATION/$SPARK_FOLDER_NAME/bin/pyspark \
--conf spark.sql.warehouse.dir='file:///tmp/spark-warehouse' \
--packages com.databricks:spark-csv_2.11:1.5.0,com.amazonaws:aws-java-sdk-pom:1.10.34,org.apache.hadoop:hadoop-aws:2.7.3\"" >> $SHELL_PROFILE
source $SHELL_PROFILE
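# Note: 'source' above only affects this script's shell, not your open terminal
echo "Open a new terminal (or run 'source $SHELL_PROFILE' yourself) for the changes to take effect"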
echo "INSTALL COMPLETE"
echo "Please refer to Step 4 at https://github.com/zipfian/spark-install for testing your installation"