
Create nodes at Amazon AWS

I had the following nodes (private IP and public DNS) at AWS:

172.31.81.15 ec2-52-203-39-53.compute-1.amazonaws.com

172.31.85.92 ec2-54-89-210-61.compute-1.amazonaws.com

172.31.84.111 ec2-34-230-71-79.compute-1.amazonaws.com

172.31.83.249 ec2-34-201-244-234.compute-1.amazonaws.com

--------------------------------------------------------------------

Assign host aliases for the namenode and datanodes in ~/.ssh/config on the NameNode:

Host namenode

HostName ec2-52-203-39-53.compute-1.amazonaws.com

User ubuntu

IdentityFile ~/.ssh/mkey.pem

Host datanode1

HostName ec2-54-89-210-61.compute-1.amazonaws.com

User ubuntu

IdentityFile ~/.ssh/mkey.pem

Host datanode2

HostName ec2-34-230-71-79.compute-1.amazonaws.com

User ubuntu

IdentityFile ~/.ssh/mkey.pem

Host datanode3

HostName ec2-34-201-244-234.compute-1.amazonaws.com

User ubuntu

IdentityFile ~/.ssh/mkey.pem

------------------------------------------------------
# On the namenode: fix key permissions and copy the key and ssh config to the datanodes

sudo chmod 600 ~/.ssh/mkey.pem

scp ~/.ssh/mkey.pem ~/.ssh/config datanode1:~/.ssh

scp ~/.ssh/mkey.pem ~/.ssh/config datanode2:~/.ssh

scp ~/.ssh/mkey.pem ~/.ssh/config datanode3:~/.ssh

--------------------------------

# Generate an RSA key pair on the NameNode (no passphrase)

ssh-keygen -t rsa -P ""

# Append the generated public key (id_rsa.pub) to authorized_keys in the .ssh folder

cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys

# Append the public key to authorized_keys in the .ssh folder on datanode1, 2 and 3

cat ~/.ssh/id_rsa.pub | ssh datanode1 'cat >> ~/.ssh/authorized_keys'

cat ~/.ssh/id_rsa.pub | ssh datanode2 'cat >> ~/.ssh/authorized_keys'

cat ~/.ssh/id_rsa.pub | ssh datanode3 'cat >> ~/.ssh/authorized_keys'

# Check ssh functionality
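
# For example, from the namenode (using the host aliases defined above),
# each of these should print the remote hostname without asking for a password:

ssh datanode1 'hostname'

ssh datanode2 'hostname'

ssh datanode3 'hostname'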

------------------------------------------

# On all nodes

sudo apt-get update

sudo apt-get install default-jdk

java -version

------------------------------------------

wget http://www-us.apache.org/dist/hadoop/common/hadoop-2.7.5/hadoop-2.7.5.tar.gz
sudo tar xzf hadoop-2.7.5.tar.gz

ls

# ls output: hadoop-2.7.5  hadoop-2.7.5.tar.gz

sudo mv hadoop-2.7.5 /usr/local/hadoop

-----------------------------------------------

-----------------------------------------------

## Set environment variables on all Nodes

# Add these variables to ~/.profile

sudo nano ~/.profile

# Hadoop Environment Variables

export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64

export PATH=$PATH:$JAVA_HOME/bin

export HADOOP_HOME=/usr/local/hadoop

export PATH=$PATH:$HADOOP_HOME/bin

export HADOOP_CONF_DIR=/usr/local/hadoop/etc/hadoop

# Hadoop classpath (tools.jar is needed to compile MapReduce jobs with hadoop com.sun.tools.javac.Main)

export HADOOP_CLASSPATH=${JAVA_HOME}/lib/tools.jar

# Hive Variables

export HIVE_HOME=/usr/local/hive

export PATH=$PATH:$HIVE_HOME/bin

export CLASSPATH=$CLASSPATH:/usr/local/hadoop/lib/*:.

export CLASSPATH=$CLASSPATH:/usr/local/hive/lib/*:.
# Load variables

. ~/.profile

# Verify paths

echo $JAVA_HOME

echo $HADOOP_CONF_DIR

echo $HADOOP_HOME
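
# With the variables loaded, a quick sanity check of the Hadoop install:

hadoop version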

-------------------------------------------

# Change ownership of the $HADOOP_HOME directory to ubuntu

sudo chown -R ubuntu $HADOOP_HOME

------------------------------

## Hadoop Configuration Files on all Nodes

sudo nano $HADOOP_CONF_DIR/hadoop-env.sh

# In $HADOOP_CONF_DIR/hadoop-env.sh, change JAVA_HOME to

export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
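
# Alternatively, a sed one-liner (assuming hadoop-env.sh still has its stock
# "export JAVA_HOME=..." line) can make the same change:

sudo sed -i 's|^export JAVA_HOME=.*|export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64|' $HADOOP_CONF_DIR/hadoop-env.sh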

----------------------------------

## $HADOOP_CONF_DIR/core-site.xml

# Replace the DNS below with your NameNode's public DNS

sudo nano $HADOOP_CONF_DIR/core-site.xml

# Add the following property inside the <configuration> element:

<property>

<name>fs.defaultFS</name>

<value>hdfs://ec2-52-203-39-53.compute-1.amazonaws.com:9000</value>

<description>localhost may be replaced with a DNS that points to the NameNode.</description>

</property>

# Copy core-site.xml from the namenode to datanode1, 2 and 3


scp $HADOOP_CONF_DIR/core-site.xml datanode1:$HADOOP_CONF_DIR

scp $HADOOP_CONF_DIR/core-site.xml datanode2:$HADOOP_CONF_DIR

scp $HADOOP_CONF_DIR/core-site.xml datanode3:$HADOOP_CONF_DIR
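
# Optional check that the copy landed (the same $HADOOP_CONF_DIR path is assumed on the datanodes):

ssh datanode1 "grep -A 1 fs.defaultFS $HADOOP_CONF_DIR/core-site.xml"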

-------------------------------------------------------------

# Changing /etc/hosts

sudo nano /etc/hosts

sudo chown ubuntu /etc/hosts

# Add the following to /etc/hosts

# PrivateIP PublicDNS

172.31.81.15 ec2-52-203-39-53.compute-1.amazonaws.com

172.31.85.92 ec2-54-89-210-61.compute-1.amazonaws.com

172.31.84.111 ec2-34-230-71-79.compute-1.amazonaws.com

172.31.83.249 ec2-34-201-244-234.compute-1.amazonaws.com

# Return ownership to root

sudo chown root /etc/hosts
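
# Quick check that the DNS names now resolve to the private IPs via /etc/hosts:

getent hosts ec2-54-89-210-61.compute-1.amazonaws.com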

--------------------------------------------------

# Modify $HADOOP_CONF_DIR/hdfs-site.xml on the namenode

<configuration>

<property>

<name>dfs.replication</name>

<value>3</value>
</property>

<property>

<name>dfs.namenode.name.dir</name>

<value>file:///usr/local/hadoop/hadoop_data/hdfs/namenode</value>

</property>

</configuration>

-----------------------------------------------

# Create the HDFS namenode data folder under $HADOOP_HOME on the namenode

sudo mkdir -p $HADOOP_HOME/hadoop_data/hdfs/namenode

# Create a file named masters in $HADOOP_CONF_DIR on the namenode

echo "namenode" | cat >> $HADOOP_CONF_DIR/masters

# Verify

sudo nano $HADOOP_CONF_DIR/masters

# Remove the old $HADOOP_CONF_DIR/slaves file

sudo rm $HADOOP_CONF_DIR/slaves

# Add the datanode hosts to $HADOOP_CONF_DIR/slaves

echo "datanode1" >> $HADOOP_CONF_DIR/slaves

echo "datanode2" >> $HADOOP_CONF_DIR/slaves

echo "datanode3" >> $HADOOP_CONF_DIR/slaves

# Verify

sudo nano $HADOOP_CONF_DIR/slaves

# If the old slaves file was not removed, delete its localhost entry

--------------------------------------------------------------

# update the $HADOOP_HOME directory ownership to ubuntu

sudo chown -R ubuntu $HADOOP_HOME

----------------------------------------------------

# On each datanode

# Create the slaves file

echo "datanode1" >> $HADOOP_CONF_DIR/slaves

echo "datanode2" >> $HADOOP_CONF_DIR/slaves

echo "datanode3" >> $HADOOP_CONF_DIR/slaves

------------------------------------------------------

# Modify the $HADOOP_CONF_DIR/hdfs-site.xml file on datanode1

<configuration>

<property>

<name>dfs.replication</name>

<value>3</value>

</property>
<property>

<name>dfs.datanode.data.dir</name>

<value>file:///usr/local/hadoop/hadoop_data/hdfs/datanode</value>

</property>

</configuration>

--------------------------------

# Copy hdfs-site.xml from datanode1 to datanode2 and datanode3

scp $HADOOP_CONF_DIR/hdfs-site.xml datanode2:$HADOOP_CONF_DIR

scp $HADOOP_CONF_DIR/hdfs-site.xml datanode3:$HADOOP_CONF_DIR

--------------------------------------

# create a data directory on each DataNode

sudo mkdir -p $HADOOP_HOME/hadoop_data/hdfs/datanode

----------------------------------

# Change ownership of the $HADOOP_HOME directory to ubuntu

sudo chown -R ubuntu $HADOOP_HOME

# Format the NameNode (run on the namenode)

hdfs namenode -format

# Start the DFS services


$HADOOP_HOME/sbin/start-dfs.sh
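
# Confirm that all three DataNodes registered with the NameNode
# (the report should show 3 live datanodes):

hdfs dfsadmin -report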

# Browse the HDFS web UI in a browser

ec2-52-203-39-53.compute-1.amazonaws.com:50070

OR

namenode_public_ip:50070

-----------------------------------------

# Start YARN on NameNode

$HADOOP_HOME/sbin/start-yarn.sh

$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
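
# Confirm that the NodeManagers registered with the ResourceManager:

yarn node -list

# By default the ResourceManager web UI listens on port 8088 and the JobHistory server UI on port 19888 of the namenode.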

-----------------------------------

# Check Running Java services

# Run jps on the namenode and each datanode
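
jps

# Typical daemons on the namenode: NameNode, SecondaryNameNode, ResourceManager, JobHistoryServer.
# Typical daemons on a datanode: DataNode, NodeManager.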

----------------------------------

# Stop services in reverse order

$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh stop historyserver

$HADOOP_HOME/sbin/stop-yarn.sh

$HADOOP_HOME/sbin/stop-dfs.sh

----------------------------------------

# Compile the MapReduce code (WordCount.java) on the namenode

hadoop com.sun.tools.javac.Main WordCount.java

# Package the compiled classes into a jar for use with Hadoop

jar cf WordCount.jar WordCount*.class


# Make input and output folders in HDFS

hdfs dfs -mkdir /input

hdfs dfs -mkdir /output

# Copy a book from the local filesystem into HDFS

# This failed because the unquoted spaces split the path into multiple arguments; quoting fixes it:
# hdfs dfs -copyFromLocal 'Books/The Tragedies of Hamlet.txt' /input/hamlet.txt

hdfs dfs -put Book/PeterPan.txt /input/PeterPan.txt
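
# Verify the file is now in HDFS:

hdfs dfs -ls /input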

# Run the WordCount job on the HDFS input

hadoop jar WordCount.jar WordCount /input /output

# If the job fails because the output directory already exists:

# Remove it with this command if it is empty

hdfs dfs -rmdir /output

# If it is not empty

hdfs dfs -rm -r /output

# Re-run the job

hadoop jar WordCount.jar WordCount /input /output

# Run the bundled grep example to search the input for 'zip'

hadoop jar $HADOOP_HOME/share/hadoop/mapreduce/hadoop-mapreduce-examples-2.7.5.jar grep /input /output1 'zip'

# Check the output

hadoop fs -cat /output/part-r-00000

# Or, more generally

hadoop fs -cat /output/*


hadoop fs -cat /output1/*

# Delete the directory's content

hdfs dfs -rm /output/*

# Delete the directory

hdfs dfs -rmdir /output
