
Setting up a Hadoop Environment on Multiple Virtual Machines

Hadoop Configuration Workflow

  1. Edit the hosts file
  2. Create a dedicated hadoop user account
  3. Set up passwordless SSH
  4. Configure the JDK and Hadoop runtime environment
  5. Edit the Hadoop configuration files
  6. Sync the configuration to every node over SSH
  7. Format the namenode
  8. Start Hadoop and check it with jps and the web UI

Three Ubuntu virtual machines are used to build the Hadoop environment; the Hadoop version is 1.2.1.

1. Edit the hosts file

Change each machine's hostname first, then apply the following hosts entries on all three nodes.

/etc/hosts

127.0.0.1 localhost localhost.localdomain localhost
#127.0.1.1 ubuntu
192.168.149.135 master
192.168.149.134 slave2
192.168.149.133 slave1 slave1.localdomain slave1
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
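
Once the hosts file is in place on all three machines, name resolution can be spot-checked from any node, for example:

ping -c 1 master
ping -c 1 slave1
ping -c 1 slave2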

2. Create a hadoop user account

Set up a dedicated group and user for the Hadoop cluster, and grant the user sudo privileges as root.

sudo groupadd hadoop
sudo useradd -s /bin/bash -d /home/hadoop -m czx-hadoop -g hadoop
sudo passwd czx-hadoop
vim /etc/sudoers
czx-hadoop ALL=(ALL) ALL # grant the user sudo privileges

All three VM nodes need to go through the steps above to create the hadoop account.
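
A quick way to confirm the account is usable before moving on:

su - czx-hadoop
id      # should list the hadoop group
sudo -l # should show the ALL=(ALL) ALL rule added above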

3. Passwordless SSH login

Install the SSH server first: sudo apt-get install openssh-server
Then configure the environment as follows.

/etc/ssh/ssh_config
# This is the ssh client system-wide configuration file. See
# ssh_config(5) for more information. This file provides defaults for
# users, and the values can be changed in per-user configuration files
# or on the command line.
# Configuration data is parsed as follows:
# 1. command line options
# 2. user-specific file
# 3. system-wide file
# Any configuration value is only changed the first time it is set.
# Thus, host-specific definitions should be at the beginning of the
# configuration file, and defaults at the end.
# Site-wide defaults for some commonly used options. For a comprehensive
# list of available options, their meanings and defaults, please see the
# ssh_config(5) man page.
Host *
# ForwardAgent no
# ForwardX11 no
# ForwardX11Trusted yes
# RhostsRSAAuthentication no
# RSAAuthentication yes
PasswordAuthentication yes
# HostbasedAuthentication no
# GSSAPIAuthentication no
# GSSAPIDelegateCredentials no
# GSSAPIKeyExchange no
# GSSAPITrustDNS no
# BatchMode no
# CheckHostIP yes
# AddressFamily any
# ConnectTimeout 0
# StrictHostKeyChecking ask
# IdentityFile ~/.ssh/identity
# IdentityFile ~/.ssh/id_rsa
# IdentityFile ~/.ssh/id_dsa
# IdentityFile ~/.ssh/id_ecdsa
# IdentityFile ~/.ssh/id_ed25519
Port 22
Protocol 2
# Cipher 3des
# Ciphers aes128-ctr,aes192-ctr,aes256-ctr,arcfour256,arcfour128,aes128-cbc,3des-cbc
# MACs hmac-md5,hmac-sha1,umac-64@openssh.com,hmac-ripemd160
# EscapeChar ~
# Tunnel no
# TunnelDevice any:any
# PermitLocalCommand no
# VisualHostKey no
# ProxyCommand ssh -q -W %h:%p gateway.example.com
# RekeyLimit 1G 1h
SendEnv LANG LC_*
HashKnownHosts yes
GSSAPIAuthentication yes
GSSAPIDelegateCredentials no

In /etc/ssh/sshd_config, set PasswordAuthentication to yes and PubkeyAuthentication to yes, then save the configuration file (and restart the SSH service afterwards, as shown after the listing).

# Package generated configuration file
# See the sshd_config(5) manpage for details
# What ports, IPs and protocols we listen for
Port 22
# Use these options to restrict which interfaces/protocols sshd will bind to
#ListenAddress ::
#ListenAddress 0.0.0.0
Protocol 2
# HostKeys for protocol version 2
HostKey /etc/ssh/ssh_host_rsa_key
HostKey /etc/ssh/ssh_host_dsa_key
HostKey /etc/ssh/ssh_host_ecdsa_key
HostKey /etc/ssh/ssh_host_ed25519_key
#Privilege Separation is turned on for security
UsePrivilegeSeparation yes
# Lifetime and size of ephemeral version 1 server key
KeyRegenerationInterval 3600
ServerKeyBits 1024
# Logging
SyslogFacility AUTH
LogLevel INFO
# Authentication:
LoginGraceTime 120
PermitRootLogin prohibit-password
StrictModes yes
RSAAuthentication yes
PubkeyAuthentication yes
AuthorizedKeysFile %h/.ssh/authorized_keys
# Don't read the user's ~/.rhosts and ~/.shosts files
IgnoreRhosts yes
# For this to work you will also need host keys in /etc/ssh_known_hosts
RhostsRSAAuthentication no
# similar for protocol version 2
HostbasedAuthentication no
# Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication
#IgnoreUserKnownHosts yes
# To enable empty passwords, change to yes (NOT RECOMMENDED)
PermitEmptyPasswords no
# Change to yes to enable challenge-response passwords (beware issues with
# some PAM modules and threads)
ChallengeResponseAuthentication no
# Change to no to disable tunnelled clear text passwords
PasswordAuthentication yes
# Kerberos options
#KerberosAuthentication no
#KerberosGetAFSToken no
#KerberosOrLocalPasswd yes
#KerberosTicketCleanup yes
# GSSAPI options
#GSSAPIAuthentication no
#GSSAPICleanupCredentials yes
X11Forwarding yes
X11DisplayOffset 10
PrintMotd no
PrintLastLog yes
TCPKeepAlive yes
#UseLogin no
#MaxStartups 10:30:60
#Banner /etc/issue.net
# Allow client to pass locale environment variables
AcceptEnv LANG LC_*
Subsystem sftp /usr/lib/openssh/sftp-server
# Set this to 'yes' to enable PAM authentication, account processing,
# and session processing. If this is enabled, PAM authentication will
# be allowed through the ChallengeResponseAuthentication and
# PasswordAuthentication. Depending on your PAM configuration,
# PAM authentication via ChallengeResponseAuthentication may bypass
# the setting of "PermitRootLogin without-password".
# If you just want the PAM account and session checks to run without
# PAM authentication, then enable this but set PasswordAuthentication
# and ChallengeResponseAuthentication to 'no'.
UsePAM yes
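
After changing sshd_config, restart the SSH service on Ubuntu so the new settings take effect:

sudo service ssh restart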

Configuration steps

See the official documentation for reference.
On all three virtual machines, under the czx-hadoop user, append the other two nodes' id_rsa.pub to authorized_keys. Start by running ssh localhost on each node to confirm that passwordless login works locally.

mkdir ~/.ssh
chmod 700 ~/.ssh
ssh-keygen -t rsa # generates id_rsa and id_rsa.pub
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
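
To exchange the keys between the nodes, one option is to copy each node's public key over and append it on the other side, roughly as follows (run on master as czx-hadoop, then repeat the same idea from slave1 and slave2; ssh-copy-id does the same job if it is available):

scp ~/.ssh/id_rsa.pub slave1:/tmp/master.pub
ssh slave1 "cat /tmp/master.pub >> ~/.ssh/authorized_keys && rm /tmp/master.pub"
scp ~/.ssh/id_rsa.pub slave2:/tmp/master.pub
ssh slave2 "cat /tmp/master.pub >> ~/.ssh/authorized_keys && rm /tmp/master.pub"
ssh slave1 # should now log in without a password prompt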

4. Configure the JDK and Hadoop runtime environment

Download the JDK and extract it to /usr/lib/jvm; Hadoop itself can live directly under ~/hadoop in the current user's home directory.
Add the following environment variables, either in the per-user ~/.bashrc or in the global /etc/profile.

#java
export JAVA_HOME=/usr/lib/jvm/java
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#hadoop
export HADOOP_INSTALL=/home/hadoop/hadoop
export PATH=$PATH:$HADOOP_INSTALL/bin
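
For reference, a rough sketch of the unpack steps that match the paths above (the archive and JDK directory names are assumptions; use whichever build you downloaded):

sudo mkdir -p /usr/lib/jvm
sudo tar -xzf jdk-7u80-linux-x64.tar.gz -C /usr/lib/jvm   # assumed archive name
sudo ln -s /usr/lib/jvm/jdk1.7.0_80 /usr/lib/jvm/java     # so JAVA_HOME=/usr/lib/jvm/java resolves
tar -xzf hadoop-1.2.1.tar.gz -C ~ && mv ~/hadoop-1.2.1 ~/hadoop
source ~/.bashrc
java -version && hadoop version                           # both should print their versions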

5. Edit the Hadoop configuration files

See the Hadoop website for reference.

core-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
  <property>
    <!-- namenode RPC address and port -->
    <name>fs.default.name</name>
    <value>hdfs://master:9000</value>
    <final>true</final>
  </property>
  <property>
    <!-- temporary directory -->
    <name>hadoop.tmp.dir</name>
    <value>/home/hadoop/hadoop/tmp</value>
    <description>temp dir</description>
  </property>
</configuration>

hdfs-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
  <property>
    <!-- local directory holding the namenode metadata image; extra copies here act as redundant backups -->
    <name>dfs.name.dir</name>
    <value>/home/hadoop/hadoop/name</value>
    <final>true</final>
  </property>
  <property>
    <!-- local directory where the datanode stores its blocks -->
    <name>dfs.data.dir</name>
    <value>/home/hadoop/hadoop/data</value>
    <final>true</final>
  </property>
  <property>
    <!-- default block replication factor -->
    <name>dfs.replication</name>
    <value>2</value>
    <final>true</final>
  </property>
</configuration>
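
It can help to create the directories referenced by these configs ahead of time on every node, so that ownership and permissions are right (a convenience step; the namenode format and datanode startup will otherwise create them where they can):

mkdir -p /home/hadoop/hadoop/tmp /home/hadoop/hadoop/name /home/hadoop/hadoop/data
chmod 755 /home/hadoop/hadoop/data # the datanode refuses data directories with overly loose permissions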

mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
  <property>
    <!-- JobTracker address and port (set to "local" to run jobs in the same process as the client) -->
    <name>mapred.job.tracker</name>
    <value>192.168.149.135:9001</value>
  </property>
</configuration>

Also set up the masters and slaves files, and point hadoop-env.sh at the JDK; a sketch of all three follows.
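
For this cluster the remaining files would look roughly like this (based on the hostnames used above):

# conf/masters  (host that runs the secondary namenode)
master
# conf/slaves   (hosts that run a datanode and tasktracker)
slave1
slave2
# conf/hadoop-env.sh
export JAVA_HOME=/usr/lib/jvm/java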

6. Sync the configuration to every node over SSH

# copy the hadoop directory to the other nodes
scp -r ./hadoop slave1:~
scp -r ./hadoop slave2:~
# copy the shell profile as well, or configure it by hand if your .bashrc contains other settings
scp ~/.bashrc slave1:~
scp ~/.bashrc slave2:~

7. Format the namenode

# run this step on the master node only:
hadoop namenode -format

8. Start Hadoop and check it with jps and the web UI

# start Hadoop 1.2.1; this lab uses an old release, newer versions are started differently (start-dfs.sh / start-yarn.sh)
start-all.sh
jps # check which Java daemons are running
# check the web UIs at http://master:50030 (JobTracker) and http://master:50070 (NameNode); replace master with the master machine's IP if needed
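
If everything started cleanly, jps should report roughly the following daemons (process IDs omitted):

# on master
NameNode
SecondaryNameNode
JobTracker
# on slave1 and slave2
DataNode
TaskTracker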

Other notes

1. Common HDFS operations

hadoop dfs -ls                    list files in HDFS
hadoop dfs -ls in                 list the files under the in directory in HDFS
hadoop dfs -put test1.txt test    upload a file to the given path under a new name; the upload only succeeds once every DataNode has received the data
hadoop dfs -get in getin          fetch a path from HDFS and rename it getin locally; like put, it works on files as well as directories
hadoop dfs -rmr out               recursively delete the given path from HDFS
hadoop dfs -cat in/*              print the contents of the in directory on HDFS
hadoop dfsadmin -report           show basic HDFS statistics
hadoop dfsadmin -safemode leave   leave safe mode
hadoop dfsadmin -safemode enter   enter safe mode
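
A short example session tying a few of these together (the file and directory names are just placeholders):

echo "hello hadoop" > test1.txt
hadoop dfs -mkdir in
hadoop dfs -put test1.txt in
hadoop dfs -ls in
hadoop dfs -cat in/test1.txt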

2. Adding a node

Scalability is an important feature of HDFS. First install Hadoop on the new node, then edit $HADOOP_HOME/conf/masters on the new node to add the NameNode's hostname, then edit $HADOOP_HOME/conf/slaves on the NameNode to add the new node's hostname, and finally set up passwordless SSH to the new node. A rough command sketch follows.
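
A sketch of bringing the new node online, assuming it is called slave3, already appears in /etc/hosts everywhere, and has the hadoop user, JDK and SSH keys set up:

echo "slave3" >> $HADOOP_HOME/conf/slaves   # on the NameNode
scp -r ~/hadoop slave3:~                    # ship the installation and configuration
ssh slave3 "~/hadoop/bin/hadoop-daemon.sh start datanode"
ssh slave3 "~/hadoop/bin/hadoop-daemon.sh start tasktracker"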

3. Shell auto-install script

#!/bin/bash
#validate user or group
validate() {
if [ "$(id -u)" -eq 0 ];then
echo "must not be root!"
exit 1
else
echo "---------welcome to hadoop---------"
fi
}
#hadoop install
hd-dir() {
if [ ! -d /home/hadoop/ ];then
mkdir /home/hadoop/
else
echo "download hadoop will begin"
fi
}
download-hd() {
wget -c http://archive.apache.org/dist/hadoop/core/hadoop-1.2.1.tar.gz -O /home/hadoop/hadoop-1.2.1.tar.gz
tar -xzvf /home/hadoop/hadoop-1.2.1.tar.gz -C /home/hadoop
rm /home/hadoop/hadoop-1.2.1.tar.gz
ln -s /home/hadoop/hadoop-1.2.1 /home/hadoop/hadoop1.2.1
}
download-java() {
wget -c http://download.oracle.com/otn-pub/java/jdk/7/jdk-7-linux-i586.tar.gz -O /home/hadoop/jdk-7-linux-i586.tar.gz
if [ ! -d /usr/lib/jvm ];then
mkdir /usr/lib/jvm
fi
tar -xzvf /home/hadoop/jdk-7-linux-i586.tar.gz -C /usr/lib/jvm
rm /home/hadoop/jdk-7-linux-i586.tar.gz
}
#hadoop conf
hd-conf() {
# note: JAVA_HOME here assumes /usr/lib/jvm/java is a symlink to the unpacked JDK
echo "export JAVA_HOME=/usr/lib/jvm/java" >> /home/hadoop/hadoop1.2.1/conf/hadoop-env.sh
echo "#set path jdk" >> /home/hadoop/.bashrc
echo "export JAVA_HOME=/usr/lib/jvm/java" >> /home/hadoop/.bashrc
echo "#hadoop path" >> /home/hadoop/.bashrc
echo "export HADOOP_HOME=/home/hadoop/hadoop1.2.1" >> /home/hadoop/.bashrc
# single quotes so the variables are expanded when .bashrc is sourced, not when this script runs
echo 'export PATH=$PATH:$HADOOP_HOME/bin:$JAVA_HOME/bin' >> /home/hadoop/.bashrc
echo "export HADOOP_HOME_WARN_SUPPRESS=1" >> /home/hadoop/.bashrc
#hadoop core-site.xml
echo "<configuration>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<name>fs.default.name</name>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<value>hdfs://master:9000" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<name>hadoop.tmp.dir</name>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<value>/home/hadoop/tmp</value>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "</configuration>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
#hadoop hdfs-site.xml
echo "<configuration>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<name>dfs.name.dir</name>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<value>/home/hadoop/name</value>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<name>dfs.data.dir</name>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<value>/home/hadoop/data</value>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<name>dfs.replication</name>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<value>1</value>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "</configuration>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
# hadoop mapred-site.xml
echo "<configuration>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "<name>mapred.job.tracker</name>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "<value>master:9001</value>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "</configuration>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
#hadoop master
echo "hadoop-master" >> /home/hadoop/hadoop1.2.1/conf/masters
#hadoop slaves
echo "hadoop-master" >> /home/hadoop/hadoop1.2.1/conf/slaves
source /home/hadoop/.bashrc
}
hd-start() {
hadoop namenode -format
}
yes-or-no() {
echo "Is your name $* ?"
while true
do
echo -n "Enter yes or no: "
read x
case "$x" in
y | yes ) return 0;;
n | no ) return 1;;
* ) echo "Answer yes or no";;
esac
done
}
echo "Original params are $*"
if yes-or-no "$1"
then
echo "HI $1,nice name!"
validate
hd-dir
download-hd
download-java
hd-conf
else
echo "Never mind!"
fi
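
The script takes a name as its only argument and is meant to be run on each node; a hypothetical invocation (the file name here is arbitrary):

chmod +x hadoop-install.sh
./hadoop-install.sh czx-hadoop # the argument is only used by the yes-or-no greeting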