
Setting up a Hadoop Environment on Multiple Virtual Machines

Hadoop Configuration Workflow

  1. Edit the hosts file
  2. Create a dedicated hadoop user account
  3. Set up passwordless SSH
  4. Configure the JDK and Hadoop runtime environment
  5. Edit the Hadoop configuration files
  6. Sync the configuration to every node over SSH
  7. Format the namenode
  8. Start Hadoop and check it with jps and the web UI

Three Ubuntu virtual machines are used to build the Hadoop environment; the Hadoop version is 1.2.1.

1. Edit the hosts file

Change each machine's hostname first, then apply the following hosts entries on all three nodes.

/etc/hosts

127.0.0.1 localhost localhost.localdomain localhost
#127.0.1.1 ubuntu
192.168.149.135 master
192.168.149.134 slave2
192.168.149.133 slave1 slave1.localdomain slave1
::1 ip6-localhost ip6-loopback
fe00::0 ip6-localnet
ff00::0 ip6-mcastprefix
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
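
Once the hosts file is in place on all three machines, name resolution can be spot-checked from any node, for example:

ping -c 1 master
ping -c 1 slave1
ping -c 1 slave2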

2. Create a hadoop user account

Set up a dedicated group and user for the Hadoop cluster, and grant the user sudo privileges as root.

sudo groupadd hadoop
sudo useradd -s /bin/bash -d /home/hadoop -m czx-hadoop -g hadoop
sudo passwd czx-hadoop
vim /etc/sudoers
czx-hadoop ALL=(ALL) ALL # grant the user sudo privileges

All three VM nodes need to go through the steps above to create the hadoop account.
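
A quick way to confirm the account is usable before moving on:

su - czx-hadoop
id      # should list the hadoop group
sudo -l # should show the ALL=(ALL) ALL rule added above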

3. Passwordless SSH login

Install the SSH server first: sudo apt-get install openssh-server
Then configure the environment as follows.

/etc/ssh/ssh_config
# This is the ssh client system-wide configuration file. See
# ssh_config(5) for more information. This file provides defaults for
# users, and the values can be changed in per-user configuration files
# or on the command line.
# Configuration data is parsed as follows:
# 1. command line options
# 2. user-specific file
# 3. system-wide file
# Any configuration value is only changed the first time it is set.
# Thus, host-specific definitions should be at the beginning of the
# configuration file, and defaults at the end.
# Site-wide defaults for some commonly used options. For a comprehensive
# list of available options, their meanings and defaults, please see the
# ssh_config(5) man page.
Host *
# ForwardAgent no
# ForwardX11 no
# ForwardX11Trusted yes
# RhostsRSAAuthentication no
# RSAAuthentication yes
PasswordAuthentication yes
# HostbasedAuthentication no
# GSSAPIAuthentication no
# GSSAPIDelegateCredentials no
# GSSAPIKeyExchange no
# GSSAPITrustDNS no
# BatchMode no
# CheckHostIP yes
# AddressFamily any
# ConnectTimeout 0
# StrictHostKeyChecking ask
# IdentityFile ~/.ssh/identity
# IdentityFile ~/.ssh/id_rsa
# IdentityFile ~/.ssh/id_dsa
# IdentityFile ~/.ssh/id_ecdsa
# IdentityFile ~/.ssh/id_ed25519
Port 22
Protocol 2
# Cipher 3des
# Ciphers aes128-ctr,aes192-ctr,aes256-ctr,arcfour256,arcfour128,aes128-cbc,3des-cbc
# MACs hmac-md5,hmac-sha1,umac-64@openssh.com,hmac-ripemd160
# EscapeChar ~
# Tunnel no
# TunnelDevice any:any
# PermitLocalCommand no
# VisualHostKey no
# ProxyCommand ssh -q -W %h:%p gateway.example.com
# RekeyLimit 1G 1h
SendEnv LANG LC_*
HashKnownHosts yes
GSSAPIAuthentication yes
GSSAPIDelegateCredentials no

In /etc/ssh/sshd_config, set PasswordAuthentication to yes and PubkeyAuthentication to yes, then save the configuration file (and restart the SSH service afterwards, as shown after the listing).

# Package generated configuration file
# See the sshd_config(5) manpage for details
# What ports, IPs and protocols we listen for
Port 22
# Use these options to restrict which interfaces/protocols sshd will bind to
#ListenAddress ::
#ListenAddress 0.0.0.0
Protocol 2
# HostKeys for protocol version 2
HostKey /etc/ssh/ssh_host_rsa_key
HostKey /etc/ssh/ssh_host_dsa_key
HostKey /etc/ssh/ssh_host_ecdsa_key
HostKey /etc/ssh/ssh_host_ed25519_key
#Privilege Separation is turned on for security
UsePrivilegeSeparation yes
# Lifetime and size of ephemeral version 1 server key
KeyRegenerationInterval 3600
ServerKeyBits 1024
# Logging
SyslogFacility AUTH
LogLevel INFO
# Authentication:
LoginGraceTime 120
PermitRootLogin prohibit-password
StrictModes yes
RSAAuthentication yes
PubkeyAuthentication yes
AuthorizedKeysFile %h/.ssh/authorized_keys
# Don't read the user's ~/.rhosts and ~/.shosts files
IgnoreRhosts yes
# For this to work you will also need host keys in /etc/ssh_known_hosts
RhostsRSAAuthentication no
# similar for protocol version 2
HostbasedAuthentication no
# Uncomment if you don't trust ~/.ssh/known_hosts for RhostsRSAAuthentication
#IgnoreUserKnownHosts yes
# To enable empty passwords, change to yes (NOT RECOMMENDED)
PermitEmptyPasswords no
# Change to yes to enable challenge-response passwords (beware issues with
# some PAM modules and threads)
ChallengeResponseAuthentication no
# Change to no to disable tunnelled clear text passwords
PasswordAuthentication yes
# Kerberos options
#KerberosAuthentication no
#KerberosGetAFSToken no
#KerberosOrLocalPasswd yes
#KerberosTicketCleanup yes
# GSSAPI options
#GSSAPIAuthentication no
#GSSAPICleanupCredentials yes
X11Forwarding yes
X11DisplayOffset 10
PrintMotd no
PrintLastLog yes
TCPKeepAlive yes
#UseLogin no
#MaxStartups 10:30:60
#Banner /etc/issue.net
# Allow client to pass locale environment variables
AcceptEnv LANG LC_*
Subsystem sftp /usr/lib/openssh/sftp-server
# Set this to 'yes' to enable PAM authentication, account processing,
# and session processing. If this is enabled, PAM authentication will
# be allowed through the ChallengeResponseAuthentication and
# PasswordAuthentication. Depending on your PAM configuration,
# PAM authentication via ChallengeResponseAuthentication may bypass
# the setting of "PermitRootLogin without-password".
# If you just want the PAM account and session checks to run without
# PAM authentication, then enable this but set PasswordAuthentication
# and ChallengeResponseAuthentication to 'no'.
UsePAM yes
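
After changing sshd_config, restart the SSH service on Ubuntu so the new settings take effect:

sudo service ssh restart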

Configuration steps

See the official documentation for reference.
On all three virtual machines, under the czx-hadoop user, append the other two nodes' id_rsa.pub to authorized_keys. Start by running ssh localhost on each node to confirm that passwordless login works locally.

mkdir ~/.ssh
chmod 700 ~/.ssh
ssh-keygen -t rsa # generates id_rsa and id_rsa.pub
cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
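
To exchange the keys between the nodes, one option is to copy each node's public key over and append it on the other side, roughly as follows (run on master as czx-hadoop, then repeat the same idea from slave1 and slave2; ssh-copy-id does the same job if it is available):

scp ~/.ssh/id_rsa.pub slave1:/tmp/master.pub
ssh slave1 "cat /tmp/master.pub >> ~/.ssh/authorized_keys && rm /tmp/master.pub"
scp ~/.ssh/id_rsa.pub slave2:/tmp/master.pub
ssh slave2 "cat /tmp/master.pub >> ~/.ssh/authorized_keys && rm /tmp/master.pub"
ssh slave1 # should now log in without a password prompt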

4. Configure the JDK and Hadoop runtime environment

Download the JDK and extract it to /usr/lib/jvm; Hadoop itself can live directly under ~/hadoop in the current user's home directory.
Add the following environment variables, either in the per-user ~/.bashrc or in the global /etc/profile.

#java
export JAVA_HOME=/usr/lib/jvm/java
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#hadoop
export HADOOP_INSTALL=/home/hadoop/hadoop
export PATH=$PATH:$HADOOP_INSTALL/bin
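
For reference, a rough sketch of the unpack steps that match the paths above (the archive and JDK directory names are assumptions; use whichever build you downloaded):

sudo mkdir -p /usr/lib/jvm
sudo tar -xzf jdk-7u80-linux-x64.tar.gz -C /usr/lib/jvm   # assumed archive name
sudo ln -s /usr/lib/jvm/jdk1.7.0_80 /usr/lib/jvm/java     # so JAVA_HOME=/usr/lib/jvm/java resolves
tar -xzf hadoop-1.2.1.tar.gz -C ~ && mv ~/hadoop-1.2.1 ~/hadoop
source ~/.bashrc
java -version && hadoop version                           # both should print their versions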

5. Edit the Hadoop configuration files

See the Hadoop website for reference.

core-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
  <property>
    <!-- namenode RPC address and port -->
    <name>fs.default.name</name>
    <value>hdfs://master:9000</value>
    <final>true</final>
  </property>
  <property>
    <!-- temporary directory -->
    <name>hadoop.tmp.dir</name>
    <value>/home/hadoop/hadoop/tmp</value>
    <description>temp dir</description>
  </property>
</configuration>

hdfs-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
  <property>
    <!-- local directory holding the namenode metadata image; extra copies here act as redundant backups -->
    <name>dfs.name.dir</name>
    <value>/home/hadoop/hadoop/name</value>
    <final>true</final>
  </property>
  <property>
    <!-- local directory where the datanode stores its blocks -->
    <name>dfs.data.dir</name>
    <value>/home/hadoop/hadoop/data</value>
    <final>true</final>
  </property>
  <property>
    <!-- default block replication factor -->
    <name>dfs.replication</name>
    <value>2</value>
    <final>true</final>
  </property>
</configuration>
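
It can help to create the directories referenced by these configs ahead of time on every node, so that ownership and permissions are right (a convenience step; the namenode format and datanode startup will otherwise create them where they can):

mkdir -p /home/hadoop/hadoop/tmp /home/hadoop/hadoop/name /home/hadoop/hadoop/data
chmod 755 /home/hadoop/hadoop/data # the datanode refuses data directories with overly loose permissions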

mapred-site.xml

<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
  <property>
    <!-- JobTracker address and port (set to "local" to run jobs in the same process as the client) -->
    <name>mapred.job.tracker</name>
    <value>192.168.149.135:9001</value>
  </property>
</configuration>

Also set up the masters and slaves files, and point hadoop-env.sh at the JDK; a sketch of all three follows.
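
For this cluster the remaining files would look roughly like this (based on the hostnames used above):

# conf/masters  (host that runs the secondary namenode)
master
# conf/slaves   (hosts that run a datanode and tasktracker)
slave1
slave2
# conf/hadoop-env.sh
export JAVA_HOME=/usr/lib/jvm/java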

6. Sync the configuration to every node over SSH

# copy the hadoop directory to the other nodes
scp -r ./hadoop slave1:~
scp -r ./hadoop slave2:~
# copy the shell profile as well, or configure it by hand if your .bashrc contains other settings
scp ~/.bashrc slave1:~
scp ~/.bashrc slave2:~

7. Format the namenode

# run this step on the master node only:
hadoop namenode -format

8. Start Hadoop and check it with jps and the web UI

# start Hadoop 1.2.1; this lab uses an old release, newer versions are started differently (start-dfs.sh / start-yarn.sh)
start-all.sh
jps # check which Java daemons are running
# check the web UIs at http://master:50030 (JobTracker) and http://master:50070 (NameNode); replace master with the master machine's IP if needed
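
If everything started cleanly, jps should report roughly the following daemons (process IDs omitted):

# on master
NameNode
SecondaryNameNode
JobTracker
# on slave1 and slave2
DataNode
TaskTracker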

Other notes

1. Common HDFS operations

hadoop dfs -ls                    list files in HDFS
hadoop dfs -ls in                 list the files under the in directory in HDFS
hadoop dfs -put test1.txt test    upload a file to the given path under a new name; the upload only succeeds once every DataNode has received the data
hadoop dfs -get in getin          fetch a path from HDFS and rename it getin locally; like put, it works on files as well as directories
hadoop dfs -rmr out               recursively delete the given path from HDFS
hadoop dfs -cat in/*              print the contents of the in directory on HDFS
hadoop dfsadmin -report           show basic HDFS statistics
hadoop dfsadmin -safemode leave   leave safe mode
hadoop dfsadmin -safemode enter   enter safe mode
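
A short example session tying a few of these together (the file and directory names are just placeholders):

echo "hello hadoop" > test1.txt
hadoop dfs -mkdir in
hadoop dfs -put test1.txt in
hadoop dfs -ls in
hadoop dfs -cat in/test1.txt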

2. Adding a node

Scalability is an important feature of HDFS. First install Hadoop on the new node, then edit $HADOOP_HOME/conf/masters on the new node to add the NameNode's hostname, then edit $HADOOP_HOME/conf/slaves on the NameNode to add the new node's hostname, and finally set up passwordless SSH to the new node. A rough command sketch follows.
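
A sketch of bringing the new node online, assuming it is called slave3, already appears in /etc/hosts everywhere, and has the hadoop user, JDK and SSH keys set up:

echo "slave3" >> $HADOOP_HOME/conf/slaves   # on the NameNode
scp -r ~/hadoop slave3:~                    # ship the installation and configuration
ssh slave3 "~/hadoop/bin/hadoop-daemon.sh start datanode"
ssh slave3 "~/hadoop/bin/hadoop-daemon.sh start tasktracker"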

3. Shell auto-install script

#!/bin/bash
#validate user or group
validate() {
if [ "$(id -u)" -eq 0 ];then
echo "must not be root!"
exit 1
else
echo "---------welcome to hadoop---------"
fi
}
#hadoop install
hd-dir() {
if [ ! -d /home/hadoop/ ];then
mkdir /home/hadoop/
else
echo "download hadoop will begin"
fi
}
download-hd() {
wget -c http://archive.apache.org/dist/hadoop/core/hadoop-1.2.1.tar.gz -O /home/hadoop/hadoop-1.2.1.tar.gz
tar -xzvf /home/hadoop/hadoop-1.2.1.tar.gz -C /home/hadoop
rm /home/hadoop/hadoop-1.2.1.tar.gz
ln -s /home/hadoop/hadoop-1.2.1 /home/hadoop/hadoop1.2.1
}
download-java() {
wget -c http://download.oracle.com/otn-pub/java/jdk/7/jdk-7-linux-i586.tar.gz -O /home/hadoop/jdk-7-linux-i586.tar.gz
if [ ! -d /usr/lib/jvm ];then
mkdir /usr/lib/jvm
fi
tar -xzvf /home/hadoop/jdk-7-linux-i586.tar.gz -C /usr/lib/jvm
rm /home/hadoop/jdk-7-linux-i586.tar.gz
}
#hadoop conf
hd-conf() {
# note: JAVA_HOME here assumes /usr/lib/jvm/java is a symlink to the unpacked JDK
echo "export JAVA_HOME=/usr/lib/jvm/java" >> /home/hadoop/hadoop1.2.1/conf/hadoop-env.sh
echo "#set path jdk" >> /home/hadoop/.bashrc
echo "export JAVA_HOME=/usr/lib/jvm/java" >> /home/hadoop/.bashrc
echo "#hadoop path" >> /home/hadoop/.bashrc
echo "export HADOOP_HOME=/home/hadoop/hadoop1.2.1" >> /home/hadoop/.bashrc
# single quotes so the variables are expanded when .bashrc is sourced, not when this script runs
echo 'export PATH=$PATH:$HADOOP_HOME/bin:$JAVA_HOME/bin' >> /home/hadoop/.bashrc
echo "export HADOOP_HOME_WARN_SUPPRESS=1" >> /home/hadoop/.bashrc
#hadoop core-site.xml
echo "<configuration>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<name>fs.default.name</name>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<value>hdfs://master:9000" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<name>hadoop.tmp.dir</name>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "<value>/home/hadoop/tmp</value>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
echo "</configuration>" >> /home/hadoop/hadoop1.2.1/conf/core-site.xml
#hadoop hdfs-site.xml
echo "<configuration>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<name>dfs.name.dir</name>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<value>/home/hadoop/name</value>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<name>dfs.data.dir</name>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<value>/home/hadoop/data</value>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<name>dfs.replication</name>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "<value>1</value>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
echo "</configuration>" >> /home/hadoop/hadoop1.2.1/conf/hdfs-site.xml
# hadoop mapred-site.xml
echo "<configuration>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "<property>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "<name>mapred.job.tracker</name>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "<value>master:9001</value>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "</property>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
echo "</configuration>" >> /home/hadoop/hadoop1.2.1/conf/mapred-site.xml
#hadoop master
echo "hadoop-master" >> /home/hadoop/hadoop1.2.1/conf/masters
#hadoop slaves
echo "hadoop-master" >> /home/hadoop/hadoop1.2.1/conf/slaves
source /home/hadoop/.bashrc
}
hd-start() {
hadoop namenode -format
}
yes-or-no() {
echo "Is your name $* ?"
while true
do
echo -n "Enter yes or no: "
read x
case "$x" in
y | yes ) return 0;;
n | no ) return 1;;
* ) echo "Answer yes or no";;
esac
done
}
echo "Original params are $*"
if yes-or-no "$1"
then
echo "HI $1,nice name!"
validate
hd-dir
download-hd
download-java
hd-conf
else
echo "Never mind!"
fi
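
The script takes a name as its only argument and is meant to be run on each node; a hypothetical invocation (the file name here is arbitrary):

chmod +x hadoop-install.sh
./hadoop-install.sh czx-hadoop # the argument is only used by the yes-or-no greeting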