PXF Access to HDFS
1. Hadoop environment variables:
export HADOOP_HOME=/data/hadoop-2.9.2
export HADOOP_CONF_DIR=/data/hadoop-2.9.2/etc/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
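To confirm the variables take effect (assuming they were appended to the login profile, e.g. ~/.bashrc, and re-sourced):
source ~/.bashrc
echo $HADOOP_CONF_DIR   # should print /data/hadoop-2.9.2/etc/hadoop
hadoop version          # should report Hadoop 2.9.2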
2. Hadoop configuration:
mkdir /data/hadoop-2.9.2/name
mkdir /data/hadoop-2.9.2/data
mkdir /data/hadoop-2.9.2/tmp
## Parameter settings: set JAVA_HOME in hadoop-env.sh
vim $HADOOP_CONF_DIR/hadoop-env.sh
export JAVA_HOME=/data/jdk1.8
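A non-interactive alternative is simply appending the line (a sketch; adjust the JDK path to your installation, and note that the last export in the file wins):
echo 'export JAVA_HOME=/data/jdk1.8' >> $HADOOP_CONF_DIR/hadoop-env.sh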
3. core-site.xml
vim $HADOOP_CONF_DIR/core-site.xml
<configuration>
    <property>
        <name>fs.defaultFS</name>
        <value>hdfs://matrixdb01:9000</value>
    </property>
    <property>
        <name>hadoop.proxyuser.mxadmin.hosts</name>
        <value>*</value>
    </property>
    <!-- the user groups allowed to access Hadoop; required -->
    <property>
        <name>hadoop.proxyuser.mxadmin.groups</name>
        <value>*</value>
    </property>
    <!-- the users allowed to access Hadoop; required -->
    <property>
        <name>hadoop.proxyuser.mxadmin.users</name>
        <value>*</value>
    </property>
    <property>
        <name>hadoop.tmp.dir</name>
        <value>/data/hadoop-2.9.2/tmp</value>
    </property>
</configuration>
4. hdfs-site.xml
vim $HADOOP_CONF_DIR/hdfs-site.xml
<configuration>
    <property>
        <name>dfs.replication</name>
        <value>3</value>
    </property>
    <property>
        <name>dfs.namenode.name.dir</name>
        <value>/data/hadoop-2.9.2/name</value>
    </property>
    <property>
        <name>dfs.datanode.data.dir</name>
        <value>/data/hadoop-2.9.2/data</value>
    </property>
    <property>
        <name>dfs.namenode.rpc-bind-host</name>
        <value>0.0.0.0</value>
    </property>
</configuration>
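hdfs getconf reads the configuration without needing a running cluster, so it is a cheap sanity check that the values above are actually picked up:
hdfs getconf -confKey fs.defaultFS      # expect hdfs://matrixdb01:9000
hdfs getconf -confKey dfs.replication   # expect 3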
5. mapred-site.xml
vim $HADOOP_CONF_DIR/mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
    <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
    </property>
</configuration>
6. yarn-site.xml
vim $HADOOP_CONF_DIR/yarn-site.xml
<?xml version="1.0"?>
<configuration>
    <!-- Site specific YARN configuration properties -->
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>matrixdb01</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>
7. slaves
vim $HADOOP_CONF_DIR/slaves
matrixdb01
matrixdb02
matrixdb03
8. Sync to the other nodes
scp -r /data/hadoop-2.9.2 matrixdb02:/data/
scp -r /data/hadoop-2.9.2 matrixdb03:/data/
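For a brand-new cluster, format the NameNode once on matrixdb01 before the first start. Skip this if the cluster already holds data, since formatting erases all HDFS metadata:
hdfs namenode -format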
9. Start the cluster
[mxadmin@matrixdb01 hadoop]$ start-all.sh
This script is Deprecated. Instead use start-dfs.sh and start-yarn.sh
Starting namenodes on [matrixdb01]
matrixdb01: starting namenode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-namenode-matrixdb01.out
matrixdb03: starting datanode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-datanode-matrixdb03.out
matrixdb01: starting datanode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-datanode-matrixdb01.out
matrixdb02: starting datanode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-datanode-matrixdb02.out
Starting secondary namenodes [0.0.0.0]
0.0.0.0: starting secondarynamenode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-secondarynamenode-matrixdb01.out
starting yarn daemons
resourcemanager running as process 10127. Stop it first.
matrixdb03: starting nodemanager, logging to /data/hadoop-2.9.2/logs/yarn-mxadmin-nodemanager-matrixdb03.out
matrixdb02: starting nodemanager, logging to /data/hadoop-2.9.2/logs/yarn-mxadmin-nodemanager-matrixdb02.out
matrixdb01: starting nodemanager, logging to /data/hadoop-2.9.2/logs/yarn-mxadmin-nodemanager-matrixdb01.out
10. Check the processes
[mxadmin@matrixdb01 hadoop]$ jps
11889 NameNode
13010 Jps
12484 NodeManager
12247 SecondaryNameNode
12058 DataNode
10127 ResourceManager
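jps only shows the local daemons; to confirm that all three DataNodes and NodeManagers actually registered:
hdfs dfsadmin -report   # expect "Live datanodes (3)"
yarn node -list         # expect 3 nodes in RUNNING state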
Create the directories:
[mxadmin@matrixdb01 hadoop]$ hdfs dfs -mkdir /greenplum
[mxadmin@matrixdb01 data]$ hdfs dfs -mkdir /greenplum/pxf_examples
[mxadmin@matrixdb01 data]$ hdfs dfs -ls /
drwxr-xr-x - mxadmin supergroup 0 2020-12-30 11:14 /greenplum
[mxadmin@matrixdb01 data]$ hdfs dfs -ls /greenplum/
drwxr-xr-x - mxadmin supergroup 0 2020-12-30 11:14 /greenplum/pxf_examples
[mxadmin@matrixdb01 data]$ echo 'Prague,Jan,101,4875.33
Rome,Mar,87,1557.39
Bangalore,May,317,8936.99
Beijing,Jul,411,11600.67' > pxf_hdfs_simple.txt
hdfs dfs -put /data/pxf_hdfs_simple.txt /greenplum/pxf_examples/
[mxadmin@matrixdb01 data]$ hdfs dfs -cat /greenplum/pxf_examples/pxf_hdfs_simple.txt
Prague,Jan,101,4875.33
Rome,Mar,87,1557.39
Bangalore,May,317,8936.99
Beijing,Jul,411,11600.67
11. Create the PXF Server configuration
PXF comes with a Default Server, stored in $PXF_CONF/servers/default; each additional server is a directory alongside it:
mkdir $PXF_CONF/servers/single_hdfs
cp $HADOOP_CONF_DIR/core-site.xml $PXF_CONF/servers/single_hdfs/
cp $HADOOP_CONF_DIR/hdfs-site.xml $PXF_CONF/servers/single_hdfs/
cd $PXF_CONF/servers
scp -r single_hdfs matrixdb02:`pwd`
scp -r single_hdfs matrixdb03:`pwd`
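Depending on the PXF version, the pxf CLI can push the server configuration to all hosts instead of manual scp (assuming pxf is on PATH):
pxf cluster sync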
12. Create the pxf_fdw EXTENSION
createdb pxfhdfs
pxfhdfs=# CREATE EXTENSION pxf_fdw ;
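A quick sanity check from the shell that the extension and its foreign-data wrappers are installed:
psql pxfhdfs -c '\dx pxf_fdw'
psql pxfhdfs -c '\dew'          # hdfs_pxf_fdw should appear in the list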
13. Create the FDW Server
CREATE SERVER hdfs_svr FOREIGN DATA WRAPPER hdfs_pxf_fdw OPTIONS ( config 'single_hdfs' );
14. Create an FDW User Mapping for the mxadmin user
CREATE USER MAPPING FOR mxadmin SERVER hdfs_svr;
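The server and the mapping can be verified with psql meta-commands:
psql pxfhdfs -c '\des+'   # foreign servers; Options should show config 'single_hdfs'
psql pxfhdfs -c '\deu+'   # user mappings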
15. Create an FDW Foreign Table pointing at the corresponding HDFS file
CREATE FOREIGN TABLE pxf_hdfs_table (
    location text,
    month text,
    num_orders int,
    total_sales float8
) SERVER hdfs_svr
  OPTIONS ( resource '/greenplum/pxf_examples/pxf_hdfs_simple.txt', format 'text', delimiter ',' );
16. Query the Foreign Table for the data
pxfhdfs=# SELECT * FROM pxf_hdfs_table ;
location | month | num_orders | total_sales
-----------+-------+------------+-------------
Prague | Jan | 101 | 4875.33
Rome | Mar | 87 | 1557.39
Bangalore | May | 317 | 8936.99
Beijing | Jul | 411 | 11600.67
(4 rows)
17. Create an FDW Foreign Table pointing at the corresponding HDFS directory
Make sure the HDFS directory exists first:
hdfs dfs -mkdir /greenplum/pxf_dir_examples
CREATE FOREIGN TABLE pxf_hdfsdir_table (
    location text,
    month text,
    num_orders int,
    total_sales float8
) SERVER hdfs_svr
  OPTIONS ( resource '/greenplum/pxf_dir_examples', format 'text', delimiter ',' );
18. Write data into the Foreign Table
pxfhdfs=# INSERT INTO pxf_hdfsdir_table SELECT * FROM pxf_hdfs_table ;
INSERT 0 4
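Each writing segment typically produces its own file under the target directory, which can be confirmed from HDFS (file names vary per run):
hdfs dfs -ls /greenplum/pxf_dir_examples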
19. Query the Foreign Table to read all data under the directory
pxfhdfs=# SELECT COUNT(*) FROM pxf_hdfsdir_table ;
count
-------
4
(1 row)