PXF Access to HDFS

1. Hadoop environment variables:

export HADOOP_HOME=/data/hadoop-2.9.2
export HADOOP_CONF_DIR=/data/hadoop-2.9.2/etc/hadoop
export PATH=$PATH:$HADOOP_HOME/bin:$HADOOP_HOME/sbin
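
These exports only take effect in new shells; a common approach (assuming the mxadmin user's ~/.bashrc is the profile in use) is to append them there, reload it, and verify that the client is on the PATH:

# reload the profile so the variables take effect in the current shell
source ~/.bashrc
# sanity check: the hadoop client should now resolve
hadoop version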

2. Hadoop configuration:

mkdir /data/hadoop-2.9.2/name
mkdir /data/hadoop-2.9.2/data
mkdir /data/hadoop-2.9.2/tmp

## Parameter configuration: set JAVA_HOME in hadoop-env.sh
vim $HADOOP_CONF_DIR/hadoop-env.sh
export JAVA_HOME=/data/jdk1.8

3. core-site.xml

vim $HADOOP_CONF_DIR/core-site.xml
<configuration>
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://matrixdb01:9000</value>
  </property>
  <property>
    <name>hadoop.proxyuser.mxadmin.hosts</name>
    <value>*</value>
  </property>
  <property>
    <!-- groups allowed to access hadoop through the proxy user; required -->
    <name>hadoop.proxyuser.mxadmin.groups</name>
    <value>*</value>
  </property>
  <property>
    <!-- users allowed to access hadoop through the proxy user; required -->
    <name>hadoop.proxyuser.mxadmin.users</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/data/hadoop-2.9.2/tmp</value>
  </property>
</configuration>
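
The three hadoop.proxyuser.mxadmin.* properties let the mxadmin user, which PXF connects as, impersonate other users and groups; without them the NameNode rejects PXF requests (hence "required" above). A quick sanity check that the client actually picks up core-site.xml (standard hdfs getconf usage):

hdfs getconf -confKey fs.defaultFS
hdfs getconf -confKey hadoop.proxyuser.mxadmin.hosts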

4. hdfs-site.xml

vim $HADOOP_CONF_DIR/hdfs-site.xml
<configuration>
  <property>
    <name>dfs.replication</name>
    <value>3</value>
  </property>
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>/data/hadoop-2.9.2/name</value>
  </property>
  <property>
    <name>dfs.datanode.data.dir</name>
    <value>/data/hadoop-2.9.2/data</value>
  </property>
  <property>
    <name>dfs.namenode.rpc-bind-host</name>
    <value>0.0.0.0</value>
  </property>
</configuration>

5. mapred-site.xml

vim $HADOOP_CONF_DIR/mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
   <property>
        <name>mapreduce.framework.name</name>
        <value>yarn</value>
   </property>
</configuration>

6. yarn-site.xml

vim $HADOOP_CONF_DIR/yarn-site.xml
<?xml version="1.0"?>
<configuration>
<!-- Site specific YARN configuration properties -->
    <property>
        <name>yarn.resourcemanager.hostname</name>
        <value>matrixdb01</value>
    </property>
    <property>
        <name>yarn.nodemanager.aux-services</name>
        <value>mapreduce_shuffle</value>
    </property>
</configuration>

7. slaves

vim $HADOOP_CONF_DIR/slaves
matrixdb01
matrixdb02
matrixdb03
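
start-all.sh reaches every host listed in slaves over SSH, so passwordless SSH from matrixdb01 to all three hosts (including itself) must already work. A minimal sketch for the mxadmin user, assuming no keys exist yet:

ssh-keygen -t rsa                 # accept the defaults
ssh-copy-id mxadmin@matrixdb01    # matrixdb01 is itself a datanode
ssh-copy-id mxadmin@matrixdb02
ssh-copy-id mxadmin@matrixdb03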

8. Sync to the other nodes

scp -r /data/hadoop-2.9.2 matrixdb02:/data/
scp -r /data/hadoop-2.9.2 matrixdb03:/data/
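
On a fresh cluster the NameNode metadata directory must be formatted once before the first start; this assumes an initial deployment (formatting an existing cluster destroys its metadata):

hdfs namenode -format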

9. Start

[mxadmin@matrixdb01 hadoop]$ start-all.sh 
This script is Deprecated. Instead use start-dfs.sh and start-yarn.sh
Starting namenodes on [matrixdb01]
matrixdb01: starting namenode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-namenode-matrixdb01.out
matrixdb03: starting datanode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-datanode-matrixdb03.out
matrixdb01: starting datanode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-datanode-matrixdb01.out
matrixdb02: starting datanode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-datanode-matrixdb02.out
Starting secondary namenodes [0.0.0.0]
0.0.0.0: starting secondarynamenode, logging to /data/hadoop-2.9.2/logs/hadoop-mxadmin-secondarynamenode-matrixdb01.out
starting yarn daemons
resourcemanager running as process 10127. Stop it first.
matrixdb03: starting nodemanager, logging to /data/hadoop-2.9.2/logs/yarn-mxadmin-nodemanager-matrixdb03.out
matrixdb02: starting nodemanager, logging to /data/hadoop-2.9.2/logs/yarn-mxadmin-nodemanager-matrixdb02.out
matrixdb01: starting nodemanager, logging to /data/hadoop-2.9.2/logs/yarn-mxadmin-nodemanager-matrixdb01.out

10. Check the processes

[mxadmin@matrixdb01 hadoop]$ jps
 11889 NameNode
 13010 Jps
 12484 NodeManager
 12247 SecondaryNameNode
 12058 DataNode
 10127 ResourceManager
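
jps only lists the daemons on the local host; to confirm that all three DataNodes registered with the NameNode, the standard admin report can be used:

hdfs dfsadmin -report | grep -E 'Live datanodes|^Name:'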

Create directories:

[mxadmin@matrixdb01 hadoop]$ hdfs dfs -mkdir /greenplum
[mxadmin@matrixdb01 data]$ hdfs dfs -mkdir /greenplum/pxf_examples
[mxadmin@matrixdb01 data]$ hdfs dfs -ls /
 drwxr-xr-x - mxadmin supergroup 0 2020-12-30 11:14 /greenplum
[mxadmin@matrixdb01 data]$ hdfs dfs -ls /greenplum/
 drwxr-xr-x - mxadmin supergroup 0 2020-12-30 11:14 /greenplum/pxf_examples
[mxadmin@matrixdb01 data]$ echo 'Prague,Jan,101,4875.33
Rome,Mar,87,1557.39
Bangalore,May,317,8936.99
Beijing,Jul,411,11600.67' > pxf_hdfs_simple.txt
[mxadmin@matrixdb01 data]$ hdfs dfs -put /data/pxf_hdfs_simple.txt /greenplum/pxf_examples/
[mxadmin@matrixdb01 data]$ hdfs dfs -cat /greenplum/pxf_examples/pxf_hdfs_simple.txt
 Prague,Jan,101,4875.33
 Rome,Mar,87,1557.39
 Bangalore,May,317,8936.99
 Beijing,Jul,411,11600.67

11. Create a PXF Server configuration

PXF ships with a Default Server, stored at $PXF_CONF/servers/default.

mkdir $PXF_CONF/servers/single_hdfs
cp $HADOOP_CONF_DIR/core-site.xml $PXF_CONF/servers/single_hdfs/
cp $HADOOP_CONF_DIR/hdfs-site.xml $PXF_CONF/servers/single_hdfs/
scp -r single_hdfs matrixdb02:`pwd`
scp -r single_hdfs matrixdb03:`pwd`
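
If the pxf command-line utility is installed on the coordinator (standard with Greenplum PXF; assumed to apply here), pxf cluster sync distributes everything under $PXF_CONF/servers to all hosts and can replace the manual scp; a restart then picks up the new server directory:

pxf cluster sync
pxf cluster restart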

12. Create the pxf_fdw EXTENSION

createdb pxfhdfs
pxfhdfs=# CREATE EXTENSION pxf_fdw ;

13. Create the FDW Server

CREATE SERVER hdfs_svr FOREIGN DATA WRAPPER hdfs_pxf_fdw OPTIONS ( config 'single_hdfs' );

14. Create an FDW User Mapping for the mxadmin user

CREATE USER MAPPING FOR mxadmin SERVER hdfs_svr;
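
Both objects can be verified from the standard PostgreSQL catalogs, e.g. via psql:

psql pxfhdfs -c "SELECT srvname, srvoptions FROM pg_foreign_server;"
psql pxfhdfs -c "SELECT * FROM pg_user_mappings;"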

15. Create an FDW Foreign Table pointing at the corresponding HDFS file

CREATE FOREIGN TABLE pxf_hdfs_table (
    location text,
    month text,
    num_orders int,
    total_sales float8
) SERVER hdfs_svr
  OPTIONS ( resource '/greenplum/pxf_examples/pxf_hdfs_simple.txt', format 'text', delimiter ',' );

16. Query the Foreign Table to read the data

pxfhdfs=# SELECT * FROM pxf_hdfs_table ;
 location  | month | num_orders | total_sales
-----------+-------+------------+-------------
 Prague    | Jan   |        101 |     4875.33
 Rome      | Mar   |         87 |     1557.39
 Bangalore | May   |        317 |     8936.99
 Beijing   | Jul   |        411 |    11600.67
(4 rows)

17. Create an FDW Foreign Table pointing at the corresponding HDFS directory

Make sure the HDFS directory has been created first.
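
For example, matching the resource path used in the table definition below:

hdfs dfs -mkdir /greenplum/pxf_dir_examples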

CREATE FOREIGN TABLE pxf_hdfsdir_table (
    location text,
    month text,
    num_orders int,
    total_sales float8
) SERVER hdfs_svr
  OPTIONS ( resource '/greenplum/pxf_dir_examples', format 'text', delimiter ',' );

18. Write data into the Foreign Table

pxfhdfs=# INSERT INTO pxf_hdfsdir_table SELECT * FROM pxf_hdfs_table ;
 INSERT 0 4
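
On the HDFS side, each writing segment produces its own file under the target directory (exact file names vary by segment), which can be confirmed with:

hdfs dfs -ls /greenplum/pxf_dir_examples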

19. Query the Foreign Table to read all data under the directory

pxfhdfs=# SELECT COUNT(*) FROM pxf_hdfsdir_table ;
 count
-------
     4
 (1 row)