Install software

download apache-hive-3.1.2-bin.tar.gz from apache hive site and put it into /data2 directory at namenode1

ssh namenode1

su

groupadd hive

useradd -g hive hive

usermod -a -G hadoop hive

passwd hive

cd /app

tar -xzvf /data2/apache-hive-3.1.2-bin.tar.gz -C /app/

mv apache-hive-3.1.2-bin hive

[root@namenode1 app]# chmod -R 755 hive
[root@namenode1 app]# chown -R hive:hive hive


su - hdfs

hdfs dfs -mkdir -p /user/hive/warehouse

hdfs dfs -chmod -R 775/user/hive/warehouse

hdfs dfs -chown -R hive:hadoop /user/hive

hdfs dfs -mkdir /tmp

hdfs dfs -chmod -R 777 /tmp

update configuration files

vi core-site.xml

<configuration>
<property>
<name>fs.defaultFS</name>
<value>hdfs://namenode2:8020</value>
</property>

<property>
<name>hadoop.proxyuser.hive.groups</name>
<value>*</value>
</property>

<property>
<name>hadoop.proxyuser.hive.hosts</name>
<value>*</value>
</property>

</configuration>

create metastore database

login into r02edge(where the postgresql instance is running)

su - postgres

psql
create a new user:
create role hive with login password 'sOmePasWD' valid until 'infinity';

create a database for metastore:
create database hivemetastoredb with encoding='UTF8' owner=hive connection limit=-1;

grant all privileges:
grant all privileges on database hivemetastoredb to hive;



If you need to drop the old version of the database then sessions need to killed:

postgres-# select pg_terminate_backend(pid)
postgres-# from pg_stat_activity
postgres-# where pid in (select pid from pg_stat_activity where datname='hivemetastoredb');

drop database hivemetastoredb;

set the environment variables

get back to namenode1:

su - hive

vi .bashrc

export HIVE_HOME=/app/hive
export HADOOP_HOME=/app/hadoop
export HADOOP_INSTALL=$HADOOP_HOME
export HADOOP_MAPRED_HOME=$HADOOP_HOME
export HADOOP_COMMON_HOME=$HADOOP_HOME
export HADOOP_HDFS_HOME=$HADOOP_HOME
export YARN_HOME=$HADOOP_HOME
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin:${HIVE_HOME}/bin
export HADOOP_OPTS="-Djava.library.path=$HADOOP_HOME/lib/native"
export CLASSPATH=${CLASSPATH}:${HIVE_HOME}/lib:${HADOOP_HOME}/share/hadoop/common/lib

export LD_LIBRARY_PATH=${HADOOP_COMMON_LIB_NATIVE_DIR}:${LD_LIBRARY_PATH}


alias ls=ls

cd /app/hive/conf

update hive-site.xml to use metastore database

vi hive-site.xml

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
<property>
<name>hive.metastore.local</name>
<value>true</value>
</property>

<property>
<name>hive.metastore.warehouse.dir</name>
<value>/user/hive/warehouse</value>
</property>

<property>
<name>javax.jdo.option.ConnectionDriverName</name>
<value>org.postgresql.Driver</value>
</property>

<property>
<name>javax.jdo.option.ConnectionURL</name>
<value>jdbc:postgresql://192.168.2.30:5432/hivemetastoredb</value>
</property>

<property>
<name>javax.jdo.option.ConnectionUserName</name>
<value>hive</value>
</property>

<property>
<name>javax.jdo.option.ConnectionPassword</name>
<value>MySecretPassWord</value>
</property>

<property>
<name>hive.server2.thrift.port</name>
<value>10000</value>
</property>

<property>
<name>hive.server2.enable.doAs</name>
<value>true</value>
</property>

<property>
<name>hive.execution.engine</name>
<value>spark</value>
</property>

<property>
<name>hive.metastore.port</name>
<value>9083</value>
</property>

<property>
<name>mapreduce.input.fileinputformat.input.dir.recursive</name>
<value>true</value>
</property>
</configuration>

run schematool

to avoid  Exception in thread "main" java.lang.NoSuchMethodError: com.google.common.base.Preconditions.checkArgument(ZLjava/lang/String;Ljava/lang/Object;)V


cd /app/hive/lib

rm guava-19.0.1.jar

cp /app/hadoop/share/hadoop/common/lib/guava-27.0-jre.jar .

schematool -dbType postgres -initSchema -passWord pwdMyne -userName hive hivemetastoredb  


Metastore connection URL: jdbc:postgresql://192.168.2.30:5432/hivemetastoredb
Metastore Connection Driver : org.postgresql.Driver
Metastore connection User: hive
Starting metastore schema initialization to 3.1.0
Initialization script hive-schema-3.1.0.postgres.sql

Initialization script completed
schemaTool completed

start the hiveserver2

[hive@namenode1 ~]$ mkdir hiveserver2log
[hive@namenode1 ~]$ cd hiveserver2log
[hive@namenode1 hiveserver2log]$ nohup hive --service hiveserver2 1>hiveserver2.log 2>hiveserver2.err &
[1] 1380

start the metastore

[hive@namenode1 ~]$ mkdir hivesmetastorelog
[hive@namenode1 ~]$ cd hivesmetastorelog
[hive@namenode1 hiveserver2log]$ nohup hive --service metastore  &

the thrift server will start at port 9083 of namenode1:

[hive@namenode1 hivemetastorelog]$ lsof -i:9083
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
java 3857 hive 501u IPv4 172707 0t0 TCP *:emc-pp-mgmtsvc (LISTEN)
java 3857 hive 502u IPv4 170604 0t0 TCP namenode1.custom-built-apps.com:emc-pp-mgmtsvc->r02edge.custom-built-apps.com:33228 (ESTABLISHED)
[hive@namenode1 hivemetastorelog]$ netstat -n | grep 9083
tcp 0 0 192.168.2.25:9083 192.168.2.30:33228 ESTABLISHED

verify installation from hive and beeline

hive --version

Hive 3.1.2
Git git://HW13934/Users/gates/tmp/hive-branch-3.1/hive -r 8190d2be7b7165effa62bd21b7d60ef81fb0e4af


hive

create database dataexplorer1;

exit;

su - hdfs

hdfs dfs -chown dataexplorer1:hadoop  /hive/warehouse/dataexplorer1.db

hdfs dfs -chown -R dataexplorer1:hadoop /tmp/hadoop-yarn/staging/dataexplorer1

su - dataexplorer1

hive

use dataexplorer1;

create table hosts(id integer,hostname string);

insert into hosts values(1,'namenode1.custom-built-apps.com')

select * from hosts;

OK
1 namenode1.custom-built-apps.com
Time taken: 4.911 seconds, Fetched: 1 row(s)

exit;

beeline -u jdbc:hive2://namenode1:10000/dataexplorer1 -n dataexplorer1

Connected to: Apache Hive (version 3.1.2)
Driver: Hive JDBC (version 3.1.2)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 3.1.2 by Apache Hive
0: jdbc:hive2://> select * from dataexplorer1.hosts;
OK
+-----------+----------------------------------+
| hosts.id     |           hosts.hostname                |
+-----------+----------------------------------+
| 1                | namenode1.custom-built-apps.com |
+-----------+----------------------------------+
1 row selected (5.829 seconds)


create a script : beeline_cba

#!/bin/sh

beeline -u jdbc:hive2://namenode1:10000/dataexplorer1 -n ${USER}

creating a user database

su - dataexplorer1

beeline_cba

create database mydb location '/user/dataexplorer1/mydb';

use mydb;

create table mydb.mytab(id string);

insert into mydb.mytab values('first insert');



if spark is installed :

spark-sql

insert into hosts values(2,'namenode2.custom-built-apps.com')

this is a faster approach then changing hive engine to spark

moreover if the beeline is not needed, there is no need to start hiveserver2,  in version 3 hivemetastore is a standalone package and this is all that spark needs to connect to the underlying tables.


check the HiveServer2 WebUI

http://namenode1.custom-built-apps.com:10002

set the backup of the metastore database


command line postgress metastore db backup/restore

su - hive

vi $HOME/.pgpass

r02edge:5432:hivemetastoredb:hive:PaSSword_here

mkdir backups

cd backups

pg_dump -F c -b -v -f hivemetastoredb.backup_080320202 hivemetastoredb

the database can be restored using psql:

psql hivemetastoredb <hivemetastoredb.backup_080320202

Loss/corruption of metastore database resiliency 

tables creation process must be resilient to the loss of metastore database.

the approach is:

  1. create a hive database
  2. create the tables as external
  3. keep the table creation scripts
  4. if the metastore database cannot be restored→ recreate the database
  5. run the schematool to recreate the metastore database
  6. run the table creation scripts, the data in external tables will not be lost in case of metastore loss.
  7. verify the metastrore tbls table contain all the tables  metadata.





Attachments: