create database if not exists bisys comment 'bisys system database' location '/user/hive/warehouse/bisys.db' with dbproperties ('creator'='hadoop','date'='2019-12-26');
create database if not exists bidwh comment 'bidwh system database' location '/user/hive/warehouse/bidwh.db' with dbproperties ('creator'='hadoop','date'='2019-06-21');
create database if not exists biolap comment 'biolap system database' location '/user/hive/warehouse/biolap.db' with dbproperties ('creator'='hadoop','date'='2019-06-24');
-- note: biolap already exists after the statement above, so with IF NOT EXISTS this second definition is a no-op
create database if not exists biolap comment 'biolap system database' location '/user/wdxx/db/biolap.db' with dbproperties ('creator'='wdxx','date'='2019-06-26');
----------------------new---------------------------------------------
create database if not exists bibuf comment 'bibuf system database' location '/user/wdxx/db/bibuf.db' with dbproperties ('creator'='wdxx','date'='2020-04-26');
create database if not exists bicore comment 'bicore system database' location '/user/wdxx/db/bicore.db' with dbproperties ('creator'='wdxx','date'='2020-04-26');
create database if not exists biodb comment 'biodb system database' location '/user/wdxx/db/biodb.db' with dbproperties ('creator'='wdxx','date'='2020-04-26');
create database if not exists bidwh comment 'bidwh system database' location '/user/wdxx/db/bidwh.db' with dbproperties ('creator'='wdxx','date'='2020-04-26');
create database if not exists biolap comment 'biolap system database' location '/user/wdxx/db/biolap.db' with dbproperties ('creator'='wdxx','date'='2020-04-26');
create table IF NOT EXISTS EXCEL_ZB_LIST
(
ywymc STRING,
zywymc STRING,
ztmc STRING,
zbbm STRING,
zbmc STRING,
zbjc STRING,
tjkj STRING,
dw STRING,
jsgs STRING,
zbly STRING,
zbcc STRING,
tjpl STRING,
sflj STRING,
zbdy STRING,
wd STRING,
sfyszb STRING,
clsj DATE, -- processing time; the Oracle-style "default sysdate not null" is not valid HiveQL
file_name STRING,
hzbbm STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS SEQUENCEFILE
LOCATION '/user/hive/warehouse/bisys.db/EXCEL_ZB_LIST';
create table TB_HZB_WD
(
wdbid STRING,
wdcol STRING,
wdlm STRING,
wdtext STRING,
zbbm STRING,
hzbbm STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS SEQUENCEFILE
location '/user/hive/warehouse/bisys.db/TB_HZB_WD';
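Note that both tables above are declared STORED AS SEQUENCEFILE, and LOAD DATA does not convert file formats, so tab-delimited text files cannot simply be loaded into them. A minimal sketch of the usual route, assuming a hypothetical text-format staging table tb_hzb_wd_stg with the same columns:
insert into table TB_HZB_WD
select wdbid, wdcol, wdlm, wdtext, zbbm, hzbbm from tb_hzb_wd_stg;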
To manage tables sensibly and improve query performance, Hive can organize a table into "partitions". A partition is simply a subdirectory under the table's directory; a table can be partitioned along several dimensions, and the partitions relate to one another as a directory tree.
1. Creating partitioned tables
Partitions are declared with the PARTITIONED BY clause; the order of the partition columns determines which level is the parent directory and which the child.
Create a partitioned table with a single partition column:
CREATE TABLE IF NOT EXISTS part_test(
c1 string,
c2 string,
c3 string,
c4 string)
PARTITIONED BY (day_id string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
Create a partitioned table with two partition columns:
CREATE TABLE IF NOT EXISTS part_test_1(
c1 string,
c2 string,
c3 string,
c4 string)
PARTITIONED BY (month_id string,day_id string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
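For illustration, once a couple of days of data have been loaded, part_test_1's directory tree would look roughly like this (assuming the default warehouse location):
/user/hive/warehouse/part_test_1/month_id=201805/day_id=20180509/
/user/hive/warehouse/part_test_1/month_id=201806/day_id=20180609/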
2. External partitioned tables
An external table can also be partitioned. Suppose the HDFS directory /user/tuoming/part contains
two subdirectories, 201805 and 201806; 201805 holds one subdirectory, 20180509, and 201806 holds two, 20180609 and 20180610.
Create an external partitioned table mapped to the /user/tuoming/part directory:
CREATE EXTERNAL TABLE IF NOT EXISTS part_test_2(
c1 string,
c2 string,
c3 string,
c4 string)
PARTITIONED BY (month_id string,day_id string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
STORED AS TEXTFILE
LOCATION '/user/tuoming/part';
Add partitions to part_test_2:
alter table part_test_2 add partition(month_id='201805',day_id='20180509') location '/user/tuoming/part/201805/20180509';
alter table part_test_2 add partition(month_id='201806',day_id='20180609') location '/user/tuoming/part/201806/20180609';
alter table part_test_2 add partition(month_id='201806',day_id='20180610') location '/user/tuoming/part/201806/20180610';
Use show partitions to list the partitions of part_test_2: show partitions part_test_2;
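With the three partitions added above, the output should look like this:
month_id=201805/day_id=20180509
month_id=201806/day_id=20180609
month_id=201806/day_id=20180610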
3. Internal partitioned tables
Create an internal (managed) table with month_id as the parent partition and day_id as the child partition:
CREATE TABLE IF NOT EXISTS part_test_3(
c1 string,
c2 string,
c3 string,
c4 string)
PARTITIONED BY (month_id string,day_id string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '|'
STORED AS TEXTFILE;
Loading data into an internal partitioned table
(1) Load from an HDFS directory with load data inpath ... overwrite into table ... partition:
load data inpath '/user/tuoming/test/test' overwrite into table part_test_3 partition(month_id='201805',day_id='20180509');
(2) Load from a query result with insert overwrite table / insert into ... partition.
Overwrite insert:
insert overwrite table part_test_3 partition(month_id='201805',day_id='20180509') select * from part_test_temp;
Append insert:
insert into part_test_3 partition(month_id='201805',day_id='20180509') select * from part_test_temp;
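As a quick sanity check (illustrative only), confirm that the partition received rows:
select count(*) from part_test_3 where month_id='201805' and day_id='20180509';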
Note: with either of the two loading methods above there is no need to pre-create the partition; it is created automatically when the data is loaded. To pre-create a partition on an internal table, first create the partition directory under the table directory with hadoop fs -mkdir, then register it with alter table ... add partition, for example:
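A minimal sketch of that sequence, assuming part_test_3 lives under the default warehouse path:
hadoop fs -mkdir -p /user/hive/warehouse/part_test_3/month_id=201806/day_id=20180610
alter table part_test_3 add partition(month_id='201806',day_id='20180610');
Without an explicit location clause, Hive maps the new partition to its default path, which is exactly the directory just created.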
4. Dropping partitions
Drop a partition with alter table ... drop partition:
alter table part_test_3 drop partition(day_id='20180509');
(Specifying only day_id drops every partition with that day_id, whatever its month_id.)
Note: on an external partitioned table, alter table ... drop partition removes only the metadata; the corresponding directory and files are kept. On an internal table the same statement removes both the metadata and the corresponding directory and data files.
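If the data behind a dropped external partition must also be removed, delete the directory by hand (illustrative, using the path from the example above):
hadoop fs -rm -r /user/tuoming/part/201805/20180509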
5. Dynamic partitions
The insert overwrite table ... partition ... statements above load query results into one explicitly named partition, one insert statement per partition. When a single statement has to fill several partitions at once, use dynamic partitioning: rows are routed to partitions according to the values produced by the query. The difference from static partitioning is that no partition is named in advance; Hive chooses the target partition from the actual data.
# enable dynamic partitioning
set hive.exec.dynamic.partition=true;
# allow all partition columns to be dynamic
set hive.exec.dynamic.partition.mode=nonstrict;
By default Hive allows 100 dynamic partitions and throws an exception when the limit is exceeded.
Set these parameters before running the insert:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=10000; -- maximum number of dynamic partitions each node may create; default 100
set hive.exec.max.dynamic.partitions=10000; -- maximum number of dynamic partitions one DML statement may create; default 1000
set hive.exec.max.created.files=10000; -- maximum number of files one DML statement may create; default 100000
# month_id static, day_id dynamic:
insert overwrite table dynamic_test partition(month_id='201710',day_id)
select c1,c2,c3,c4,c5,c6,c7,day_id from kafka_offset where substr(day_id,1,6)='201710';
# both month_id and day_id dynamic:
insert overwrite table dynamic_test partition(month_id,day_id)
select c1,c2,c3,c4,c5,c6,c7,substr(day_id,1,6) as month_id,day_id from kafka_offset;
To keep rows with identical partition-column values in the same mapreduce task, so that each task creates as few new directories as possible, use distribute by to group rows by partition value:
insert overwrite table dynamic_test partition(month_id,day_id)
select c1,c2,c3,c4,c5,c6,c7,substr(day_id,1,6) as month_id,day_id from kafka_offset
distribute by month_id,day_id;
To empty a database, for example dropping every table under the default database:
hive -e "use default;show tables">tbname.txt
cat tbname.txt|while read line
do
hive -e "use default;drop table $line"
done
An optimized one-shot script that drops all tables under a database in a single Hive session; for a different database just change the database name:
hive -e "use biolap;show tables;">tbname.txt
cat tbname.txt|while read line
do
echo -n "drop table $line;">>droptables.txt
done
tables=`cat droptables.txt`
echo $tables
hive -e "use biolap;$tables"
rm -f droptables.txt
rm -f tbname.txt
Parameters to set for dynamic partitioning:
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions.pernode=10000;
set hive.exec.max.dynamic.partitions=10000;
set hive.exec.max.created.files=10000;
set spark.executor.memory=12g;
set spark.executor.cores=1;
set spark.executor.instances=10;
set spark.sql.shuffle.partitions=20000;