腳本化加載文件與轉儲
阿新 • • 發佈:2018-08-31
mem rtu ffi hdf conf 工作 awk dbo sql文件
腳本化加載文件與轉儲
1、加載數據到原生表
1.1 介紹
由於每天都會產生大量的日誌數據,需要對每天的日誌進行加載與清洗以及轉儲,編寫腳本文件後,通過azkaban進行調度即可。
1.2 編寫load_data_to_hive_raw_logs.sql
加載數據到hive原生表,注意分區值是動態傳入的,每天只加載前一天的數據。其中${hiveconf:ym}、${hiveconf:day}、${hiveconf:hm}部分會在執行時被替換成具體的時間值。
--load_data_to_hive_raw_logs.sql
-- Loads one time slice of raw log files from HDFS into the matching
-- partition of the raw_logs table.
-- Required variables (passed with -hiveconf): ym, day, hm.
-- They are substituted both into the HDFS source path and into the
-- partition spec, so the directory layout and the partition values stay aligned.
-- NOTE(review): the path assumes the HDFS directory is named "raw_logs"
-- (no embedded space) - confirm against the cluster layout.
use umeng_big11 ;

load data inpath 'hdfs://mycluster/user/centos/umeng/raw_logs/${hiveconf:ym}/${hiveconf:day}/${hiveconf:hm}'
into table raw_logs
partition(ym=${hiveconf:ym}, day=${hiveconf:day}, hm=${hiveconf:hm}) ;
1.3 編寫load_data_to_hive_raw_logs.sh
該腳本負責調用上面的sql腳本,調用前需要將參數進行填充。
[load_data_to_hive_raw_logs.sh]
#!/bin/bash
# Loads one time slice of raw logs into the Hive raw_logs table.
#
# Usage:
#   load_data_to_hive_raw_logs.sh                 # time slice = 3 minutes ago
#   load_data_to_hive_raw_logs.sh <ym> <day> <hm> # explicit time slice
#
# The chosen time slice is written to the file _time in the working
# directory as "ym-day-hm", then split into its three parts and passed to
# load_data_to_hive_raw_logs.sql through -hiveconf.

# Abort if the working directory is missing; otherwise _time and the
# relative .sql path below would resolve somewhere unexpected.
cd /home/centos/umeng || exit 1

if [[ $# -eq 0 ]] ; then
    # Default: the slice from three minutes ago, formatted as yyyyMM-dd-HHmm.
    time=$(date -d "-3 minutes" "+%Y%m-%d-%H%M")
elif [[ $# -eq 3 ]] ; then
    time=$1-$2-$3
else
    echo "usage: $0 [<ym> <day> <hm>]" >&2
    exit 1
fi

# external time variable
echo -n "$time" > _time

# Split "ym-day-hm" into its three fields.
ym=$(echo "$time" | awk -F '-' '{print $1}')
day=$(echo "$time" | awk -F '-' '{print $2}')
hm=$(echo "$time" | awk -F '-' '{print $3}')

hive -hiveconf ym="${ym}" -hiveconf day="${day}" -hiveconf hm="${hm}" -f load_data_to_hive_raw_logs.sql
1.4 修改腳本權限並執行
#增加執行權限
$>chmod +x load_data_to_hive_raw_logs.sh
#調用腳本,指定具體時間(參數依次為年月yyyyMM、日dd、時分HHmm,與腳本默認生成的時間格式一致)
$>./load_data_to_hive_raw_logs.sh 201802 04 0830
#不帶參數時,默認使用當前時間3分鐘前的時間值
$>./load_data_to_hive_raw_logs.sh
2、叉分並轉儲
2.1 說明
加載原生表的日誌需要進行叉分,時間對齊以及地域信息處理分別轉儲到5張日誌子表中。日誌子表也都是分區表,因此查出來的數據需要動態指定分區表。
2.2 叉分startuplogs表
2.2.1 編寫sql文件
[fork_startuplogs.sql]
--startuplog, dynamic partitioning
-- Forks the startup-log records out of one raw_logs time slice and appends
-- them to the startuplogs table.
-- Required variables (passed with -hiveconf):
--   ym, day, hm : the raw_logs partition to read
--   func        : name of the fork function to call (one of those registered below)
use umeng_big11 ;
set hive.cli.print.header=true ;

-- All three partition columns of the target table are dynamic, which Hive
-- only permits with dynamic partitioning enabled in nonstrict mode.
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;

add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;

insert into startuplogs partition(ym,day,hm)
select
    t.appChannel ,
    t.appId ,
    t.appPlatform ,
    t.appVersion ,
    t.brand ,
    t.carrier ,
    t.country ,
    t.createdAtMs ,
    t.deviceId ,
    t.deviceStyle ,
    t.ipAddress ,
    t.network ,
    t.osType ,
    t.province ,
    t.screenSize ,
    t.tenantId ,
    -- The last three expressions feed the dynamic partition columns
    -- ym, day, hm (matched by position, not by name).
    -- NOTE(review): assumes createdAtMs is epoch milliseconds and that this
    -- Hive version interprets a numeric-to-timestamp cast as milliseconds - confirm.
    date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
    date_format(cast(t.createdatms as timestamp) , 'dd') ,
    date_format(cast(t.createdatms as timestamp) , 'HHmm')
from
(
    select
        -- fork function is passed in dynamically
        ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from
        raw_logs
    where
        status = 200
        and ym = ${hiveconf:ym}
        and day = ${hiveconf:day}
        and hm = ${hiveconf:hm}
) t ;
2.2.2 編寫sh文件
shell腳本需要從_time文件中提取時間值,然後傳入sql文件名和叉分函數名。
[fork_logs.sh]
#!/bin/bash
# Runs one fork-and-dump SQL script against the time slice recorded in _time.
#
# Usage: fork_logs.sh <sql_file> <fork_function>
#   sql_file      : the fork SQL script to execute, e.g. fork_startuplogs.sql
#   fork_function : name of the fork function the script should call,
#                   e.g. forkstartuplogs (passed to Hive as hiveconf "func")
#
# The time slice is read from the file _time in the working directory,
# in the form "ym-day-hm".

cd /home/centos/umeng || exit 1

if [[ $# -ne 2 ]] ; then
    echo "usage: $0 <sql_file> <fork_function>" >&2
    exit 1
fi

# First argument is the SQL file, second is the fork function.
sql_file=$1
func=$2

# Without a recorded time slice every partition filter would be empty.
if [[ ! -s _time ]] ; then
    echo "error: _time is missing or empty in $(pwd)" >&2
    exit 1
fi

time=$(cat _time)

# Split "ym-day-hm" into its three fields.
ym=$(echo -n "$time" | awk -F '-' '{print $1}')
day=$(echo -n "$time" | awk -F '-' '{print $2}')
hm=$(echo -n "$time" | awk -F '-' '{print $3}')

hive -hiveconf ym="${ym}" -hiveconf day="${day}" -hiveconf hm="${hm}" -hiveconf func="${func}" -f "${sql_file}"
2.2.3 執行腳本
#指定叉分函數
$>./fork_logs.sh fork_startuplogs.sql forkstartuplogs
2.3 叉分eventlogs表
2.3.1 編寫sql
[fork_eventlogs.sql]
--eventlog, dynamic partitioning
-- Forks the event-log records out of one raw_logs time slice and appends
-- them to the eventlogs table.
-- Required variables (passed with -hiveconf):
--   ym, day, hm : the raw_logs partition to read
--   func        : name of the fork function to call (one of those registered below)
use umeng_big11 ;

-- All three partition columns of the target table are dynamic, which Hive
-- only permits with dynamic partitioning enabled in nonstrict mode.
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;

add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;

insert into eventlogs partition(ym,day,hm)
select
    t.appChannel ,
    t.appId ,
    t.appPlatform ,
    t.appVersion ,
    t.createdAtMs ,
    t.deviceId ,
    t.deviceStyle ,
    t.eventDurationSecs ,
    t.eventId ,
    t.osType ,
    t.tenantId ,
    -- The last three expressions feed the dynamic partition columns
    -- ym, day, hm (matched by position, not by name).
    -- NOTE(review): assumes createdAtMs is epoch milliseconds and that this
    -- Hive version interprets a numeric-to-timestamp cast as milliseconds - confirm.
    date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
    date_format(cast(t.createdatms as timestamp) , 'dd') ,
    date_format(cast(t.createdatms as timestamp) , 'HHmm')
from
(
    select
        -- fork function is passed in dynamically
        ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from
        raw_logs
    where
        status = 200
        and ym = ${hiveconf:ym}      -- year and month
        and day = ${hiveconf:day}    -- day of month
        and hm = ${hiveconf:hm}      -- hour and minute
) t ;
2.3.2 執行腳本
$>#指定叉分函數
$>./fork_logs.sh fork_eventlogs.sql forkeventlogs
2.4 叉分errorlogs表
2.4.1 編寫sql
[fork_errorlogs.sql]
--errorlog, dynamic partitioning
-- Forks the error-log records out of one raw_logs time slice and appends
-- them to the errorlogs table.
-- Required variables (passed with -hiveconf):
--   ym, day, hm : the raw_logs partition to read
--   func        : name of the fork function to call (one of those registered below)
use umeng_big11 ;

-- All three partition columns of the target table are dynamic, which Hive
-- only permits with dynamic partitioning enabled in nonstrict mode.
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;

add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;

insert into errorlogs partition(ym,day,hm)
select
    t.appChannel ,
    t.appId ,
    t.appPlatform ,
    t.appVersion ,
    t.createdAtMs ,
    t.deviceId ,
    t.deviceStyle ,
    t.errorBrief ,
    t.errorDetail ,
    t.osType ,
    t.tenantId ,
    -- The last three expressions feed the dynamic partition columns
    -- ym, day, hm (matched by position, not by name).
    -- NOTE(review): assumes createdAtMs is epoch milliseconds and that this
    -- Hive version interprets a numeric-to-timestamp cast as milliseconds - confirm.
    date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
    date_format(cast(t.createdatms as timestamp) , 'dd') ,
    date_format(cast(t.createdatms as timestamp) , 'HHmm')
from
(
    select
        -- fork function is passed in dynamically
        ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from
        raw_logs
    where
        status = 200
        and ym = ${hiveconf:ym}      -- year and month
        and day = ${hiveconf:day}    -- day of month
        and hm = ${hiveconf:hm}      -- hour and minute
) t ;
2.4.2 執行腳本
$>#指定叉分函數
$>./fork_logs.sh fork_errorlogs.sql forkerrorlogs
2.5 叉分usagelogs表
2.5.1 編寫sql
[fork_usagelogs.sql]
--usagelogs, dynamic partitioning
-- Forks the usage-log records out of one raw_logs time slice and appends
-- them to the usagelogs table.
-- Required variables (passed with -hiveconf):
--   ym, day, hm : the raw_logs partition to read
--   func        : name of the fork function to call (one of those registered below)
use umeng_big11 ;

-- All three partition columns of the target table are dynamic, which Hive
-- only permits with dynamic partitioning enabled in nonstrict mode.
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;

add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;

insert into usagelogs partition(ym,day,hm)
select
    t.appChannel ,
    t.appId ,
    t.appPlatform ,
    t.appVersion ,
    t.createdAtMs ,
    t.deviceId ,
    t.deviceStyle ,
    t.osType ,
    t.singleDownloadTraffic ,
    t.singleUploadTraffic ,
    t.singleUseDurationSecs ,
    -- The comma after tenantId is required: without it the statement does
    -- not parse and the partition expressions below are not separate columns.
    t.tenantId ,
    -- The last three expressions feed the dynamic partition columns
    -- ym, day, hm (matched by position, not by name).
    -- NOTE(review): assumes createdAtMs is epoch milliseconds and that this
    -- Hive version interprets a numeric-to-timestamp cast as milliseconds - confirm.
    date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
    date_format(cast(t.createdatms as timestamp) , 'dd') ,
    date_format(cast(t.createdatms as timestamp) , 'HHmm')
from
(
    select
        -- fork function is passed in dynamically
        ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from
        raw_logs
    where
        status = 200
        and ym = ${hiveconf:ym}
        and day = ${hiveconf:day}
        and hm = ${hiveconf:hm}
) t ;
2.5.2 執行腳本
$>#指定叉分函數
$>./fork_logs.sh fork_usagelogs.sql forkusagelogs
2.6 叉分pagelogs表
2.6.1 編寫sql
[fork_pagelogs.sql]
--pagelog, dynamic partitioning
-- Forks the page-log records out of one raw_logs time slice and appends
-- them to the pagelogs table.
-- Required variables (passed with -hiveconf):
--   ym, day, hm : the raw_logs partition to read
--   func        : name of the fork function to call (one of those registered below)
use umeng_big11 ;

-- All three partition columns of the target table are dynamic, which Hive
-- only permits with dynamic partitioning enabled in nonstrict mode.
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;

add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;

insert into pagelogs partition(ym,day,hm)
select
    t.appChannel ,
    t.appId ,
    t.appPlatform ,
    t.appVersion ,
    t.createdAtMs ,
    t.deviceId ,
    t.deviceStyle ,
    t.nextPage ,
    t.osType ,
    t.pageId ,
    t.pageViewCntInSession ,
    t.stayDurationSecs ,
    t.tenantId ,
    t.visitIndex ,
    -- The last three expressions feed the dynamic partition columns
    -- ym, day, hm (matched by position, not by name).
    -- NOTE(review): assumes createdAtMs is epoch milliseconds and that this
    -- Hive version interprets a numeric-to-timestamp cast as milliseconds - confirm.
    date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
    date_format(cast(t.createdatms as timestamp) , 'dd') ,
    date_format(cast(t.createdatms as timestamp) , 'HHmm')
from
(
    select
        -- fork function is passed in dynamically
        ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from
        raw_logs
    where
        status = 200
        and ym = ${hiveconf:ym}
        and day = ${hiveconf:day}
        and hm = ${hiveconf:hm}
) t ;
2.6.2 執行腳本
$>#指定叉分函數
$>./fork_logs.sh fork_pagelogs.sql forkpagelogs
3、總結
編寫5個sql文件,一個shell腳本文件,通過傳遞參數給執行腳本,動態執行每張日誌表的叉分與轉儲工作。
腳本化加載文件與轉儲