1. 程式人生 > 腳本化加載文件與轉儲

腳本化加載文件與轉儲

mem rtu ffi hdf conf 工作 awk dbo sql文件

腳本化加載文件與轉儲

1、加載數據到原生表

1.1 介紹

由於每天都會產生大量的日誌數據,需要對每天的日誌進行加載與清洗以及轉儲,編寫腳本文件後,通過azkaban進行調度即可。

1.2 編寫load_data_to_hive_raw_logs.sql

加載數據到hive原生表,註意使用的動態分區,每天只加載前一天的數據。其中${hiveconf:xxx}部分是佔位符,執行時由shell腳本通過-hiveconf參數替換成具體的時間值。

--load_data_to_hive_raw_logs.sql
--Load one time-slice of raw log files from HDFS into the partitioned
--raw_logs table. ym/day/hm are supplied via `hive -hiveconf ...` by the
--wrapper shell script; the partition spec pins the slice being loaded.
--Fixes vs. original: curly quotes replaced with ASCII single quotes, and
--the stray space in the HDFS path ("raw _logs" -> "raw_logs") removed.
use umeng_big11 ;
load data inpath 'hdfs://mycluster/user/centos/umeng/raw_logs/${hiveconf:ym}/${hiveconf:day}/${hiveconf:hm}' into table raw_logs partition(ym=${hiveconf:ym},day=${hiveconf:day},hm=${hiveconf:hm}) ;

1.3 編寫load_data_to_hive_raw_logs.sh

該腳本負責調用上面的sql腳本,調用前需要將參數進行填充。

[load_data_to_hive_raw_logs.sh]
#!/bin/bash
# Load the current time-slice of raw logs into the hive raw_logs table.
# Usage:
#   ./load_data_to_hive_raw_logs.sh              # slice = now - 3 minutes
#   ./load_data_to_hive_raw_logs.sh YM DAY HM    # explicit yyyyMM dd HHmm
# Fixes vs. original: curly quotes around the awk delimiter/program were
# replaced with ASCII single quotes (the script could not run as pasted).
cd /home/centos/umeng
if [[ $# -eq 0 ]] ; then
    # format is yyyyMM-dd-HHmm, e.g. 201802-04-1230
    time=`date -d "-3 minutes" "+%Y%m-%d-%H%M"`
else
    time=$1-$2-$3
fi

# Persist the slice so the downstream fork scripts (fork_logs.sh) process
# exactly the same time window that was just loaded.
echo -n "$time" > _time

# Split "yyyyMM-dd-HHmm" into its three components.
ym=`echo $time | awk -F '-' '{print $1}'`
day=`echo $time | awk -F '-' '{print $2}'`
hm=`echo $time | awk -F '-' '{print $3}'`

hive -hiveconf ym=${ym} -hiveconf day=${day} -hiveconf hm=${hm} -f load_data_to_hive_raw_logs.sql

1.4 修改腳本權限並執行

#增加執行權限
$>chmod +x load_data_to_hive_raw_logs.sh

#調用腳本,指定具體時間
$>./load_data_to_hive_raw_logs.sh 2018 2 4

#使用當前時間
$>./load_data_to_hive_raw_logs.sh

2、叉分並轉儲

2.1 說明

加載原生表的日誌需要進行叉分,時間對齊以及地域信息處理分別轉儲到5張日誌子表中。日誌子表也都是分區表,因此查出來的數據需要動態指定分區表。

2.2 叉分startuplogs表

2.2.1 編寫sql文件

[fork_startuplogs.sql]

--fork_startuplogs.sql: fork startup records out of raw_logs into the
--startuplogs table, with ym/day/hm partitions derived dynamically from
--each record's createdAtMs timestamp.
use umeng_big11 ;
set hive.cli.print.header=true ;
--all three partition columns are dynamic, so nonstrict mode is required
--(strict mode, the default, rejects inserts with no static partition key)
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;
add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;
insert into startuplogs partition(ym,day,hm) 
select
  t.appChannel  ,
  t.appId       ,
  t.appPlatform ,
  t.appVersion  ,
  t.brand       ,
  t.carrier     ,
  t.country     ,
  t.createdAtMs ,
  t.deviceId    ,
  t.deviceStyle ,
  t.ipAddress   ,
  t.network     ,
  t.osType      ,
  t.province    ,
  t.screenSize  ,
  t.tenantId    , 
  --dynamic partition values, computed from the event timestamp
  date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
  date_format(cast(t.createdatms as timestamp) , 'dd') ,
  date_format(cast(t.createdatms as timestamp) , 'HHmm') 
from
  (
    select 
      --fork UDTF name is injected via `-hiveconf func=...`
      ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from 
      raw_logs 
    where 
      status = 200
      and ym=${hiveconf:ym}
      and day = ${hiveconf:day}
      and hm = ${hiveconf:hm}
  )t ;
2.2.2 編寫sh文件

shell腳本需要從_time文件中提取時間值,然後傳入sql文件名和叉分函數名。

[fork_logs.sh]

#!/bin/bash
# Generic driver for the fork_*.sql scripts.
# Usage: ./fork_logs.sh <sql-file> <udtf-name>
#   $1 = hive sql file to run (e.g. fork_startuplogs.sql)
#   $2 = fork UDTF to substitute for ${hiveconf:func} inside that file
# Reads the time slice from the _time file written by
# load_data_to_hive_raw_logs.sh so both stages process the same window.
# Fixes vs. original: curly quotes in the awk arguments replaced with
# ASCII single quotes (script could not run as pasted).
cd /home/centos/umeng
func=${1}
time=`cat _time`
# Split "yyyyMM-dd-HHmm" into partition components.
ym=`echo -n $time | awk -F '-' '{print $1}'`
day=`echo -n $time | awk -F '-' '{print $2}'`
hm=`echo -n $time | awk -F '-' '{print $3}'`

hive -hiveconf ym=${ym} -hiveconf day=${day} -hiveconf hm=${hm} -hiveconf func=${2} -f ${1}
2.2.3 執行腳本
#指定叉分函數
$>./fork_logs.sh fork_startuplogs.sql forkstartuplogs

2.3 叉分eventlogs表

2.3.1 編寫sql

[fork_eventlogs.sql]

--fork_eventlogs.sql: fork event records out of raw_logs into the
--eventlogs table, with ym/day/hm partitions derived dynamically from
--each record's createdAtMs timestamp.
use umeng_big11 ;
--all three partition columns are dynamic, so nonstrict mode is required
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;
add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;
insert into eventlogs partition(ym,day,hm) 
select
  t.appChannel      ,
  t.appId           ,
  t.appPlatform     ,
  t.appVersion      ,   
  t.createdAtMs     ,
  t.deviceId        ,   
  t.deviceStyle     ,
  t.eventDurationSecs, 
  t.eventId         ,
  t.osType          ,
  t.tenantId        ,   
  --dynamic partition values, computed from the event timestamp
  date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
  date_format(cast(t.createdatms as timestamp) , 'dd') ,
  date_format(cast(t.createdatms as timestamp) , 'HHmm') 
from
  (
    select 
      --fork UDTF name is injected via `-hiveconf func=...`
      ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from 
      raw_logs 
    where 
      status = 200
      and ym=${hiveconf:ym}         --year-month
      and day = ${hiveconf:day} --day
      and hm = ${hiveconf:hm}   --hour-minute
  )t ;
2.3.2 執行腳本
$>#指定叉分函數
$>./fork_logs.sh fork_eventlogs.sql forkeventlogs

2.4 叉分errorlogs表

2.4.1 編寫sql

[fork_errorlogs.sql]

--fork_errorlogs.sql: fork error records out of raw_logs into the
--errorlogs table (original header comment wrongly said "eventlog"),
--with ym/day/hm partitions derived dynamically from createdAtMs.
use umeng_big11 ;
--all three partition columns are dynamic, so nonstrict mode is required
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;
add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;
insert into errorlogs partition(ym,day,hm) 
select
  t.appChannel  ,
  t.appId       ,
  t.appPlatform ,
  t.appVersion  ,
  t.createdAtMs ,
  t.deviceId    ,
  t.deviceStyle ,
  t.errorBrief  ,
  t.errorDetail ,
  t.osType      ,
  t.tenantId    ,
  --dynamic partition values, computed from the event timestamp
  date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
  date_format(cast(t.createdatms as timestamp) , 'dd') ,
  date_format(cast(t.createdatms as timestamp) , 'HHmm') 
from
  (
    select 
      --fork UDTF name is injected via `-hiveconf func=...`
      ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from 
      raw_logs 
    where 
      status = 200
      and ym=${hiveconf:ym}         --year-month
      and day = ${hiveconf:day} --day
      and hm = ${hiveconf:hm}   --hour-minute
  )t ;
2.4.2 執行腳本
$>#指定叉分函數
$>./fork_logs.sh fork_errorlogs.sql forkerrorlogs

2.5 叉分usagelogs表

2.5.1 編寫sql

[fork_usagelogs.sql]

--fork_usagelogs.sql: fork usage records out of raw_logs into the
--usagelogs table, with ym/day/hm partitions derived dynamically from
--each record's createdAtMs timestamp.
use umeng_big11 ;
--all three partition columns are dynamic, so nonstrict mode is required
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;
add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;
insert into usagelogs partition(ym,day,hm) 
select
  t.appChannel          ,
  t.appId               ,
  t.appPlatform         ,
  t.appVersion          ,
  t.createdAtMs         ,
  t.deviceId            ,
  t.deviceStyle         ,
  t.osType              ,
  t.singleDownloadTraffic,
  t.singleUploadTraffic ,
  t.singleUseDurationSecs,
  --BUG FIX: original was missing the comma after tenantId, which made
  --the first date_format expression an alias and misaligned all columns
  t.tenantId            ,
  --dynamic partition values, computed from the event timestamp
  date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
  date_format(cast(t.createdatms as timestamp) , 'dd') ,
  date_format(cast(t.createdatms as timestamp) , 'HHmm') 
from
  (
    select 
      --fork UDTF name is injected via `-hiveconf func=...`
      ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from 
      raw_logs 
    where
      status = 200
      and ym=${hiveconf:ym}
      and day = ${hiveconf:day}
      and hm = ${hiveconf:hm}
  )t ;
2.5.2 執行腳本
$>#指定叉分函數
$>./fork_logs.sh fork_usagelogs.sql forkusagelogs

2.6 叉分pagelogs表

2.6.1 編寫sql

[fork_pagelogs.sql]

--fork_pagelogs.sql: fork page-view records out of raw_logs into the
--pagelogs table, with ym/day/hm partitions derived dynamically from
--each record's createdAtMs timestamp.
use umeng_big11 ;
--all three partition columns are dynamic, so nonstrict mode is required
set hive.exec.dynamic.partition=true ;
set hive.exec.dynamic.partition.mode=nonstrict ;
add jar /soft/hive/lib/umeng_hive.jar ;
create temporary function forkstartuplogs as 'com.oldboy.umeng.hive.udf.ForkStartupLogUDTF' ;
create temporary function forkeventlogs as 'com.oldboy.umeng.hive.udf.ForkEventLogUDTF' ;
create temporary function forkerrorlogs as 'com.oldboy.umeng.hive.udf.ForkErrorLogUDTF' ;
create temporary function forkusagelogs as 'com.oldboy.umeng.hive.udf.ForkUsageLogUDTF' ;
create temporary function forkpagelogs as 'com.oldboy.umeng.hive.udf.ForkPageLogUDTF' ;
insert into pagelogs partition(ym,day,hm) 
select
  t.appChannel          ,
  t.appId               ,
  t.appPlatform         ,
  t.appVersion          ,
  t.createdAtMs         ,
  t.deviceId            ,
  t.deviceStyle         ,
  t.nextPage            ,
  t.osType              ,
  t.pageId              ,
  t.pageViewCntInSession,
  t.stayDurationSecs    ,
  t.tenantId            ,
  t.visitIndex          ,
  --dynamic partition values, computed from the event timestamp
  date_format(cast(t.createdatms as timestamp) , 'yyyyMM') ,
  date_format(cast(t.createdatms as timestamp) , 'dd') ,
  date_format(cast(t.createdatms as timestamp) , 'HHmm') 
from
  (
    select 
      --fork UDTF name is injected via `-hiveconf func=...`
      ${hiveconf:func}(servertimestr , clienttimems , clientip , log)
    from 
      raw_logs 
    where 
      status = 200
      and ym=${hiveconf:ym}
      and day = ${hiveconf:day}
      and hm = ${hiveconf:hm}
  )t ;
2.6.2 執行腳本
$>#指定叉分函數
$>./fork_logs.sh fork_pagelogs.sql forkpagelogs

3、總結

編寫5個sql文件,一個shell腳本文件,通過傳遞參數給執行腳本,動態執行每張日誌表的叉分與轉儲工作。

腳本化加載文件與轉儲