1. 程式人生 > 實用技巧 >大資料實戰(七十三):電商數倉(五十七)數倉之使用者行為數倉搭建(二)ODS層,DWD層,DWS層,ADS層

大資料實戰(七十三):電商數倉(五十七)數倉之使用者行為數倉搭建(二)ODS層,DWD層,DWS層,ADS層

1 ODS

原始資料層,存放原始資料,直接載入原始日誌、資料,資料保持原貌不做處理。

1.1 建立資料庫

1)建立資料倉庫目錄,並修改所有者

# Create the warehouse root directory on HDFS and hand ownership to the
# hive user (run as hdfs, the HDFS superuser). Original had "sudo-u"
# (missing space), which is not a valid command.
sudo -u hdfs hadoop fs -mkdir /warehouse

sudo -u hdfs hadoop fs -chown hive:hive /warehouse

2)修改hive配置

2)啟動Hive客戶端(注意:要以hive使用者啟動)

sudo-uhive hive

3)建立gmall資料庫

hive (default)> create database gmall;

說明:如果資料庫已存在且含有資料,需要強制刪除時執行:

drop database gmall cascade;

4)使用gmall資料庫

hive (default)> use gmall;

1.2 建立啟動日誌ods_start_log

以下為建表語句

hive (gmall)>

-- ODS start-log table: stores each raw start-up log as a single string
-- column `line`, partitioned by date (`dt`).
-- External table so dropping it does not delete the HDFS data; input is
-- LZO-compressed text (DeprecatedLzoTextInputFormat), output plain text.
drop table if exists ods_start_log;

CREATE EXTERNAL TABLE ods_start_log (`line` string)

PARTITIONED BY (`dt` string)

STORED AS

INPUTFORMAT 'com.hadoop.mapred.DeprecatedLzoTextInputFormat'

OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'

LOCATION '/warehouse/gmall/ods/ods_start_log';

說明:Hive LZO壓縮參考:https://cwiki.apache.org/confluence/display/Hive/LanguageManual+LZO

1.3 ODS載入資料指令碼

1)在hadoop102/root/bin目錄下建立指令碼

[root@hadoop102 bin]$ vim ods_log.sh

指令碼中編寫如下內容

#!/bin/bash

# ods_log.sh: load one day of raw start logs from HDFS into the ODS table.
# Usage: ods_log.sh [yyyy-mm-dd]  (defaults to yesterday)
# Original paste collapsed this whole script onto one line and split
# "APP=gmall" across two lines; reconstructed here.

# Target Hive database; kept in a variable so it is easy to change.
APP=gmall

# Use the date passed as $1 if given; otherwise default to yesterday.
if [ -n "$1" ] ;then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi

echo "===日誌日期為 $do_date==="

# The quoting deliberately breaks out of the sql string around $APP
# ("$APP") so the database name is substituted by the shell.
sql="
load data inpath '/origin_data/gmall/log/topic_start/$do_date' into table "$APP".ods_start_log partition(dt='$do_date');
"

beeline -u "jdbc:hive2://hadoop102:10000/" -n hive -e "$sql"

說明1

[ -n 變數]判斷變數的值,是否為空

--變數的值,非空,返回true

--變數的值,為空,返回false

說明2

檢視date命令的使用,[root@hadoop102 ~]$ date --help

2)增加指令碼執行許可權

[root@hadoop102 bin]$ chmod 777 ods_log.sh

3)指令碼使用

[root@hadoop102 module]$ ods_log.sh 2019-09-03

2 DWD層啟動資料解析

2.1 建立啟動表

1)建表語句

hive (gmall)> 
-- DWD start-log table: one typed column per JSON field of the raw start
-- log; external, partitioned by dt. Populated by dwd_start_log.sh (2.2).
-- NOTE: the original article repeated this identical DDL three times by
-- copy-paste accident; a single copy is kept here.
drop table if exists dwd_start_log;
CREATE EXTERNAL TABLE dwd_start_log(
`mid_id` string,
`user_id` string, 
`version_code` string, 
`version_name` string, 
`lang` string, 
`source` string, 
`os` string, 
`area` string, 
`model` string,
`brand` string, 
`sdk_version` string, 
`gmail` string, 
`height_width` string,  
`app_time` string,
`network` string, 
`lng` string, 
`lat` string, 
`entry` string, 
`open_ad_type` string, 
`action` string, 
`loading_time` string, 
`detail` string, 
`extend1` string
)
PARTITIONED BY (dt string)
location '/warehouse/gmall/dwd/dwd_start_log/';

2.2 DWD啟動載入資料指令碼

1)在hadoop102/root/bin目錄下建立指令碼

[root@hadoop102 bin]$ vim dwd_start_log.sh

指令碼中編寫如下內容

#!/bin/bash

# dwd_start_log.sh: parse each JSON field out of the raw ODS start log
# line into the typed columns of the DWD table, for one day's partition.
# Usage: dwd_start_log.sh [yyyy-mm-dd]  (defaults to yesterday)

# Target Hive database; kept in a variable so it is easy to change.
APP=gmall

# Use the date passed as $1 if given; otherwise default to yesterday.
if [ -n "$1" ] ;then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi

# The quoting deliberately breaks out of the sql string around $APP
# ("$APP") so the database name is substituted by the shell. The JSON
# paths like '$.mid' are left literal: bash does not expand "$." .
sql="
set hive.exec.dynamic.partition.mode=nonstrict;

insert overwrite table "$APP".dwd_start_log
PARTITION (dt='$do_date')
select 
    get_json_object(line,'$.mid') mid_id,
    get_json_object(line,'$.uid') user_id,
    get_json_object(line,'$.vc') version_code,
    get_json_object(line,'$.vn') version_name,
    get_json_object(line,'$.l') lang,
    get_json_object(line,'$.sr') source,
    get_json_object(line,'$.os') os,
    get_json_object(line,'$.ar') area,
    get_json_object(line,'$.md') model,
    get_json_object(line,'$.ba') brand,
    get_json_object(line,'$.sv') sdk_version,
    get_json_object(line,'$.g') gmail,
    get_json_object(line,'$.hw') height_width,
    get_json_object(line,'$.t') app_time,
    get_json_object(line,'$.nw') network,
    get_json_object(line,'$.ln') lng,
    get_json_object(line,'$.la') lat,
    get_json_object(line,'$.entry') entry,
    get_json_object(line,'$.open_ad_type') open_ad_type,
    get_json_object(line,'$.action') action,
    get_json_object(line,'$.loading_time') loading_time,
    get_json_object(line,'$.detail') detail,
    get_json_object(line,'$.extend1') extend1
from "$APP".ods_start_log 
where dt='$do_date';
"

beeline -u "jdbc:hive2://hadoop102:10000/" -n hive -e "$sql"

2)增加指令碼執行許可權

[root@hadoop102 bin]$ chmod 777 dwd_start_log.sh

3)指令碼使用

[root@hadoop102 module]$ dwd_start_log.sh 2019-09-03

3 DWS層(需求:使用者日活躍)

目標:統計當日、當週、當月活動的每個裝置明細

3.1 每日活躍裝置明細

1)建表語句

hive (gmall)>
-- DWS daily active device detail: one row per device (mid_id) per day,
-- aggregated from dwd_start_log by dws_log.sh (3.2).
-- External table, Parquet format, partitioned by dt.
drop table if exists dws_uv_detail_day;
create external table dws_uv_detail_day
(
    `mid_id` string COMMENT '裝置唯一標識',
    `user_id` string COMMENT '使用者標識', 
    `version_code` string COMMENT '程式版本號', 
    `version_name` string COMMENT '程式版本名', 
    `lang` string COMMENT '系統語言', 
    `source` string COMMENT '渠道號', 
    `os` string COMMENT '安卓系統版本', 
    `area` string COMMENT '區域', 
    `model` string COMMENT '手機型號', 
    `brand` string COMMENT '手機品牌', 
    `sdk_version` string COMMENT 'sdkVersion', 
    `gmail` string COMMENT 'gmail', 
    `height_width` string COMMENT '螢幕寬高',
    `app_time` string COMMENT '客戶端日誌產生時的時間',
    `network` string COMMENT '網路模式',
    `lng` string COMMENT '經度',
    `lat` string COMMENT '緯度'
)
partitioned by(dt string)
stored as parquet
location '/warehouse/gmall/dws/dws_uv_detail_day'
;

3.2 DWS載入資料指令碼

1)在hadoop102/root/bin目錄下建立指令碼

[root@hadoop102 bin]$ vim dws_log.sh

指令碼中編寫如下內容

#!/bin/bash

# dws_log.sh: build the daily active device detail. Groups the day's DWD
# start logs by device id (mid_id), collapsing each other column's
# distinct values into a single '|'-joined string.
# Usage: dws_log.sh [yyyy-mm-dd]  (defaults to yesterday)
# NOTE: the original article repeated this identical script twice by
# copy-paste accident; a single copy is kept here.

# Target Hive database; kept in a variable so it is easy to change.
APP=gmall

# Use the date passed as $1 if given; otherwise default to yesterday.
if [ -n "$1" ] ;then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi

# The quoting deliberately breaks out of the sql string around $APP
# ("$APP") so the database name is substituted by the shell.
sql="
  set hive.exec.dynamic.partition.mode=nonstrict;

  insert overwrite table "$APP".dws_uv_detail_day partition(dt='$do_date')
  select  
    mid_id,
    concat_ws('|', collect_set(user_id)) user_id,
    concat_ws('|', collect_set(version_code)) version_code,
    concat_ws('|', collect_set(version_name)) version_name,
    concat_ws('|', collect_set(lang)) lang,
    concat_ws('|', collect_set(source)) source,
    concat_ws('|', collect_set(os)) os,
    concat_ws('|', collect_set(area)) area, 
    concat_ws('|', collect_set(model)) model,
    concat_ws('|', collect_set(brand)) brand,
    concat_ws('|', collect_set(sdk_version)) sdk_version,
    concat_ws('|', collect_set(gmail)) gmail,
    concat_ws('|', collect_set(height_width)) height_width,
    concat_ws('|', collect_set(app_time)) app_time,
    concat_ws('|', collect_set(network)) network,
    concat_ws('|', collect_set(lng)) lng,
    concat_ws('|', collect_set(lat)) lat
  from "$APP".dwd_start_log
  where dt='$do_date'  
  group by mid_id;
"

beeline -u "jdbc:hive2://hadoop102:10000/" -n hive -e "$sql"

2)增加指令碼執行許可權

[root@hadoop102 bin]$ chmod 777 dws_log.sh

3)指令碼使用

[root@hadoop102 module]$ dws_log.sh 2019-09-03

4 ADS層(需求:使用者日活躍)

目標:當日活躍裝置數

4.1活躍裝置數

1)建表語句

hive (gmall)>
-- ADS daily active device count: one row per statistic date, populated
-- by ads_uv_log.sh (4.2). Tab-delimited text for easy export.
drop table if exists ads_uv_count;
create external table ads_uv_count( 
 `dt` string COMMENT '統計日期',
 `day_count` bigint COMMENT '當日使用者數量'
) COMMENT '活躍裝置數' 
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_uv_count/'
;

4.2ADS層載入資料指令碼

1)在hadoop102/root/bin目錄下建立指令碼

[root@hadoop102 bin]$ vim ads_uv_log.sh

指令碼中編寫如下內容

#!/bin/bash

# ads_uv_log.sh: count the day's active devices from the DWS detail table
# and append one (dt, count) row to the ADS table.
# Usage: ads_uv_log.sh [yyyy-mm-dd]  (defaults to yesterday)

# Target Hive database; kept in a variable so it is easy to change.
APP=gmall

# Use the date passed as $1 if given; otherwise default to yesterday.
if [ -n "$1" ] ;then
    do_date=$1
else
    do_date=$(date -d "-1 day" +%F)
fi

# The quoting deliberately breaks out of the sql string around $APP
# ("$APP") so the database name is substituted by the shell.
sql="
  set hive.exec.dynamic.partition.mode=nonstrict;

insert into table "$APP".ads_uv_count 
select  
  '$do_date' dt,
   daycount.ct
from 
(
   select  
      '$do_date' dt,
       count(*) ct
   from "$APP".dws_uv_detail_day
   where dt='$do_date'  
)daycount;
"

beeline -u "jdbc:hive2://hadoop102:10000/" -n hive -e "$sql"

2)增加指令碼執行許可權

[root@hadoop102 bin]$ chmod 777 ads_uv_log.sh

3)指令碼使用

[root@hadoop102 module]$ ads_uv_log.sh 2019-09-03