数据仓库DWS层-离线数仓-大数据
目标:统计当日、当周、当月活动的每个设备明细
每日活跃设备明细
create external table dws_uv_detail_day
(`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
)
partitioned by(dt string)
stored as parquet
location ‘/warehouse/gmall/dws/dws_uv_detail_day’;step2. 从启动日志表导入数据
insert overwrite table dws_uv_detail_day
partition(dt=’2020-08-19’)
selectmid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang))lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat
from dwd_start_log
where dt=’2020-08-19’
group by mid_id;step3. 验证导入数据是否正确
SELECT * FROM dws_uv_detail_day WHERE dt=’2020-08-19’ LIMIT 2;
mid_id|user_id|version_code|version_name|lang |source|os |area|model |brand |sdk_version |gmail |height_width|app_time |network|lng |lat |dt |
———|———-|——————|——————|——-|———|—————-|——|———————|—————|——————-|——————————————————-|——————|—————————————-|———-|—————-|—————-|—————|
0 |0 |14|13 |1.3.6|1.0.0 |es|en|D |8.1.3|8.1.7|MX |Huawei-7|HTC-3|Huawei|HTC|V2.7.0|V2.8.0|4H34OQ8C@gmail.com|617U6H5X@gmail.com|640960 |1597830241221|1597785492032|WIFI|3G|-68.1|-53.0|-43.1|-47.2|2020-08-19|
1 |1 |15 |1.3.0 |en |A |8.1.5 |MX |sumsung-0 |Sumsung |V2.6.6 |QQ4V37RD@gmail.com |7501134 |1597827058186 |WIFI |-41.7 |-13.6 |2020-08-19|每周活跃设备明细
create external table dws_uv_detail_wk(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`monday_date` string COMMENT '周一日期',
`sunday_date` string COMMENT '周日日期'
) COMMENT ‘活跃用户按周明细’
PARTITIONED BY (wk_dt
string)
stored as parquet
location ‘/warehouse/gmall/dws/dws_uv_detail_wk/‘;step2. 从日活跃表导入数据
!!!这里一定要设置partition为nonstrict,否则会提示partition不是static
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_wk partition(wk_dt)
selectmid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang)) lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat,
date_add(next_day('2020-08-19','MO'),-7),
date_add(next_day('2020-08-19','MO'),-1),
concat(date_add( next_day('2020-08-19','MO'),-7), '_' , date_add(next_day('2020-08-19','MO'),-1)
)
from dws_uv_detail_day
where dt>=date_add(next_day(‘2020-08-19’,’MO’),-7) and dt<=date_add(next_day(‘2020-08-19’,’MO’),-1)
group by mid_id;step3. 验证导入数据是否正确
SELECT * FROM dws_uv_detail_wk LIMIT 2;
mid_id|user_id|version_code|version_name|lang |source|os |area|model |brand |sdk_version |gmail |height_width|app_time |network|lng |lat |monday_date|sunday_date|wk_dt |
———|———-|——————|——————|——-|———|—————-|——|———————|—————|——————-|——————————————————-|——————|—————————————-|———-|—————-|—————-|—————-|—————-|——————————-|
0 |0 |14|13 |1.3.6|1.0.0 |es|en|D |8.1.3|8.1.7|MX |Huawei-7|HTC-3|Huawei|HTC|V2.7.0|V2.8.0|4H34OQ8C@gmail.com|617U6H5X@gmail.com|640960 |1597830241221|1597785492032|WIFI|3G|-68.1|-53.0|-43.1|-47.2|2020-08-17 |2020-08-23 |2020-08-17_2020-08-23|
1 |1 |15 |1.3.0 |en |A |8.1.5 |MX |sumsung-0 |Sumsung |V2.6.6 |QQ4V37RD@gmail.com |7501134 |1597827058186 |WIFI |-41.7 |-13.6 |2020-08-17 |2020-08-23 |2020-08-17_2020-08-23|select count(*) from dws_uv_detail_wk;
_c0|
—-|
708|每月活跃设备明细
create external table dws_uv_detail_mn(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度'
) COMMENT ‘活跃用户按月明细’
PARTITIONED BY (mn
string)
stored as parquet
location ‘/warehouse/gmall/dws/dws_uv_detail_mn/‘;step2. 从日活表导入数据
!!!一定要设置partition的mode
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table dws_uv_detail_mn partition(mn)
selectmid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang)) lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat,
date_format('2019-02-10','yyyy-MM')
from dws_uv_detail_day
where date_format(dt,’yyyy-MM’) = date_format(‘2019-02-10’,’yyyy-MM’)
group by mid_id;step3. 从日活跃表导入数据
SELECT * FROM dws_uv_detail_mn LIMIT 2;
mid_id|user_id|version_code|version_name|lang |source|os |area|model |brand |sdk_version |gmail |height_width|app_time |network|lng |lat |mn |
———|———-|——————|——————|——-|———|—————-|——|———————|—————|——————-|——————————————————-|——————|—————————————-|———-|—————-|—————-|———-|
0 |0 |14|13 |1.3.6|1.0.0 |es|en|D |8.1.3|8.1.7|MX |Huawei-7|HTC-3|Huawei|HTC|V2.7.0|V2.8.0|4H34OQ8C@gmail.com|617U6H5X@gmail.com|640960 |1597830241221|1597785492032|WIFI|3G|-68.1|-53.0|-43.1|-47.2|2020-08|
1 |1 |15 |1.3.0 |en |A |8.1.5 |MX |sumsung-0 |Sumsung |V2.6.6 |QQ4V37RD@gmail.com |7501134 |1597827058186 |WIFI |-41.7 |-13.6 |2020-08|select count(*) from dws_uv_detail_mn ;
_c0|
—-|
708|
活跃用户详情导入成功,接下来编写脚本
#!/bin/bash
# 定义变量方便修改
APP=gmall
hive=/opt/module/hive/bin/hive
# 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
if [ -n "$1" ] ;then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
set hive.exec.dynamic.partition.mode=nonstrict;
insert overwrite table "$APP".dws_uv_detail_day partition(dt='$do_date')
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang)) lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat
from "$APP".dwd_start_log
where dt='$do_date'
group by mid_id;
insert overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang)) lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat,
date_add(next_day('$do_date','MO'),-7),
date_add(next_day('$do_date','MO'),-1),
concat(date_add( next_day('$do_date','MO'),-7), '_' , date_add(next_day('$do_date','MO'),-1)
)
from "$APP".dws_uv_detail_day
where dt>=date_add(next_day('$do_date','MO'),-7) and dt<=date_add(next_day('$do_date','MO'),-1)
group by mid_id;
insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
select
mid_id,
concat_ws('|', collect_set(user_id)) user_id,
concat_ws('|', collect_set(version_code)) version_code,
concat_ws('|', collect_set(version_name)) version_name,
concat_ws('|', collect_set(lang))lang,
concat_ws('|', collect_set(source)) source,
concat_ws('|', collect_set(os)) os,
concat_ws('|', collect_set(area)) area,
concat_ws('|', collect_set(model)) model,
concat_ws('|', collect_set(brand)) brand,
concat_ws('|', collect_set(sdk_version)) sdk_version,
concat_ws('|', collect_set(gmail)) gmail,
concat_ws('|', collect_set(height_width)) height_width,
concat_ws('|', collect_set(app_time)) app_time,
concat_ws('|', collect_set(network)) network,
concat_ws('|', collect_set(lng)) lng,
concat_ws('|', collect_set(lat)) lat,
date_format('$do_date','yyyy-MM')
from "$APP".dws_uv_detail_day
where date_format(dt,'yyyy-MM') = date_format('$do_date','yyyy-MM')
group by mid_id;
"
$hive -e "$sql"
每日新增设备
create external table dws_new_mid_day
(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`create_date` string comment '创建时间'
) COMMENT '每日新增设备信息'
stored as parquet
location '/warehouse/gmall/dws/dws_new_mid_day/';
#step2. 从每日用户活跃表导入数据
insert into table dws_new_mid_day
select
ud.mid_id,
ud.user_id ,
ud.version_code ,
ud.version_name ,
ud.lang ,
ud.source,
ud.os,
ud.area,
ud.model,
ud.brand,
ud.sdk_version,
ud.gmail,
ud.height_width,
ud.app_time,
ud.network,
ud.lng,
ud.lat,
'2020-08-19'
from dws_uv_detail_day ud left join dws_new_mid_day nm on ud.mid_id=nm.mid_id
where ud.dt='2020-08-19' and nm.mid_id is null;
#step3. 验证导入数据是否正确
SELECT * FROM dws_new_mid_day LIMIT 2;
mid_id|user_id|version_code|version_name|lang |source|os |area|model |brand |sdk_version |gmail |height_width|app_time |network|lng |lat |create_date|
------|-------|------------|------------|-----|------|-----------|----|--------------|----------|-------------|-------------------------------------|------------|---------------------------|-------|-----------|-----------|-----------|
0 |0 |14|13 |1.3.6|1.0.0 |es|en|D |8.1.3|8.1.7|MX |Huawei-7|HTC-3|Huawei|HTC|V2.7.0|V2.8.0|4H34OQ8C@gmail.com|617U6H5X@gmail.com|640*960 |1597830241221|1597785492032|WIFI|3G|-68.1|-53.0|-43.1|-47.2|2020-08-19 |
1 |1 |15 |1.3.0 |en |A |8.1.5 |MX |sumsung-0 |Sumsung |V2.6.6 |QQ4V37RD@gmail.com |750*1134 |1597827058186 |WIFI |-41.7 |-13.6 |2020-08-19 |
日新增详情表导入脚本
#!/bin/bash
APP=gmall
hive=$HIVE_HOME/bin/hive
if [ -n "$1" ];then
do_date=$1
else
do_date=`date -d "-1 day" +%F`
fi
sql="
INSERT INTO TABLE "$APP".dws_new_mid_day
SELECT
ud.mid_id,
ud.user_id,
ud.version_code,
ud.version_name,
ud.lang,
ud.source,
ud.os,
ud.area,
ud.model,
ud.brand,
ud.sdk_version,
ud.gmail,
ud.height_width,
ud.app_time,
ud.network,
ud.lng,
ud.lat,
'$do_date'
FROM "$APP".dws_uv_detail_day ud
LEFT JOIN "$APP".dws_new_mid_day nm on ud.mid_id=nm.mid_id
WHERE ud.dt='$do_date' and nm.mid_id is null;
"
$hive -e "$sql"
最后是每日留存明细表
create external table dws_user_retention_day
(
`mid_id` string COMMENT '设备唯一标识',
`user_id` string COMMENT '用户标识',
`version_code` string COMMENT '程序版本号',
`version_name` string COMMENT '程序版本名',
`lang` string COMMENT '系统语言',
`source` string COMMENT '渠道号',
`os` string COMMENT '安卓系统版本',
`area` string COMMENT '区域',
`model` string COMMENT '手机型号',
`brand` string COMMENT '手机品牌',
`sdk_version` string COMMENT 'sdkVersion',
`gmail` string COMMENT 'gmail',
`height_width` string COMMENT '屏幕宽高',
`app_time` string COMMENT '客户端日志产生时的时间',
`network` string COMMENT '网络模式',
`lng` string COMMENT '经度',
`lat` string COMMENT '纬度',
`create_date` string comment '设备新增时间',
`retention_day` int comment '截止当前日期留存天数'
) COMMENT '每日用户留存情况'
PARTITIONED BY (`dt` string)
stored as parquet
location '/warehouse/gmall/dws/dws_user_retention_day/';
#step2. 从用户每日详情表导入数据,如果用户留存表里没有新的用户详情,则发现一个新增用户
insert overwrite table dws_user_retention_day
partition(dt="2020-08-19")
select
nm.mid_id,
nm.user_id,
nm.version_code,
nm.version_name,
nm.lang,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
1 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2020-08-19' and nm.create_date=date_add('2020-08-19',-1)
union all
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
2 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2020-08-19' and nm.create_date=date_add('2020-08-19',-2)
union all
select
nm.mid_id,
nm.user_id ,
nm.version_code ,
nm.version_name ,
nm.lang ,
nm.source,
nm.os,
nm.area,
nm.model,
nm.brand,
nm.sdk_version,
nm.gmail,
nm.height_width,
nm.app_time,
nm.network,
nm.lng,
nm.lat,
nm.create_date,
3 retention_day
from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
where ud.dt='2020-08-19' and nm.create_date=date_add('2020-08-19',-3);
create external table ads_user_retention_day_count
(
`create_date` string comment '设备新增日期',
`retention_day` int comment '截止当前日期留存天数',
`retention_count` bigint comment '留存数量'
) COMMENT '每日用户留存情况'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_count/';
还没有评论,来说两句吧...