数据仓库DWS层-离线数仓-大数据

我会带着你远行 2022-11-30 04:18 294阅读 0赞

目标:统计当日、当周、当月活动的每个设备明细


  1. 每日活跃设备明细

    create external table dws_uv_detail_day
    (

    1. `mid_id` string COMMENT '设备唯一标识',
    2. `user_id` string COMMENT '用户标识',
    3. `version_code` string COMMENT '程序版本号',
    4. `version_name` string COMMENT '程序版本名',
    5. `lang` string COMMENT '系统语言',
    6. `source` string COMMENT '渠道号',
    7. `os` string COMMENT '安卓系统版本',
    8. `area` string COMMENT '区域',
    9. `model` string COMMENT '手机型号',
    10. `brand` string COMMENT '手机品牌',
    11. `sdk_version` string COMMENT 'sdkVersion',
    12. `gmail` string COMMENT 'gmail',
    13. `height_width` string COMMENT '屏幕宽高',
    14. `app_time` string COMMENT '客户端日志产生时的时间',
    15. `network` string COMMENT '网络模式',
    16. `lng` string COMMENT '经度',
    17. `lat` string COMMENT '纬度'

    )
    partitioned by(dt string)
    stored as parquet
    location ‘/warehouse/gmall/dws/dws_uv_detail_day’;

    step2. 从启动日志表导入数据

    insert overwrite table dws_uv_detail_day
    partition(dt=’2020-08-19’)
    select

    1. mid_id,
    2. concat_ws('|', collect_set(user_id)) user_id,
    3. concat_ws('|', collect_set(version_code)) version_code,
    4. concat_ws('|', collect_set(version_name)) version_name,
    5. concat_ws('|', collect_set(lang))lang,
    6. concat_ws('|', collect_set(source)) source,
    7. concat_ws('|', collect_set(os)) os,
    8. concat_ws('|', collect_set(area)) area,
    9. concat_ws('|', collect_set(model)) model,
    10. concat_ws('|', collect_set(brand)) brand,
    11. concat_ws('|', collect_set(sdk_version)) sdk_version,
    12. concat_ws('|', collect_set(gmail)) gmail,
    13. concat_ws('|', collect_set(height_width)) height_width,
    14. concat_ws('|', collect_set(app_time)) app_time,
    15. concat_ws('|', collect_set(network)) network,
    16. concat_ws('|', collect_set(lng)) lng,
    17. concat_ws('|', collect_set(lat)) lat

    from dwd_start_log
    where dt=’2020-08-19’
    group by mid_id;

    step3. 验证导入数据是否正确

    SELECT * FROM dws_uv_detail_day WHERE dt=’2020-08-19’ LIMIT 2;

    mid_id|user_id|version_code|version_name|lang |source|os |area|model |brand |sdk_version |gmail |height_width|app_time |network|lng |lat |dt |
    ———|———-|——————|——————|——-|———|—————-|——|———————|—————|——————-|——————————————————-|——————|—————————————-|———-|—————-|—————-|—————|
    0 |0 |14|13 |1.3.6|1.0.0 |es|en|D |8.1.3|8.1.7|MX |Huawei-7|HTC-3|Huawei|HTC|V2.7.0|V2.8.0|4H34OQ8C@gmail.com|617U6H5X@gmail.com|640960 |1597830241221|1597785492032|WIFI|3G|-68.1|-53.0|-43.1|-47.2|2020-08-19|
    1 |1 |15 |1.3.0 |en |A |8.1.5 |MX |sumsung-0 |Sumsung |V2.6.6 |QQ4V37RD@gmail.com |750
    1134 |1597827058186 |WIFI |-41.7 |-13.6 |2020-08-19|

  2. 每周活跃设备明细

    create external table dws_uv_detail_wk(

    1. `mid_id` string COMMENT '设备唯一标识',
    2. `user_id` string COMMENT '用户标识',
    3. `version_code` string COMMENT '程序版本号',
    4. `version_name` string COMMENT '程序版本名',
    5. `lang` string COMMENT '系统语言',
    6. `source` string COMMENT '渠道号',
    7. `os` string COMMENT '安卓系统版本',
    8. `area` string COMMENT '区域',
    9. `model` string COMMENT '手机型号',
    10. `brand` string COMMENT '手机品牌',
    11. `sdk_version` string COMMENT 'sdkVersion',
    12. `gmail` string COMMENT 'gmail',
    13. `height_width` string COMMENT '屏幕宽高',
    14. `app_time` string COMMENT '客户端日志产生时的时间',
    15. `network` string COMMENT '网络模式',
    16. `lng` string COMMENT '经度',
    17. `lat` string COMMENT '纬度',
    18. `monday_date` string COMMENT '周一日期',
    19. `sunday_date` string COMMENT '周日日期'

    ) COMMENT ‘活跃用户按周明细’
    PARTITIONED BY (wk_dt string)
    stored as parquet
    location ‘/warehouse/gmall/dws/dws_uv_detail_wk/‘;

    step2. 从日活跃表导入数据

    !!!这里一定要设置partition为nonstrict,否则会提示partition不是static

    set hive.exec.dynamic.partition.mode=nonstrict;

    insert overwrite table dws_uv_detail_wk partition(wk_dt)
    select

    1. mid_id,
    2. concat_ws('|', collect_set(user_id)) user_id,
    3. concat_ws('|', collect_set(version_code)) version_code,
    4. concat_ws('|', collect_set(version_name)) version_name,
    5. concat_ws('|', collect_set(lang)) lang,
    6. concat_ws('|', collect_set(source)) source,
    7. concat_ws('|', collect_set(os)) os,
    8. concat_ws('|', collect_set(area)) area,
    9. concat_ws('|', collect_set(model)) model,
    10. concat_ws('|', collect_set(brand)) brand,
    11. concat_ws('|', collect_set(sdk_version)) sdk_version,
    12. concat_ws('|', collect_set(gmail)) gmail,
    13. concat_ws('|', collect_set(height_width)) height_width,
    14. concat_ws('|', collect_set(app_time)) app_time,
    15. concat_ws('|', collect_set(network)) network,
    16. concat_ws('|', collect_set(lng)) lng,
    17. concat_ws('|', collect_set(lat)) lat,
    18. date_add(next_day('2020-08-19','MO'),-7),
    19. date_add(next_day('2020-08-19','MO'),-1),
    20. concat(date_add( next_day('2020-08-19','MO'),-7), '_' , date_add(next_day('2020-08-19','MO'),-1)

    )
    from dws_uv_detail_day
    where dt>=date_add(next_day(‘2020-08-19’,’MO’),-7) and dt<=date_add(next_day(‘2020-08-19’,’MO’),-1)
    group by mid_id;

    step3. 验证导入数据是否正确

    SELECT * FROM dws_uv_detail_wk LIMIT 2;

    mid_id|user_id|version_code|version_name|lang |source|os |area|model |brand |sdk_version |gmail |height_width|app_time |network|lng |lat |monday_date|sunday_date|wk_dt |
    ———|———-|——————|——————|——-|———|—————-|——|———————|—————|——————-|——————————————————-|——————|—————————————-|———-|—————-|—————-|—————-|—————-|——————————-|
    0 |0 |14|13 |1.3.6|1.0.0 |es|en|D |8.1.3|8.1.7|MX |Huawei-7|HTC-3|Huawei|HTC|V2.7.0|V2.8.0|4H34OQ8C@gmail.com|617U6H5X@gmail.com|640960 |1597830241221|1597785492032|WIFI|3G|-68.1|-53.0|-43.1|-47.2|2020-08-17 |2020-08-23 |2020-08-17_2020-08-23|
    1 |1 |15 |1.3.0 |en |A |8.1.5 |MX |sumsung-0 |Sumsung |V2.6.6 |QQ4V37RD@gmail.com |750
    1134 |1597827058186 |WIFI |-41.7 |-13.6 |2020-08-17 |2020-08-23 |2020-08-17_2020-08-23|

    select count(*) from dws_uv_detail_wk;

    _c0|
    —-|
    708|

  3. 每月活跃设备明细

    create external table dws_uv_detail_mn(

    1. `mid_id` string COMMENT '设备唯一标识',
    2. `user_id` string COMMENT '用户标识',
    3. `version_code` string COMMENT '程序版本号',
    4. `version_name` string COMMENT '程序版本名',
    5. `lang` string COMMENT '系统语言',
    6. `source` string COMMENT '渠道号',
    7. `os` string COMMENT '安卓系统版本',
    8. `area` string COMMENT '区域',
    9. `model` string COMMENT '手机型号',
    10. `brand` string COMMENT '手机品牌',
    11. `sdk_version` string COMMENT 'sdkVersion',
    12. `gmail` string COMMENT 'gmail',
    13. `height_width` string COMMENT '屏幕宽高',
    14. `app_time` string COMMENT '客户端日志产生时的时间',
    15. `network` string COMMENT '网络模式',
    16. `lng` string COMMENT '经度',
    17. `lat` string COMMENT '纬度'

    ) COMMENT ‘活跃用户按月明细’
    PARTITIONED BY (mn string)
    stored as parquet
    location ‘/warehouse/gmall/dws/dws_uv_detail_mn/‘;

    step2. 从日活表导入数据

    !!!一定要设置partition的mode

    set hive.exec.dynamic.partition.mode=nonstrict;

    insert overwrite table dws_uv_detail_mn partition(mn)
    select

    1. mid_id,
    2. concat_ws('|', collect_set(user_id)) user_id,
    3. concat_ws('|', collect_set(version_code)) version_code,
    4. concat_ws('|', collect_set(version_name)) version_name,
    5. concat_ws('|', collect_set(lang)) lang,
    6. concat_ws('|', collect_set(source)) source,
    7. concat_ws('|', collect_set(os)) os,
    8. concat_ws('|', collect_set(area)) area,
    9. concat_ws('|', collect_set(model)) model,
    10. concat_ws('|', collect_set(brand)) brand,
    11. concat_ws('|', collect_set(sdk_version)) sdk_version,
    12. concat_ws('|', collect_set(gmail)) gmail,
    13. concat_ws('|', collect_set(height_width)) height_width,
    14. concat_ws('|', collect_set(app_time)) app_time,
    15. concat_ws('|', collect_set(network)) network,
    16. concat_ws('|', collect_set(lng)) lng,
    17. concat_ws('|', collect_set(lat)) lat,
    18. date_format('2019-02-10','yyyy-MM')

    from dws_uv_detail_day
    where date_format(dt,’yyyy-MM’) = date_format(‘2019-02-10’,’yyyy-MM’)
    group by mid_id;

    step3. 从日活跃表导入数据

    SELECT * FROM dws_uv_detail_mn LIMIT 2;

    mid_id|user_id|version_code|version_name|lang |source|os |area|model |brand |sdk_version |gmail |height_width|app_time |network|lng |lat |mn |
    ———|———-|——————|——————|——-|———|—————-|——|———————|—————|——————-|——————————————————-|——————|—————————————-|———-|—————-|—————-|———-|
    0 |0 |14|13 |1.3.6|1.0.0 |es|en|D |8.1.3|8.1.7|MX |Huawei-7|HTC-3|Huawei|HTC|V2.7.0|V2.8.0|4H34OQ8C@gmail.com|617U6H5X@gmail.com|640960 |1597830241221|1597785492032|WIFI|3G|-68.1|-53.0|-43.1|-47.2|2020-08|
    1 |1 |15 |1.3.0 |en |A |8.1.5 |MX |sumsung-0 |Sumsung |V2.6.6 |QQ4V37RD@gmail.com |750
    1134 |1597827058186 |WIFI |-41.7 |-13.6 |2020-08|

    select count(*) from dws_uv_detail_mn ;

    _c0|
    —-|
    708|


活跃用户详情导入成功,接下来编写脚本

  1. #!/bin/bash
  2. # 定义变量方便修改
  3. APP=gmall
  4. hive=/opt/module/hive/bin/hive
  5. # 如果是输入的日期按照取输入日期;如果没输入日期取当前时间的前一天
  6. if [ -n "$1" ] ;then
  7. do_date=$1
  8. else
  9. do_date=`date -d "-1 day" +%F`
  10. fi
  11. sql="
  12. set hive.exec.dynamic.partition.mode=nonstrict;
  13. insert overwrite table "$APP".dws_uv_detail_day partition(dt='$do_date')
  14. select
  15. mid_id,
  16. concat_ws('|', collect_set(user_id)) user_id,
  17. concat_ws('|', collect_set(version_code)) version_code,
  18. concat_ws('|', collect_set(version_name)) version_name,
  19. concat_ws('|', collect_set(lang)) lang,
  20. concat_ws('|', collect_set(source)) source,
  21. concat_ws('|', collect_set(os)) os,
  22. concat_ws('|', collect_set(area)) area,
  23. concat_ws('|', collect_set(model)) model,
  24. concat_ws('|', collect_set(brand)) brand,
  25. concat_ws('|', collect_set(sdk_version)) sdk_version,
  26. concat_ws('|', collect_set(gmail)) gmail,
  27. concat_ws('|', collect_set(height_width)) height_width,
  28. concat_ws('|', collect_set(app_time)) app_time,
  29. concat_ws('|', collect_set(network)) network,
  30. concat_ws('|', collect_set(lng)) lng,
  31. concat_ws('|', collect_set(lat)) lat
  32. from "$APP".dwd_start_log
  33. where dt='$do_date'
  34. group by mid_id;
  35. insert overwrite table "$APP".dws_uv_detail_wk partition(wk_dt)
  36. select
  37. mid_id,
  38. concat_ws('|', collect_set(user_id)) user_id,
  39. concat_ws('|', collect_set(version_code)) version_code,
  40. concat_ws('|', collect_set(version_name)) version_name,
  41. concat_ws('|', collect_set(lang)) lang,
  42. concat_ws('|', collect_set(source)) source,
  43. concat_ws('|', collect_set(os)) os,
  44. concat_ws('|', collect_set(area)) area,
  45. concat_ws('|', collect_set(model)) model,
  46. concat_ws('|', collect_set(brand)) brand,
  47. concat_ws('|', collect_set(sdk_version)) sdk_version,
  48. concat_ws('|', collect_set(gmail)) gmail,
  49. concat_ws('|', collect_set(height_width)) height_width,
  50. concat_ws('|', collect_set(app_time)) app_time,
  51. concat_ws('|', collect_set(network)) network,
  52. concat_ws('|', collect_set(lng)) lng,
  53. concat_ws('|', collect_set(lat)) lat,
  54. date_add(next_day('$do_date','MO'),-7),
  55. date_add(next_day('$do_date','MO'),-1),
  56. concat(date_add( next_day('$do_date','MO'),-7), '_' , date_add(next_day('$do_date','MO'),-1)
  57. )
  58. from "$APP".dws_uv_detail_day
  59. where dt>=date_add(next_day('$do_date','MO'),-7) and dt<=date_add(next_day('$do_date','MO'),-1)
  60. group by mid_id;
  61. insert overwrite table "$APP".dws_uv_detail_mn partition(mn)
  62. select
  63. mid_id,
  64. concat_ws('|', collect_set(user_id)) user_id,
  65. concat_ws('|', collect_set(version_code)) version_code,
  66. concat_ws('|', collect_set(version_name)) version_name,
  67. concat_ws('|', collect_set(lang))lang,
  68. concat_ws('|', collect_set(source)) source,
  69. concat_ws('|', collect_set(os)) os,
  70. concat_ws('|', collect_set(area)) area,
  71. concat_ws('|', collect_set(model)) model,
  72. concat_ws('|', collect_set(brand)) brand,
  73. concat_ws('|', collect_set(sdk_version)) sdk_version,
  74. concat_ws('|', collect_set(gmail)) gmail,
  75. concat_ws('|', collect_set(height_width)) height_width,
  76. concat_ws('|', collect_set(app_time)) app_time,
  77. concat_ws('|', collect_set(network)) network,
  78. concat_ws('|', collect_set(lng)) lng,
  79. concat_ws('|', collect_set(lat)) lat,
  80. date_format('$do_date','yyyy-MM')
  81. from "$APP".dws_uv_detail_day
  82. where date_format(dt,'yyyy-MM') = date_format('$do_date','yyyy-MM')
  83. group by mid_id;
  84. "
  85. $hive -e "$sql"

每日新增设备

  1. create external table dws_new_mid_day
  2. (
  3. `mid_id` string COMMENT '设备唯一标识',
  4. `user_id` string COMMENT '用户标识',
  5. `version_code` string COMMENT '程序版本号',
  6. `version_name` string COMMENT '程序版本名',
  7. `lang` string COMMENT '系统语言',
  8. `source` string COMMENT '渠道号',
  9. `os` string COMMENT '安卓系统版本',
  10. `area` string COMMENT '区域',
  11. `model` string COMMENT '手机型号',
  12. `brand` string COMMENT '手机品牌',
  13. `sdk_version` string COMMENT 'sdkVersion',
  14. `gmail` string COMMENT 'gmail',
  15. `height_width` string COMMENT '屏幕宽高',
  16. `app_time` string COMMENT '客户端日志产生时的时间',
  17. `network` string COMMENT '网络模式',
  18. `lng` string COMMENT '经度',
  19. `lat` string COMMENT '纬度',
  20. `create_date` string comment '创建时间'
  21. ) COMMENT '每日新增设备信息'
  22. stored as parquet
  23. location '/warehouse/gmall/dws/dws_new_mid_day/';
  24. #step2. 从每日用户活跃表导入数据
  25. insert into table dws_new_mid_day
  26. select
  27. ud.mid_id,
  28. ud.user_id ,
  29. ud.version_code ,
  30. ud.version_name ,
  31. ud.lang ,
  32. ud.source,
  33. ud.os,
  34. ud.area,
  35. ud.model,
  36. ud.brand,
  37. ud.sdk_version,
  38. ud.gmail,
  39. ud.height_width,
  40. ud.app_time,
  41. ud.network,
  42. ud.lng,
  43. ud.lat,
  44. '2020-08-19'
  45. from dws_uv_detail_day ud left join dws_new_mid_day nm on ud.mid_id=nm.mid_id
  46. where ud.dt='2020-08-19' and nm.mid_id is null;
  47. #step3. 验证导入数据是否正确
  48. SELECT * FROM dws_new_mid_day LIMIT 2;
  49. mid_id|user_id|version_code|version_name|lang |source|os |area|model |brand |sdk_version |gmail |height_width|app_time |network|lng |lat |create_date|
  50. ------|-------|------------|------------|-----|------|-----------|----|--------------|----------|-------------|-------------------------------------|------------|---------------------------|-------|-----------|-----------|-----------|
  51. 0 |0 |14|13 |1.3.6|1.0.0 |es|en|D |8.1.3|8.1.7|MX |Huawei-7|HTC-3|Huawei|HTC|V2.7.0|V2.8.0|4H34OQ8C@gmail.com|617U6H5X@gmail.com|640*960 |1597830241221|1597785492032|WIFI|3G|-68.1|-53.0|-43.1|-47.2|2020-08-19 |
  52. 1 |1 |15 |1.3.0 |en |A |8.1.5 |MX |sumsung-0 |Sumsung |V2.6.6 |QQ4V37RD@gmail.com |750*1134 |1597827058186 |WIFI |-41.7 |-13.6 |2020-08-19 |

日新增详情表导入脚本

  1. #!/bin/bash
  2. APP=gmall
  3. hive=$HIVE_HOME/bin/hive
  4. if [ -n "$1" ];then
  5. do_date=$1
  6. else
  7. do_date=`date -d "-1 day" +%F`
  8. fi
  9. sql="
  10. INSERT INTO TABLE "$APP".dws_new_mid_day
  11. SELECT
  12. ud.mid_id,
  13. ud.user_id,
  14. ud.version_code,
  15. ud.version_name,
  16. ud.lang,
  17. ud.source,
  18. ud.os,
  19. ud.area,
  20. ud.model,
  21. ud.brand,
  22. ud.sdk_version,
  23. ud.gmail,
  24. ud.height_width,
  25. ud.app_time,
  26. ud.network,
  27. ud.lng,
  28. ud.lat,
  29. '$do_date'
  30. FROM "$APP".dws_uv_detail_day ud
  31. LEFT JOIN "$APP".dws_new_mid_day nm on ud.mid_id=nm.mid_id
  32. WHERE ud.dt='$do_date' and nm.mid_id is null;
  33. "
  34. $hive -e "$sql"

最后是每日留存明细表

  1. create external table dws_user_retention_day
  2. (
  3. `mid_id` string COMMENT '设备唯一标识',
  4. `user_id` string COMMENT '用户标识',
  5. `version_code` string COMMENT '程序版本号',
  6. `version_name` string COMMENT '程序版本名',
  7. `lang` string COMMENT '系统语言',
  8. `source` string COMMENT '渠道号',
  9. `os` string COMMENT '安卓系统版本',
  10. `area` string COMMENT '区域',
  11. `model` string COMMENT '手机型号',
  12. `brand` string COMMENT '手机品牌',
  13. `sdk_version` string COMMENT 'sdkVersion',
  14. `gmail` string COMMENT 'gmail',
  15. `height_width` string COMMENT '屏幕宽高',
  16. `app_time` string COMMENT '客户端日志产生时的时间',
  17. `network` string COMMENT '网络模式',
  18. `lng` string COMMENT '经度',
  19. `lat` string COMMENT '纬度',
  20. `create_date` string comment '设备新增时间',
  21. `retention_day` int comment '截止当前日期留存天数'
  22. ) COMMENT '每日用户留存情况'
  23. PARTITIONED BY (`dt` string)
  24. stored as parquet
  25. location '/warehouse/gmall/dws/dws_user_retention_day/';
  26. #step2. 从用户每日详情表导入数据,如果用户留存表里没有新的用户详情,则发现一个新增用户
  27. insert overwrite table dws_user_retention_day
  28. partition(dt="2020-08-19")
  29. select
  30. nm.mid_id,
  31. nm.user_id,
  32. nm.version_code,
  33. nm.version_name,
  34. nm.lang,
  35. nm.source,
  36. nm.os,
  37. nm.area,
  38. nm.model,
  39. nm.brand,
  40. nm.sdk_version,
  41. nm.gmail,
  42. nm.height_width,
  43. nm.app_time,
  44. nm.network,
  45. nm.lng,
  46. nm.lat,
  47. nm.create_date,
  48. 1 retention_day
  49. from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
  50. where ud.dt='2020-08-19' and nm.create_date=date_add('2020-08-19',-1)
  51. union all
  52. select
  53. nm.mid_id,
  54. nm.user_id ,
  55. nm.version_code ,
  56. nm.version_name ,
  57. nm.lang ,
  58. nm.source,
  59. nm.os,
  60. nm.area,
  61. nm.model,
  62. nm.brand,
  63. nm.sdk_version,
  64. nm.gmail,
  65. nm.height_width,
  66. nm.app_time,
  67. nm.network,
  68. nm.lng,
  69. nm.lat,
  70. nm.create_date,
  71. 2 retention_day
  72. from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
  73. where ud.dt='2020-08-19' and nm.create_date=date_add('2020-08-19',-2)
  74. union all
  75. select
  76. nm.mid_id,
  77. nm.user_id ,
  78. nm.version_code ,
  79. nm.version_name ,
  80. nm.lang ,
  81. nm.source,
  82. nm.os,
  83. nm.area,
  84. nm.model,
  85. nm.brand,
  86. nm.sdk_version,
  87. nm.gmail,
  88. nm.height_width,
  89. nm.app_time,
  90. nm.network,
  91. nm.lng,
  92. nm.lat,
  93. nm.create_date,
  94. 3 retention_day
  95. from dws_uv_detail_day ud join dws_new_mid_day nm on ud.mid_id =nm.mid_id
  96. where ud.dt='2020-08-19' and nm.create_date=date_add('2020-08-19',-3);
  97. create external table ads_user_retention_day_count
  98. (
  99. `create_date` string comment '设备新增日期',
  100. `retention_day` int comment '截止当前日期留存天数',
  101. `retention_count` bigint comment '留存数量'
  102. ) COMMENT '每日用户留存情况'
  103. row format delimited fields terminated by '\t'
  104. location '/warehouse/gmall/ads/ads_user_retention_day_count/';

发表评论

表情:
评论列表 (有 0 条评论,294人围观)

还没有评论,来说两句吧...

相关阅读