Offline Data Warehouse (Part 12): Building the DIM Layer


Contents

  • I. Product (SKU) Dimension Table (Full)
      1. Hive reading LZO index files
      2. First-day load
      3. Daily load
  • II. Coupon Dimension Table (Full)
      1. First-day load
      2. Daily load
  • III. Activity Dimension Table (Full)
      1. First-day load
      2. Daily load
  • IV. Region Dimension Table (Special)
  • V. Date Dimension Table (Special)
      1. Create a temporary table
      2. Import the data
  • VI. User Dimension Table (Zipper Table)
      1. Zipper table overview
      2. Building the zipper table
      3. First-day load
      4. Daily load
  • VII. DIM Layer First-Day Data Load Script
  • VIII. DIM Layer Daily Data Load Script

I. Product (SKU) Dimension Table (Full)

A. Table creation statement

  1. DROP TABLE IF EXISTS dim_sku_info;
  2. CREATE EXTERNAL TABLE dim_sku_info (
  3. `id` STRING COMMENT '商品id',
  4. `price` DECIMAL(16,2) COMMENT '商品价格',
  5. `sku_name` STRING COMMENT '商品名称',
  6. `sku_desc` STRING COMMENT '商品描述',
  7. `weight` DECIMAL(16,2) COMMENT '重量',
  8. `is_sale` BOOLEAN COMMENT '是否在售',
  9. `spu_id` STRING COMMENT 'spu编号',
  10. `spu_name` STRING COMMENT 'spu名称',
  11. `category3_id` STRING COMMENT '三级分类id',
  12. `category3_name` STRING COMMENT '三级分类名称',
  13. `category2_id` STRING COMMENT '二级分类id',
  14. `category2_name` STRING COMMENT '二级分类名称',
  15. `category1_id` STRING COMMENT '一级分类id',
  16. `category1_name` STRING COMMENT '一级分类名称',
  17. `tm_id` STRING COMMENT '品牌id',
  18. `tm_name` STRING COMMENT '品牌名称',
  19. `sku_attr_values` ARRAY<STRUCT<attr_id:STRING,value_id:STRING,attr_name:STRING,value_name:STRING>> COMMENT '平台属性',
  20. `sku_sale_attr_values` ARRAY<STRUCT<sale_attr_id:STRING,sale_attr_value_id:STRING,sale_attr_name:STRING,sale_attr_value_name:STRING>> COMMENT '销售属性',
  21. `create_time` STRING COMMENT '创建时间'
  22. ) COMMENT '商品维度表'
  23. PARTITIONED BY (`dt` STRING)
  24. STORED AS PARQUET
  25. LOCATION '/warehouse/gmall/dim/dim_sku_info/'
  26. TBLPROPERTIES ("parquet.compression"="lzo");

B. Partition plan


C. Data loading

1. Hive reading LZO index files

(1) Count the rows of the table in two different ways:

  1. hive (gmall)> select * from ods_log;
  2. Time taken: 0.706 seconds, Fetched: 2955 row(s)
  3. hive (gmall)> select count(*) from ods_log;
  4. 2959

(2) The two results do not match.

The reason: select * from ods_log does not launch a MapReduce job; it reads the table directly with DeprecatedLzoTextInputFormat, the input format specified in the ods_log table definition, which recognizes lzo.index files as index files and skips them.
select count(*) from ods_log does launch a MapReduce job, which goes through hive.input.format, whose default value is CombineHiveInputFormat. That format merges the index files as if they were ordinary small data files, so they get counted as data. Worse still, it also prevents the LZO files from being split.

  1. hive (gmall)>
  2. hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;

Solution: change CombineHiveInputFormat to HiveInputFormat:

  1. hive (gmall)>
  2. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
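
The set command above only affects the current session. As a quick, illustrative sanity check, re-running the MapReduce-based count in the same session should now agree with the row count reported by select * from ods_log (2955 in the example above):

  -- MR-based count with HiveInputFormat in effect; index files are no longer counted as data
  select count(*) from ods_log;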

2. First-day load

  1. with
  2. sku as
  3. (
  4. select
  5. id,
  6. price,
  7. sku_name,
  8. sku_desc,
  9. weight,
  10. is_sale,
  11. spu_id,
  12. category3_id,
  13. tm_id,
  14. create_time
  15. from ods_sku_info
  16. where dt='2020-06-14'
  17. ),
  18. spu as
  19. (
  20. select
  21. id,
  22. spu_name
  23. from ods_spu_info
  24. where dt='2020-06-14'
  25. ),
  26. c3 as
  27. (
  28. select
  29. id,
  30. name,
  31. category2_id
  32. from ods_base_category3
  33. where dt='2020-06-14'
  34. ),
  35. c2 as
  36. (
  37. select
  38. id,
  39. name,
  40. category1_id
  41. from ods_base_category2
  42. where dt='2020-06-14'
  43. ),
  44. c1 as
  45. (
  46. select
  47. id,
  48. name
  49. from ods_base_category1
  50. where dt='2020-06-14'
  51. ),
  52. tm as
  53. (
  54. select
  55. id,
  56. tm_name
  57. from ods_base_trademark
  58. where dt='2020-06-14'
  59. ),
  60. attr as
  61. (
  62. select
  63. sku_id,
  64. collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
  65. from ods_sku_attr_value
  66. where dt='2020-06-14'
  67. group by sku_id
  68. ),
  69. sale_attr as
  70. (
  71. select
  72. sku_id,
  73. collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
  74. from ods_sku_sale_attr_value
  75. where dt='2020-06-14'
  76. group by sku_id
  77. )
  78. insert overwrite table dim_sku_info partition(dt='2020-06-14')
  79. select
  80. sku.id,
  81. sku.price,
  82. sku.sku_name,
  83. sku.sku_desc,
  84. sku.weight,
  85. sku.is_sale,
  86. sku.spu_id,
  87. spu.spu_name,
  88. sku.category3_id,
  89. c3.name,
  90. c3.category2_id,
  91. c2.name,
  92. c2.category1_id,
  93. c1.name,
  94. sku.tm_id,
  95. tm.tm_name,
  96. attr.attrs,
  97. sale_attr.sale_attrs,
  98. sku.create_time
  99. from sku
  100. left join spu on sku.spu_id=spu.id
  101. left join c3 on sku.category3_id=c3.id
  102. left join c2 on c3.category2_id=c2.id
  103. left join c1 on c2.category1_id=c1.id
  104. left join tm on sku.tm_id=tm.id
  105. left join attr on sku.id=attr.sku_id
  106. left join sale_attr on sku.id=sale_attr.sku_id;
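
An illustrative way to verify the load (using the same table and date as above):

  -- Row count of the newly written partition
  select count(*) from dim_sku_info where dt='2020-06-14';
  -- Spot-check that the category and trademark lookups joined through
  select id, sku_name, category1_name, tm_name from dim_sku_info where dt='2020-06-14' limit 5;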

3. Daily load

  1. with
  2. sku as
  3. (
  4. select
  5. id,
  6. price,
  7. sku_name,
  8. sku_desc,
  9. weight,
  10. is_sale,
  11. spu_id,
  12. category3_id,
  13. tm_id,
  14. create_time
  15. from ods_sku_info
  16. where dt='2020-06-15'
  17. ),
  18. spu as
  19. (
  20. select
  21. id,
  22. spu_name
  23. from ods_spu_info
  24. where dt='2020-06-15'
  25. ),
  26. c3 as
  27. (
  28. select
  29. id,
  30. name,
  31. category2_id
  32. from ods_base_category3
  33. where dt='2020-06-15'
  34. ),
  35. c2 as
  36. (
  37. select
  38. id,
  39. name,
  40. category1_id
  41. from ods_base_category2
  42. where dt='2020-06-15'
  43. ),
  44. c1 as
  45. (
  46. select
  47. id,
  48. name
  49. from ods_base_category1
  50. where dt='2020-06-15'
  51. ),
  52. tm as
  53. (
  54. select
  55. id,
  56. tm_name
  57. from ods_base_trademark
  58. where dt='2020-06-15'
  59. ),
  60. attr as
  61. (
  62. select
  63. sku_id,
  64. collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
  65. from ods_sku_attr_value
  66. where dt='2020-06-15'
  67. group by sku_id
  68. ),
  69. sale_attr as
  70. (
  71. select
  72. sku_id,
  73. collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
  74. from ods_sku_sale_attr_value
  75. where dt='2020-06-15'
  76. group by sku_id
  77. )
  78. insert overwrite table dim_sku_info partition(dt='2020-06-15')
  79. select
  80. sku.id,
  81. sku.price,
  82. sku.sku_name,
  83. sku.sku_desc,
  84. sku.weight,
  85. sku.is_sale,
  86. sku.spu_id,
  87. spu.spu_name,
  88. sku.category3_id,
  89. c3.name,
  90. c3.category2_id,
  91. c2.name,
  92. c2.category1_id,
  93. c1.name,
  94. sku.tm_id,
  95. tm.tm_name,
  96. attr.attrs,
  97. sale_attr.sale_attrs,
  98. sku.create_time
  99. from sku
  100. left join spu on sku.spu_id=spu.id
  101. left join c3 on sku.category3_id=c3.id
  102. left join c2 on c3.category2_id=c2.id
  103. left join c1 on c2.category1_id=c1.id
  104. left join tm on sku.tm_id=tm.id
  105. left join attr on sku.id=attr.sku_id
  106. left join sale_attr on sku.id=sale_attr.sku_id;

II. Coupon Dimension Table (Full)

A. Table creation statement

  1. DROP TABLE IF EXISTS dim_coupon_info;
  2. CREATE EXTERNAL TABLE dim_coupon_info(
  3. `id` STRING COMMENT '购物券编号',
  4. `coupon_name` STRING COMMENT '购物券名称',
  5. `coupon_type` STRING COMMENT '购物券类型 1 现金券 2 折扣券 3 满减券 4 满件打折券',
  6. `condition_amount` DECIMAL(16,2) COMMENT '满额数',
  7. `condition_num` BIGINT COMMENT '满件数',
  8. `activity_id` STRING COMMENT '活动编号',
  9. `benefit_amount` DECIMAL(16,2) COMMENT '减金额',
  10. `benefit_discount` DECIMAL(16,2) COMMENT '折扣',
  11. `create_time` STRING COMMENT '创建时间',
  12. `range_type` STRING COMMENT '范围类型 1、商品 2、品类 3、品牌',
  13. `limit_num` BIGINT COMMENT '最多领取次数',
  14. `taken_count` BIGINT COMMENT '已领取次数',
  15. `start_time` STRING COMMENT '可以领取的开始日期',
  16. `end_time` STRING COMMENT '可以领取的结束日期',
  17. `operate_time` STRING COMMENT '修改时间',
  18. `expire_time` STRING COMMENT '过期时间'
  19. ) COMMENT '优惠券维度表'
  20. PARTITIONED BY (`dt` STRING)
  21. STORED AS PARQUET
  22. LOCATION '/warehouse/gmall/dim/dim_coupon_info/'
  23. TBLPROPERTIES ("parquet.compression"="lzo");

B. Partition plan


C. Data loading


1. First-day load

  1. insert overwrite table dim_coupon_info partition(dt='2020-06-14')
  2. select
  3. id,
  4. coupon_name,
  5. coupon_type,
  6. condition_amount,
  7. condition_num,
  8. activity_id,
  9. benefit_amount,
  10. benefit_discount,
  11. create_time,
  12. range_type,
  13. limit_num,
  14. taken_count,
  15. start_time,
  16. end_time,
  17. operate_time,
  18. expire_time
  19. from ods_coupon_info
  20. where dt='2020-06-14';

2. Daily load

  1. insert overwrite table dim_coupon_info partition(dt='2020-06-15')
  2. select
  3. id,
  4. coupon_name,
  5. coupon_type,
  6. condition_amount,
  7. condition_num,
  8. activity_id,
  9. benefit_amount,
  10. benefit_discount,
  11. create_time,
  12. range_type,
  13. limit_num,
  14. taken_count,
  15. start_time,
  16. end_time,
  17. operate_time,
  18. expire_time
  19. from ods_coupon_info
  20. where dt='2020-06-15';

III. Activity Dimension Table (Full)

A. Table creation statement

  1. DROP TABLE IF EXISTS dim_activity_rule_info;
  2. CREATE EXTERNAL TABLE dim_activity_rule_info(
  3. `activity_rule_id` STRING COMMENT '活动规则ID',
  4. `activity_id` STRING COMMENT '活动ID',
  5. `activity_name` STRING COMMENT '活动名称',
  6. `activity_type` STRING COMMENT '活动类型',
  7. `start_time` STRING COMMENT '开始时间',
  8. `end_time` STRING COMMENT '结束时间',
  9. `create_time` STRING COMMENT '创建时间',
  10. `condition_amount` DECIMAL(16,2) COMMENT '满减金额',
  11. `condition_num` BIGINT COMMENT '满减件数',
  12. `benefit_amount` DECIMAL(16,2) COMMENT '优惠金额',
  13. `benefit_discount` DECIMAL(16,2) COMMENT '优惠折扣',
  14. `benefit_level` STRING COMMENT '优惠级别'
  15. ) COMMENT '活动信息表'
  16. PARTITIONED BY (`dt` STRING)
  17. STORED AS PARQUET
  18. LOCATION '/warehouse/gmall/dim/dim_activity_rule_info/'
  19. TBLPROPERTIES ("parquet.compression"="lzo");

B. Partition plan

C. Data loading


1. First-day load

  1. insert overwrite table dim_activity_rule_info partition(dt='2020-06-14')
  2. select
  3. ar.id,
  4. ar.activity_id,
  5. ai.activity_name,
  6. ar.activity_type,
  7. ai.start_time,
  8. ai.end_time,
  9. ai.create_time,
  10. ar.condition_amount,
  11. ar.condition_num,
  12. ar.benefit_amount,
  13. ar.benefit_discount,
  14. ar.benefit_level
  15. from
  16. (
  17. select
  18. id,
  19. activity_id,
  20. activity_type,
  21. condition_amount,
  22. condition_num,
  23. benefit_amount,
  24. benefit_discount,
  25. benefit_level
  26. from ods_activity_rule
  27. where dt='2020-06-14'
  28. )ar
  29. left join
  30. (
  31. select
  32. id,
  33. activity_name,
  34. start_time,
  35. end_time,
  36. create_time
  37. from ods_activity_info
  38. where dt='2020-06-14'
  39. )ai
  40. on ar.activity_id=ai.id;
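
Because activity rules are left-joined to activity info, a rule whose activity_id has no match would come through with a NULL activity_name. An illustrative check (should normally return 0):

  select count(*) from dim_activity_rule_info
  where dt='2020-06-14' and activity_name is null;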

2. Daily load

  1. insert overwrite table dim_activity_rule_info partition(dt='2020-06-15')
  2. select
  3. ar.id,
  4. ar.activity_id,
  5. ai.activity_name,
  6. ar.activity_type,
  7. ai.start_time,
  8. ai.end_time,
  9. ai.create_time,
  10. ar.condition_amount,
  11. ar.condition_num,
  12. ar.benefit_amount,
  13. ar.benefit_discount,
  14. ar.benefit_level
  15. from
  16. (
  17. select
  18. id,
  19. activity_id,
  20. activity_type,
  21. condition_amount,
  22. condition_num,
  23. benefit_amount,
  24. benefit_discount,
  25. benefit_level
  26. from ods_activity_rule
  27. where dt='2020-06-15'
  28. )ar
  29. left join
  30. (
  31. select
  32. id,
  33. activity_name,
  34. start_time,
  35. end_time,
  36. create_time
  37. from ods_activity_info
  38. where dt='2020-06-15'
  39. )ai
  40. on ar.activity_id=ai.id;

IV. Region Dimension Table (Special)

A. Table creation statement

  1. DROP TABLE IF EXISTS dim_base_province;
  2. CREATE EXTERNAL TABLE dim_base_province (
  3. `id` STRING COMMENT 'id',
  4. `province_name` STRING COMMENT '省市名称',
  5. `area_code` STRING COMMENT '地区编码',
  6. `iso_code` STRING COMMENT 'ISO-3166编码,供可视化使用',
7. `iso_3166_2` STRING COMMENT 'ISO-3166-2编码,供可视化使用',
  8. `region_id` STRING COMMENT '地区id',
  9. `region_name` STRING COMMENT '地区名称'
  10. ) COMMENT '地区维度表'
  11. STORED AS PARQUET
  12. LOCATION '/warehouse/gmall/dim/dim_base_province/'
  13. TBLPROPERTIES ("parquet.compression"="lzo");

B. Data loading

The region dimension data is fairly stable and rarely changes, so it does not need to be reloaded every day.


  1. insert overwrite table dim_base_province
  2. select
  3. bp.id,
  4. bp.name,
  5. bp.area_code,
  6. bp.iso_code,
  7. bp.iso_3166_2,
  8. bp.region_id,
  9. br.region_name
  10. from ods_base_province bp
  11. join ods_base_region br on bp.region_id = br.id;
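
Since an inner join is used, any province whose region_id has no match in ods_base_region is dropped. An illustrative cross-check of the row counts (add a dt filter to the first query if your ods_base_province is partitioned by day):

  select count(*) from ods_base_province;
  select count(*) from dim_base_province;   -- a smaller count here means rows were dropped by the join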

V. Date Dimension Table (Special)

A. Table creation statement

  1. DROP TABLE IF EXISTS dim_date_info;
  2. CREATE EXTERNAL TABLE dim_date_info(
  3. `date_id` STRING COMMENT '日',
  4. `week_id` STRING COMMENT '周ID',
  5. `week_day` STRING COMMENT '周几',
  6. `day` STRING COMMENT '每月的第几天',
  7. `month` STRING COMMENT '第几月',
  8. `quarter` STRING COMMENT '第几季度',
  9. `year` STRING COMMENT '年',
  10. `is_workday` STRING COMMENT '是否是工作日',
  11. `holiday_id` STRING COMMENT '节假日'
  12. ) COMMENT '时间维度表'
  13. STORED AS PARQUET
  14. LOCATION '/warehouse/gmall/dim/dim_date_info/'
  15. TBLPROPERTIES ("parquet.compression"="lzo");

B. Data loading

The data for the date dimension table normally does not come from the business systems; it is written by hand. Because the data is fully predictable, there is no need to load it daily — a whole year can be loaded in one go.

1. Create a temporary table

DROP TABLE IF EXISTS tmp_dim_date_info;

  1. CREATE EXTERNAL TABLE tmp_dim_date_info (
  2. `date_id` STRING COMMENT '日',
  3. `week_id` STRING COMMENT '周ID',
  4. `week_day` STRING COMMENT '周几',
  5. `day` STRING COMMENT '每月的第几天',
  6. `month` STRING COMMENT '第几月',
  7. `quarter` STRING COMMENT '第几季度',
  8. `year` STRING COMMENT '年',
  9. `is_workday` STRING COMMENT '是否是工作日',
  10. `holiday_id` STRING COMMENT '节假日'
  11. ) COMMENT '时间维度表'
  12. ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
  13. LOCATION '/warehouse/gmall/tmp/tmp_dim_date_info/';

2. Import the data

Upload the data file to the temporary table's path on HDFS: /warehouse/gmall/tmp/tmp_dim_date_info/
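
The data file is usually prepared by hand (for example in a spreadsheet) and then uploaded to the path above. If no ready-made file is at hand, the rows can also be generated inside Hive. The sketch below is only an approximation: it fills the year 2020, numbers weekdays 1-7, and marks only Saturday/Sunday as non-workdays while ignoring statutory holidays, so adjust it to your own calendar before relying on it.

  -- Illustrative only: generate the 2020 calendar into the temporary table.
  -- pos runs 0..365 (2020 is a leap year), i.e. 2020-01-01 .. 2020-12-31.
  insert overwrite table tmp_dim_date_info
  select
      date_add('2020-01-01', pos),                                               -- date_id
      weekofyear(date_add('2020-01-01', pos)),                                   -- week_id
      date_format(date_add('2020-01-01', pos), 'u'),                             -- week_day, 1=Mon .. 7=Sun
      day(date_add('2020-01-01', pos)),                                          -- day of month
      month(date_add('2020-01-01', pos)),                                        -- month
      quarter(date_add('2020-01-01', pos)),                                      -- quarter
      year(date_add('2020-01-01', pos)),                                         -- year
      if(date_format(date_add('2020-01-01', pos), 'u') in ('6','7'), '0', '1'),  -- is_workday (weekends only)
      null                                                                       -- holiday_id (not handled here)
  from (select posexplode(split(space(365), ' ')) as (pos, x)) t;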

Execute the following statement to load it into the date dimension table:

  1. insert overwrite table dim_date_info select * from tmp_dim_date_info;

Check that the data was loaded successfully.
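
For example (illustrative), the table should contain one row per calendar day of the year that was loaded:

  select count(*) from dim_date_info;                    -- expect 365 or 366 rows for one year
  select min(date_id), max(date_id) from dim_date_info;  -- expect the first and last day of that year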

VI. User Dimension Table (Zipper Table)

1. Zipper table overview

A. What is a zipper table?

A zipper table records the life cycle of each row. When a row's current version expires, a new record is opened whose effective start date is the current date. If a version is still valid today, its effective end date is set to a sentinel maximum value (such as 9999-99-99).


B. Why use a zipper table?

A zipper table suits dimensions whose data does change, but not frequently — that is, slowly changing dimensions.

For example, user information changes, but only a small fraction of users change on any given day. Once the data reaches a certain scale, storing a full snapshot every day is very inefficient.

For example, 100 million users x 365 days, with one full copy of the user data per day, comes to roughly 36.5 billion rows a year — which is why daily full snapshots are inefficient.


C. How to use a zipper table

A full slice of the data as of a given date can be obtained by selecting the rows whose effective start date <= that date and whose effective end date >= that date.

1) Zipper table data


2) For example, to get the historical slice as of 2019-01-01: select * from user_info where start_date <= '2019-01-01' and end_date >= '2019-01-01'


3) For example, to get the historical slice as of 2019-01-02: select * from user_info where start_date <= '2019-01-02' and end_date >= '2019-01-02'


D. How the zipper table is built up over time


2. Building the zipper table

A. Table creation statement

  1. DROP TABLE IF EXISTS dim_user_info;
  2. CREATE EXTERNAL TABLE dim_user_info(
  3. `id` STRING COMMENT '用户id',
  4. `login_name` STRING COMMENT '用户名称',
  5. `nick_name` STRING COMMENT '用户昵称',
  6. `name` STRING COMMENT '用户姓名',
  7. `phone_num` STRING COMMENT '手机号码',
  8. `email` STRING COMMENT '邮箱',
  9. `user_level` STRING COMMENT '用户等级',
  10. `birthday` STRING COMMENT '生日',
  11. `gender` STRING COMMENT '性别',
  12. `create_time` STRING COMMENT '创建时间',
  13. `operate_time` STRING COMMENT '操作时间',
  14. `start_date` STRING COMMENT '开始日期',
  15. `end_date` STRING COMMENT '结束日期'
  16. ) COMMENT '用户表'
  17. PARTITIONED BY (`dt` STRING)
  18. STORED AS PARQUET
  19. LOCATION '/warehouse/gmall/dim/dim_user_info/'
  20. TBLPROPERTIES ("parquet.compression"="lzo");

B. Partition plan


C. Data loading


3. First-day load

The first-day load of the zipper table is an initialization step: all historical users up to the initialization date are loaded into the zipper table in a single pass. The first partition of ods_user_info, the 2020-06-14 partition, already contains every historical user, so that partition is lightly transformed and written into the 9999-99-99 partition of the zipper table.

  1. insert overwrite table dim_user_info partition(dt='9999-99-99')
  2. select
  3. id,
  4. login_name,
  5. nick_name,
  6. md5(name),
  7. md5(phone_num),
  8. md5(email),
  9. user_level,
  10. birthday,
  11. gender,
  12. create_time,
  13. operate_time,
  14. '2020-06-14',
  15. '9999-99-99'
  16. from ods_user_info
  17. where dt='2020-06-14';
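
An illustrative check of the initialization: the 9999-99-99 partition should hold exactly one row per historical user, i.e. the same count as the initial ODS partition.

  select count(*) from ods_user_info where dt='2020-06-14';
  select count(*) from dim_user_info where dt='9999-99-99';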

4. Daily load

A. Approach

The day's ODS user data is full-outer-joined on user id with the currently open records (the 9999-99-99 partition). For every user, one current record is written back to the 9999-99-99 partition, taking the new values where they exist and keeping the old ones otherwise — the nvl(new_*, old_*) branch of the SQL below. Users that appear in both the old and the new data additionally get a closed-out copy of their old record, whose end_date is set to the day before the business date; dynamic partitioning routes it into that day's partition.

B. SQL

  1. with
  2. tmp as
  3. (
  4. select
  5. old.id old_id,
  6. old.login_name old_login_name,
  7. old.nick_name old_nick_name,
  8. old.name old_name,
  9. old.phone_num old_phone_num,
  10. old.email old_email,
  11. old.user_level old_user_level,
  12. old.birthday old_birthday,
  13. old.gender old_gender,
  14. old.create_time old_create_time,
  15. old.operate_time old_operate_time,
  16. old.start_date old_start_date,
  17. old.end_date old_end_date,
  18. new.id new_id,
  19. new.login_name new_login_name,
  20. new.nick_name new_nick_name,
  21. new.name new_name,
  22. new.phone_num new_phone_num,
  23. new.email new_email,
  24. new.user_level new_user_level,
  25. new.birthday new_birthday,
  26. new.gender new_gender,
  27. new.create_time new_create_time,
  28. new.operate_time new_operate_time,
  29. new.start_date new_start_date,
  30. new.end_date new_end_date
  31. from
  32. (
  33. select
  34. id,
  35. login_name,
  36. nick_name,
  37. name,
  38. phone_num,
  39. email,
  40. user_level,
  41. birthday,
  42. gender,
  43. create_time,
  44. operate_time,
  45. start_date,
  46. end_date
  47. from dim_user_info
  48. where dt='9999-99-99'
  49. )old
  50. full outer join
  51. (
  52. select
  53. id,
  54. login_name,
  55. nick_name,
  56. md5(name) name,
  57. md5(phone_num) phone_num,
  58. md5(email) email,
  59. user_level,
  60. birthday,
  61. gender,
  62. create_time,
  63. operate_time,
  64. '2020-06-15' start_date,
  65. '9999-99-99' end_date
  66. from ods_user_info
  67. where dt='2020-06-15'
  68. )new
  69. on old.id=new.id
  70. )
  71. insert overwrite table dim_user_info partition(dt)
  72. select
  73. nvl(new_id,old_id),
  74. nvl(new_login_name,old_login_name),
  75. nvl(new_nick_name,old_nick_name),
  76. nvl(new_name,old_name),
  77. nvl(new_phone_num,old_phone_num),
  78. nvl(new_email,old_email),
  79. nvl(new_user_level,old_user_level),
  80. nvl(new_birthday,old_birthday),
  81. nvl(new_gender,old_gender),
  82. nvl(new_create_time,old_create_time),
  83. nvl(new_operate_time,old_operate_time),
  84. nvl(new_start_date,old_start_date),
  85. nvl(new_end_date,old_end_date),
  86. nvl(new_end_date,old_end_date) dt
  87. from tmp
  88. union all
  89. select
  90. old_id,
  91. old_login_name,
  92. old_nick_name,
  93. old_name,
  94. old_phone_num,
  95. old_email,
  96. old_user_level,
  97. old_birthday,
  98. old_gender,
  99. old_create_time,
  100. old_operate_time,
  101. old_start_date,
  102. cast(date_add('2020-06-15',-1) as string),
  103. cast(date_add('2020-06-15',-1) as string) dt
  104. from tmp
  105. where new_id is not null and old_id is not null;
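
Two illustrative checks after the daily merge: every user should still have exactly one open record, and users whose data changed on 2020-06-15 should now have a closed-out record in the 2020-06-14 partition.

  -- Should return no rows: at most one open record per user
  select id, count(*) from dim_user_info
  where dt='9999-99-99'
  group by id
  having count(*) > 1;

  -- Closed-out records written by this run (changed users only)
  select count(*) from dim_user_info where dt='2020-06-14';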

VII. DIM Layer First-Day Data Load Script

A. Write the script

Create the script ods_to_dim_db_init.sh in the /home/fancy/bin directory:

  1. [fancy@node101 bin]$ vim ods_to_dim_db_init.sh

Fill in the following content:

  1. #!/bin/bash
  2. APP=gmall
  3. if [ -n "$2" ] ;then
  4. do_date=$2
  5. else
  6. echo "请传入日期参数"
  7. exit
  8. fi
  9. dim_user_info="
  10. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  11. insert overwrite table ${APP}.dim_user_info partition(dt='9999-99-99')
  12. select
  13. id,
  14. login_name,
  15. nick_name,
  16. md5(name),
  17. md5(phone_num),
  18. md5(email),
  19. user_level,
  20. birthday,
  21. gender,
  22. create_time,
  23. operate_time,
  24. '$do_date',
  25. '9999-99-99'
  26. from ${APP}.ods_user_info
  27. where dt='$do_date';
  28. "
  29. dim_sku_info="
  30. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  31. with
  32. sku as
  33. (
  34. select
  35. id,
  36. price,
  37. sku_name,
  38. sku_desc,
  39. weight,
  40. is_sale,
  41. spu_id,
  42. category3_id,
  43. tm_id,
  44. create_time
  45. from ${APP}.ods_sku_info
  46. where dt='$do_date'
  47. ),
  48. spu as
  49. (
  50. select
  51. id,
  52. spu_name
  53. from ${APP}.ods_spu_info
  54. where dt='$do_date'
  55. ),
  56. c3 as
  57. (
  58. select
  59. id,
  60. name,
  61. category2_id
  62. from ${APP}.ods_base_category3
  63. where dt='$do_date'
  64. ),
  65. c2 as
  66. (
  67. select
  68. id,
  69. name,
  70. category1_id
  71. from ${APP}.ods_base_category2
  72. where dt='$do_date'
  73. ),
  74. c1 as
  75. (
  76. select
  77. id,
  78. name
  79. from ${APP}.ods_base_category1
  80. where dt='$do_date'
  81. ),
  82. tm as
  83. (
  84. select
  85. id,
  86. tm_name
  87. from ${APP}.ods_base_trademark
  88. where dt='$do_date'
  89. ),
  90. attr as
  91. (
  92. select
  93. sku_id,
  94. collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
  95. from ${APP}.ods_sku_attr_value
  96. where dt='$do_date'
  97. group by sku_id
  98. ),
  99. sale_attr as
  100. (
  101. select
  102. sku_id,
  103. collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
  104. from ${APP}.ods_sku_sale_attr_value
  105. where dt='$do_date'
  106. group by sku_id
  107. )
  108. insert overwrite table ${APP}.dim_sku_info partition(dt='$do_date')
  109. select
  110. sku.id,
  111. sku.price,
  112. sku.sku_name,
  113. sku.sku_desc,
  114. sku.weight,
  115. sku.is_sale,
  116. sku.spu_id,
  117. spu.spu_name,
  118. sku.category3_id,
  119. c3.name,
  120. c3.category2_id,
  121. c2.name,
  122. c2.category1_id,
  123. c1.name,
  124. sku.tm_id,
  125. tm.tm_name,
  126. attr.attrs,
  127. sale_attr.sale_attrs,
  128. sku.create_time
  129. from sku
  130. left join spu on sku.spu_id=spu.id
  131. left join c3 on sku.category3_id=c3.id
  132. left join c2 on c3.category2_id=c2.id
  133. left join c1 on c2.category1_id=c1.id
  134. left join tm on sku.tm_id=tm.id
  135. left join attr on sku.id=attr.sku_id
  136. left join sale_attr on sku.id=sale_attr.sku_id;
  137. "
  138. dim_base_province="
  139. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  140. insert overwrite table ${APP}.dim_base_province
  141. select
  142. bp.id,
  143. bp.name,
  144. bp.area_code,
  145. bp.iso_code,
  146. bp.iso_3166_2,
  147. bp.region_id,
  148. br.region_name
  149. from ${APP}.ods_base_province bp
  150. join ${APP}.ods_base_region br on bp.region_id = br.id;
  151. "
  152. dim_coupon_info="
  153. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  154. insert overwrite table ${APP}.dim_coupon_info partition(dt='$do_date')
  155. select
  156. id,
  157. coupon_name,
  158. coupon_type,
  159. condition_amount,
  160. condition_num,
  161. activity_id,
  162. benefit_amount,
  163. benefit_discount,
  164. create_time,
  165. range_type,
  166. limit_num,
  167. taken_count,
  168. start_time,
  169. end_time,
  170. operate_time,
  171. expire_time
  172. from ${APP}.ods_coupon_info
  173. where dt='$do_date';
  174. "
  175. dim_activity_rule_info="
  176. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  177. insert overwrite table ${APP}.dim_activity_rule_info partition(dt='$do_date')
  178. select
  179. ar.id,
  180. ar.activity_id,
  181. ai.activity_name,
  182. ar.activity_type,
  183. ai.start_time,
  184. ai.end_time,
  185. ai.create_time,
  186. ar.condition_amount,
  187. ar.condition_num,
  188. ar.benefit_amount,
  189. ar.benefit_discount,
  190. ar.benefit_level
  191. from
  192. (
  193. select
  194. id,
  195. activity_id,
  196. activity_type,
  197. condition_amount,
  198. condition_num,
  199. benefit_amount,
  200. benefit_discount,
  201. benefit_level
  202. from ${APP}.ods_activity_rule
  203. where dt='$do_date'
  204. )ar
  205. left join
  206. (
  207. select
  208. id,
  209. activity_name,
  210. start_time,
  211. end_time,
  212. create_time
  213. from ${APP}.ods_activity_info
  214. where dt='$do_date'
  215. )ai
  216. on ar.activity_id=ai.id;
  217. "
  218. case $1 in
  219. "dim_user_info"){
  220. hive -e "$dim_user_info"
  221. };;
  222. "dim_sku_info"){
  223. hive -e "$dim_sku_info"
  224. };;
  225. "dim_base_province"){
  226. hive -e "$dim_base_province"
  227. };;
  228. "dim_coupon_info"){
  229. hive -e "$dim_coupon_info"
  230. };;
  231. "dim_activity_rule_info"){
  232. hive -e "$dim_activity_rule_info"
  233. };;
  234. "all"){
  235. hive -e "$dim_user_info$dim_sku_info$dim_coupon_info$dim_activity_rule_info$dim_base_province"
  236. };;
  237. esac

Grant execute permission:

  1. [fancy@node101 bin]$ chmod +x ods_to_dim_db_init.sh

B. Using the script

Run the script:

  1. [fancy@node101 bin]$ ods_to_dim_db_init.sh all 2020-06-14

Note: this script does not load the date dimension table; that table must be loaded manually.

Check that the data was loaded successfully.
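
For example (illustrative), list the partitions each DIM table now has. After the 2020-06-14 run, each full-snapshot table should show a dt=2020-06-14 partition, and dim_user_info should show dt=9999-99-99:

  hive (gmall)> show partitions dim_sku_info;
  hive (gmall)> show partitions dim_coupon_info;
  hive (gmall)> show partitions dim_activity_rule_info;
  hive (gmall)> show partitions dim_user_info;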

VIII. DIM Layer Daily Data Load Script

A. Write the script

Create the script ods_to_dim_db.sh in the /home/fancy/bin directory:

  1. [fancy@node101 bin]$ vim ods_to_dim_db.sh

Fill in the following content:

  1. #!/bin/bash
  2. APP=gmall
3. # If a date is supplied as an argument, use it; otherwise default to the day before today
  4. if [ -n "$2" ] ;then
  5. do_date=$2
  6. else
  7. do_date=`date -d "-1 day" +%F`
  8. fi
  9. dim_user_info="
  10. set hive.exec.dynamic.partition.mode=nonstrict;
  11. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  12. with
  13. tmp as
  14. (
  15. select
  16. old.id old_id,
  17. old.login_name old_login_name,
  18. old.nick_name old_nick_name,
  19. old.name old_name,
  20. old.phone_num old_phone_num,
  21. old.email old_email,
  22. old.user_level old_user_level,
  23. old.birthday old_birthday,
  24. old.gender old_gender,
  25. old.create_time old_create_time,
  26. old.operate_time old_operate_time,
  27. old.start_date old_start_date,
  28. old.end_date old_end_date,
  29. new.id new_id,
  30. new.login_name new_login_name,
  31. new.nick_name new_nick_name,
  32. new.name new_name,
  33. new.phone_num new_phone_num,
  34. new.email new_email,
  35. new.user_level new_user_level,
  36. new.birthday new_birthday,
  37. new.gender new_gender,
  38. new.create_time new_create_time,
  39. new.operate_time new_operate_time,
  40. new.start_date new_start_date,
  41. new.end_date new_end_date
  42. from
  43. (
  44. select
  45. id,
  46. login_name,
  47. nick_name,
  48. name,
  49. phone_num,
  50. email,
  51. user_level,
  52. birthday,
  53. gender,
  54. create_time,
  55. operate_time,
  56. start_date,
  57. end_date
  58. from ${APP}.dim_user_info
  59. where dt='9999-99-99'
  60. and start_date<'$do_date'
  61. )old
  62. full outer join
  63. (
  64. select
  65. id,
  66. login_name,
  67. nick_name,
  68. md5(name) name,
  69. md5(phone_num) phone_num,
  70. md5(email) email,
  71. user_level,
  72. birthday,
  73. gender,
  74. create_time,
  75. operate_time,
  76. '$do_date' start_date,
  77. '9999-99-99' end_date
  78. from ${APP}.ods_user_info
  79. where dt='$do_date'
  80. )new
  81. on old.id=new.id
  82. )
  83. insert overwrite table ${APP}.dim_user_info partition(dt)
  84. select
  85. nvl(new_id,old_id),
  86. nvl(new_login_name,old_login_name),
  87. nvl(new_nick_name,old_nick_name),
  88. nvl(new_name,old_name),
  89. nvl(new_phone_num,old_phone_num),
  90. nvl(new_email,old_email),
  91. nvl(new_user_level,old_user_level),
  92. nvl(new_birthday,old_birthday),
  93. nvl(new_gender,old_gender),
  94. nvl(new_create_time,old_create_time),
  95. nvl(new_operate_time,old_operate_time),
  96. nvl(new_start_date,old_start_date),
  97. nvl(new_end_date,old_end_date),
  98. nvl(new_end_date,old_end_date) dt
  99. from tmp
  100. union all
  101. select
  102. old_id,
  103. old_login_name,
  104. old_nick_name,
  105. old_name,
  106. old_phone_num,
  107. old_email,
  108. old_user_level,
  109. old_birthday,
  110. old_gender,
  111. old_create_time,
  112. old_operate_time,
  113. old_start_date,
  114. cast(date_add('$do_date',-1) as string),
  115. cast(date_add('$do_date',-1) as string) dt
  116. from tmp
  117. where new_id is not null and old_id is not null;
  118. "
  119. dim_sku_info="
  120. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  121. with
  122. sku as
  123. (
  124. select
  125. id,
  126. price,
  127. sku_name,
  128. sku_desc,
  129. weight,
  130. is_sale,
  131. spu_id,
  132. category3_id,
  133. tm_id,
  134. create_time
  135. from ${APP}.ods_sku_info
  136. where dt='$do_date'
  137. ),
  138. spu as
  139. (
  140. select
  141. id,
  142. spu_name
  143. from ${APP}.ods_spu_info
  144. where dt='$do_date'
  145. ),
  146. c3 as
  147. (
  148. select
  149. id,
  150. name,
  151. category2_id
  152. from ${APP}.ods_base_category3
  153. where dt='$do_date'
  154. ),
  155. c2 as
  156. (
  157. select
  158. id,
  159. name,
  160. category1_id
  161. from ${APP}.ods_base_category2
  162. where dt='$do_date'
  163. ),
  164. c1 as
  165. (
  166. select
  167. id,
  168. name
  169. from ${APP}.ods_base_category1
  170. where dt='$do_date'
  171. ),
  172. tm as
  173. (
  174. select
  175. id,
  176. tm_name
  177. from ${APP}.ods_base_trademark
  178. where dt='$do_date'
  179. ),
  180. attr as
  181. (
  182. select
  183. sku_id,
  184. collect_set(named_struct('attr_id',attr_id,'value_id',value_id,'attr_name',attr_name,'value_name',value_name)) attrs
  185. from ${APP}.ods_sku_attr_value
  186. where dt='$do_date'
  187. group by sku_id
  188. ),
  189. sale_attr as
  190. (
  191. select
  192. sku_id,
  193. collect_set(named_struct('sale_attr_id',sale_attr_id,'sale_attr_value_id',sale_attr_value_id,'sale_attr_name',sale_attr_name,'sale_attr_value_name',sale_attr_value_name)) sale_attrs
  194. from ${APP}.ods_sku_sale_attr_value
  195. where dt='$do_date'
  196. group by sku_id
  197. )
  198. insert overwrite table ${APP}.dim_sku_info partition(dt='$do_date')
  199. select
  200. sku.id,
  201. sku.price,
  202. sku.sku_name,
  203. sku.sku_desc,
  204. sku.weight,
  205. sku.is_sale,
  206. sku.spu_id,
  207. spu.spu_name,
  208. sku.category3_id,
  209. c3.name,
  210. c3.category2_id,
  211. c2.name,
  212. c2.category1_id,
  213. c1.name,
  214. sku.tm_id,
  215. tm.tm_name,
  216. attr.attrs,
  217. sale_attr.sale_attrs,
  218. sku.create_time
  219. from sku
  220. left join spu on sku.spu_id=spu.id
  221. left join c3 on sku.category3_id=c3.id
  222. left join c2 on c3.category2_id=c2.id
  223. left join c1 on c2.category1_id=c1.id
  224. left join tm on sku.tm_id=tm.id
  225. left join attr on sku.id=attr.sku_id
  226. left join sale_attr on sku.id=sale_attr.sku_id;
  227. "
  228. dim_base_province="
  229. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  230. insert overwrite table ${APP}.dim_base_province
  231. select
  232. bp.id,
  233. bp.name,
  234. bp.area_code,
  235. bp.iso_code,
  236. bp.iso_3166_2,
  237. bp.region_id,
238. br.region_name
  239. from ${APP}.ods_base_province bp
  240. join ${APP}.ods_base_region br on bp.region_id = br.id;
  241. "
  242. dim_coupon_info="
  243. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  244. insert overwrite table ${APP}.dim_coupon_info partition(dt='$do_date')
  245. select
  246. id,
  247. coupon_name,
  248. coupon_type,
  249. condition_amount,
  250. condition_num,
  251. activity_id,
  252. benefit_amount,
  253. benefit_discount,
  254. create_time,
  255. range_type,
  256. limit_num,
  257. taken_count,
  258. start_time,
  259. end_time,
  260. operate_time,
  261. expire_time
  262. from ${APP}.ods_coupon_info
  263. where dt='$do_date';
  264. "
  265. dim_activity_rule_info="
  266. set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
  267. insert overwrite table ${APP}.dim_activity_rule_info partition(dt='$do_date')
  268. select
  269. ar.id,
  270. ar.activity_id,
  271. ai.activity_name,
  272. ar.activity_type,
  273. ai.start_time,
  274. ai.end_time,
  275. ai.create_time,
  276. ar.condition_amount,
  277. ar.condition_num,
  278. ar.benefit_amount,
  279. ar.benefit_discount,
  280. ar.benefit_level
  281. from
  282. (
  283. select
  284. id,
  285. activity_id,
  286. activity_type,
  287. condition_amount,
  288. condition_num,
  289. benefit_amount,
  290. benefit_discount,
  291. benefit_level
  292. from ${APP}.ods_activity_rule
  293. where dt='$do_date'
  294. )ar
  295. left join
  296. (
  297. select
  298. id,
  299. activity_name,
  300. start_time,
  301. end_time,
  302. create_time
  303. from ${APP}.ods_activity_info
  304. where dt='$do_date'
  305. )ai
  306. on ar.activity_id=ai.id;
  307. "
  308. case $1 in
  309. "dim_user_info"){
  310. hive -e "$dim_user_info"
  311. };;
  312. "dim_sku_info"){
  313. hive -e "$dim_sku_info"
  314. };;
  315. "dim_base_province"){
  316. hive -e "$dim_base_province"
  317. };;
  318. "dim_coupon_info"){
  319. hive -e "$dim_coupon_info"
  320. };;
  321. "dim_activity_rule_info"){
  322. hive -e "$dim_activity_rule_info"
  323. };;
  324. "all"){
  325. hive -e "$dim_user_info$dim_sku_info$dim_coupon_info$dim_activity_rule_info"
  326. };;
  327. esac

Grant execute permission:

  1. [fancy@node101 bin]$ chmod +x ods_to_dim_db.sh

B. Using the script

Run the script:

1. [fancy@node101 bin]$ ods_to_dim_db.sh all 2020-06-15

Check that the data was loaded successfully.
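
For example (illustrative), after a daily run for 2020-06-15 the full-snapshot tables should gain a dt=2020-06-15 partition, and dim_user_info should show the refreshed dt=9999-99-99 partition plus a dt=2020-06-14 partition holding the records closed out by that run:

  hive (gmall)> show partitions dim_sku_info;
  hive (gmall)> show partitions dim_user_info;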
