Skip to content

电商仓库建设之ADS层

ADS层数据不再需要列式存储和压缩。由于也没有分区,所以导入的时候不能加上overwrite。

1. 设备主题

1.1 活跃设备数(日、周、月)

需求定义:
日活:当日活跃的设备数
周活:当周活跃的设备数
月活:当月活跃的设备数

  1. 建表语句
sql
drop table if exists ads_uv_count;
create external table ads_uv_count( 
    `dt` string COMMENT '统计日期',
    `day_count` bigint COMMENT '当日活跃设备数量',
    `wk_count`  bigint COMMENT '当周活跃设备数量',
    `mn_count`  bigint COMMENT '当月活跃设备数量',
    `is_weekend` string COMMENT 'Y或N表示是否是周末,用于得到本周最终结果',
    `is_monthend` string COMMENT 'Y或N表示是否是月末,用于得到本月最终结果' 
) COMMENT '活跃设备数表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_uv_count/';
  1. 导入数据
sql
insert into table ads_uv_count 
select  
    '2020-03-10' dt,
    daycount.ct,
    wkcount.ct,
    mncount.ct,
    if(date_add(next_day('2020-03-10','MO'),-1)='2020-03-10','Y','N') ,
    if(last_day('2020-03-10')='2020-03-10','Y','N') 
from 
(
    select  
        '2020-03-10' dt,
        count(*) ct
    from dwt_uv_topic
    where login_date_last='2020-03-10'  
)daycount join 
( 
    select  
        '2020-03-10' dt,
        count (*) ct
    from dwt_uv_topic
    where login_date_last>=date_add(next_day('2020-03-10','MO'),-7) 
    and login_date_last<= date_add(next_day('2020-03-10','MO'),-1) 
) wkcount on daycount.dt=wkcount.dt
join 
( 
    select  
        '2020-03-10' dt,
        count (*) ct
    from dwt_uv_topic
    where date_format(login_date_last,'yyyy-MM')=date_format('2020-03-10',
'yyyy-MM')  
)mncount on daycount.dt=mncount.dt;

1.2 每日新增设备

  1. 建表语句
sql
drop table if exists ads_new_mid_count;
create external table ads_new_mid_count
(
`create_date` string comment '创建时间' ,
`new_mid_count` BIGINT comment '新增设备数量'
) COMMENT '每日新增设备信息数量'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_new_mid_count/'
  1. 导入数据
sql
insert into table ads_new_mid_count
select
login_date_first,
count(*)
from dwt_uv_topic
where login_date_first='2020-03-10'
group by login_date_first;

1.3 沉默用户数

需求定义:
沉默用户:只在安装当天启动过,且启动时间是在7天前。

  1. 建表语句
sql
drop table if exists ads_silent_count;
create external table ads_silent_count( 
    `dt` string COMMENT '统计日期',
    `silent_count` bigint COMMENT '沉默设备数'
)COMMENT '沉默设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_silent_count';
  1. 导入数据
sql
insert into table ads_silent_count
select
    '2020-03-15',
    count(*) 
from dwt_uv_topic
where login_date_first=login_date_last
and login_date_last<=date_add('2020-03-15',-7);

1.3 本周回流用户数

需求定义:
本周回流用户:上周未活跃,本周活跃的设备,且不是本周新增设备。

  1. 建表语句
sql
drop table if exists ads_back_count;
create external table ads_back_count(
`dt` string COMMENT '统计日期',
`wk_dt` string COMMENT '统计日期所在周',
`wastage_count` bigint COMMENT '回流设备数'
)
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_back_count';
  1. 导入数据,从本周活跃用户中去掉上周未活跃和本周新设备
sql
insert into table ads_back_count
select
    '2020-03-15',
    count(*)
from
(
    select
        mid_id
    from dwt_uv_topic
    where login_date_last>=date_add(next_day('2020-03-15','MO'),-7) 
    and login_date_last<= date_add(next_day('2020-03-15','MO'),-1)
    and login_date_first<date_add(next_day('2020-03-15','MO'),-7)
)current_wk
left join
(
    select
        mid_id
    from dws_uv_detail_daycount
    where dt>=date_add(next_day('2020-03-15','MO'),-7*2) 
    and dt<= date_add(next_day('2020-03-15','MO'),-7-1) 
    group by mid_id
)last_wk
on current_wk.mid_id=last_wk.mid_id
where last_wk.mid_id is null;

1.4 流失用户数

需求定义:
流失用户:最近7天未登录的设备

  1. 建表脚本
sql
drop table if exists ads_wastage_count;
create external table ads_wastage_count( 
    `dt` string COMMENT '统计日期',
    `wastage_count` bigint COMMENT '流失设备数'
) COMMENT '流失设备数'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_wastage_count';
  1. 导入数据
sql
insert into table ads_wastage_count
select
     '2020-03-20',
     count(*)
from 
(
    select 
        mid_id
    from dwt_uv_topic
    where login_date_last<=date_add('2020-03-20',-7)
    group by mid_id
)t1;

1.5 留存率

留存用户:某段时间内的新增用户(活跃用户),经过一段时间后,又继续使用应用的被认作是留存用户。
留存率:留存用户占当时新增用户(活跃用户)的比例即是留存率。
页面展示效果如图所示:
Alt text

  1. 建表脚本
sql
drop table if exists ads_user_retention_day_rate;
create external table ads_user_retention_day_rate 
(
     `stat_date`          string comment '统计日期',
     `create_date`       string  comment '设备新增日期',
     `retention_day`     int comment '截至当前日期留存天数',
     `retention_count`    bigint comment  '留存数量',
     `new_mid_count`     bigint comment '设备新增数量',
     `retention_ratio`   decimal(10,2) comment '留存率'
)  COMMENT '每日设备留存表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_retention_day_rate/';
  1. 导入数据
    思路是将统计拆分成天,先写出一天的逻辑,然后分析一天的任务能够查询什么,比如2020-03-14这一天可以统计2020-03-10的3日留存和2020-03-11的2日留存、2020-03-12的1日留存。
sql
insert into table ads_user_retention_day_rate
select
    '2020-03-10',--统计日期
    date_add('2020-03-10',-1),--新增日期
    1,--留存天数
    sum(if(login_date_first=date_add('2020-03-10',-1) and login_date_last=
'2020-03-10',1,0)),--2020-03-09的1日留存数
    sum(if(login_date_first=date_add('2020-03-10',-1),1,0)),--2020-03-09新增
    sum(if(login_date_first=date_add('2020-03-10',-1) and login_date_last=
'2020-03-10',1,0))/sum(if(login_date_first=date_add('2020-03-10',-1),1,0))*100
from dwt_uv_topic

union all

select
    '2020-03-10',--统计日期
    date_add('2020-03-10',-2),--新增日期
    2,--留存天数
    sum(if(login_date_first=date_add('2020-03-10',-2) and login_date_last=
'2020-03-10',1,0)),--2020-03-08的2日留存数
    sum(if(login_date_first=date_add('2020-03-10',-2),1,0)),--2020-03-08新增
    sum(if(login_date_first=date_add('2020-03-10',-2) and login_date_last=
'2020-03-10',1,0))/sum(if(login_date_first=date_add('2020-03-10',-2),1,0))*100
from dwt_uv_topic

union all

select
    '2020-03-10',--统计日期
    date_add('2020-03-10',-3),--新增日期
    3,--留存天数
    sum(if(login_date_first=date_add('2020-03-10',-3) and login_date_last=
'2020-03-10',1,0)),--2020-03-07的3日留存数
    sum(if(login_date_first=date_add('2020-03-10',-3),1,0)),--2020-03-07新增
    sum(if(login_date_first=date_add('2020-03-10',-3) and login_date_last=
'2020-03-10',1,0))/sum(if(login_date_first=date_add('2020-03-10',-3),1,0))*100
from dwt_uv_topic;

1.6 最近连续三周活跃用户数

  1. 建表脚本
sql
drop table if exists ads_continuity_wk_count;
create external table ads_continuity_wk_count( 
    `dt` string COMMENT '统计日期,一般用结束周周日日期,如果每天计算一次,可用当天日期',
    `wk_dt` string COMMENT '持续时间',
    `continuity_count` bigint COMMENT '活跃次数'
) 
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_wk_count';
  1. 导入数据
sql
insert into table ads_continuity_wk_count
select
    '2020-03-10',
    concat(date_add(next_day('2020-03-10','MO'),-7*3),'_',date_add(next_day('2020-03-10','MO'),-1)),
    count(*)
from
(
    select
        mid_id
    from
    (
        Select -–查找本周活跃设备
            mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('2020-03-10','monday'),-7)
        and dt<=date_add(next_day('2020-03-10','monday'),-1)
        group by mid_id

        union all

        select --查找上周活跃设备
            mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('2020-03-10','monday'),-7*2)
        and dt<=date_add(next_day('2020-03-10','monday'),-7-1)
        group by mid_id

        union all

        select --查找上上周活跃设备
            mid_id
        from dws_uv_detail_daycount
        where dt>=date_add(next_day('2020-03-10','monday'),-7*3)
        and dt<=date_add(next_day('2020-03-10','monday'),-7*2-1)
        group by mid_id 
    )t1
    group by mid_id  --对三周内的所有活跃设备进行分组
    having count(*)=3-分组后mid_id个数为3的设备为最近连续三周活跃设备
)t2;

1.7 最近7天内连续3天活跃设备数

  1. 建表语句
sql
drop table if exists ads_continuity_uv_count;
create external table ads_continuity_uv_count( 
    `dt` string COMMENT '统计日期',
    `wk_dt` string COMMENT '最近七天日期',
    `continuity_count` bigint
) COMMENT '最近七天内连续三天活跃设备数表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_continuity_uv_count';
  1. 数据加载
    通用办法: 先从DWS层中拿到某个设备数的登录(活跃)日期,在groupby设备ID,使用开窗函数rank()并按照日期排序,得到排名rk,再求差diff=date_sub(登录日期, rk)得到差diff, 结果集再根据设备id和diff分组,使用count(),如果count()>=3过滤得到的就是想要的结果集。
    Alt text
sql
insert into table ads_continuity_uv_count
select
    '2020-03-10',
    concat(date_add('2020-03-10',-6),'_','2020-03-10'),
    count(*)
from
(
    select mid_id
    from
    (
        select mid_id      
        from
        (
            select 
                mid_id,
                date_sub(dt,rank) date_dif --取排序值与日期值之差作为连续标志
            from
            (
                select 
                    mid_id,
                    dt,
-对七天内登录过的设备按照登录日期进行排序
                    rank() over(partition by mid_id order by dt) rank 
                from dws_uv_detail_daycount
                where dt>=date_add('2020-03-10',-6) and dt<='2020-03-10'
            )t1
        )t2 
        group by mid_id,date_dif –-按照连续标志和设备的mid进行分组
        having count(*)>=3 --分组后个数大于3的设备为最近七天内连续三天活跃的设备
    )t3 
    group by mid_id
)t4;

2. 会员主题

2.1 会员主题信息

会员活跃率= 当日活跃会员数/总会员数 会员付费率= 总付费会员数/总会员数 会员新鲜度= 当日活跃会员数/当日活跃会员数

  1. 建表语句
sql
drop table if exists ads_user_topic;
create external table ads_user_topic(
    `dt` string COMMENT '统计日期',
    `day_users` string COMMENT '活跃会员数',
    `day_new_users` string COMMENT '新增会员数',
    `day_new_payment_users` string COMMENT '新增消费会员数',
    `payment_users` string COMMENT '总付费会员数',
    `users` string COMMENT '总会员数',
    `day_users2users` decimal(10,2) COMMENT '会员活跃率',
    `payment_users2users` decimal(10,2) COMMENT '会员付费率',
    `day_new_users2users` decimal(10,2) COMMENT '会员新鲜度'
) COMMENT '会员主题信息表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_topic';
  1. 数据加载
sql
insert into table ads_user_topic
select
    '2020-03-10',
    sum(if(login_date_last='2020-03-10',1,0)),
    sum(if(login_date_first='2020-03-10',1,0)),
    sum(if(payment_date_first='2020-03-10',1,0)),
    sum(if(payment_count>0,1,0)),
    count(*),
    sum(if(login_date_last='2020-03-10',1,0))/count(*),
    sum(if(payment_count>0,1,0))/count(*),
    sum(if(login_date_first='2020-03-10',1,0))/sum(if(login_date_last='2020-03-10',1,0))
from dwt_user_topic;

2.1 用户行为漏斗分析

统计"浏览->购物车->下单->支付"的转化率
思路:统计各个行为的人数,然后计算比值。
Alt text

  1. 建表语句
sql
drop table if exists ads_user_action_convert_day;
create external  table ads_user_action_convert_day(
    `dt` string COMMENT '统计日期',
    `total_visitor_m_count`  bigint COMMENT '总访问人数',
    `cart_u_count` bigint COMMENT '加入购物车的人数',
    `visitor2cart_convert_ratio` decimal(10,2) COMMENT '访问到加入购物车的转化率',
    `order_u_count` bigint     COMMENT '下单人数',
    `cart2order_convert_ratio`  decimal(10,2) COMMENT '加入购物车到下单的转化率',
    `payment_u_count` bigint     COMMENT '支付人数',
    `order2payment_convert_ratio` decimal(10,2) COMMENT '下单到支付的转化率'
 ) COMMENT '用户行为漏斗分析表'
row format delimited  fields terminated by '\t'
location '/warehouse/gmall/ads/ads_user_action_convert_day/';
  1. 数据加载 下面的sql如果只做查询需要强转decimal(10,2), 如果入库可以不写,因为hive会默认自动转换。
sql
insert into table ads_user_action_convert_day
select 
    '2020-03-10',
    uv.day_count,
    ua.cart_count,
    cast(ua.cart_count/uv.day_count as  decimal(10,2)) visitor2cart_convert_ratio,
    ua.order_count,
    cast(ua.order_count/ua.cart_count as  decimal(10,2)) visitor2order_convert_
ratio,
    ua.payment_count,
    cast(ua.payment_count/ua.order_count as  decimal(10,2)) order2payment_
convert_ratio
from  
(
    select 
        dt,
        sum(if(cart_count>0,1,0)) cart_count,
        sum(if(order_count>0,1,0)) order_count,
        sum(if(payment_count>0,1,0)) payment_count
    from dws_user_action_daycount
where dt='2020-03-10'
group by dt
)ua join ads_uv_count uv on uv.dt=ua.dt;

3. 商品主题

计算TOPN类型首先要确定度量值,是个数、次数、金额进行计算,是分组排名还是全局排名。

3.1 商品个数信息

  1. 建表语句
sql
drop table if exists ads_product_info;
create external table ads_product_info(
`dt` string COMMENT '统计日期',
`sku_num` string COMMENT 'sku个数',
`spu_num` string COMMENT 'spu个数'
) COMMENT '商品个数信息'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_info';
  1. 数据加载
sql
insert into table ads_product_info
select
    '2020-03-10' dt,
    sku_num,
    spu_num
from
(
    select
        '2020-03-10' dt,
        count(*) sku_num
    from
        dwt_sku_topic
) tmp_sku_num
join
(
    select
        '2020-03-10' dt,
        count(*) spu_num
    from
    (
        select
            spu_id
        from
            dwt_sku_topic
        group by
            spu_id
    ) tmp_spu_id
) tmp_spu_num
on tmp_sku_num.dt=tmp_spu_num.dt;

3.2 商品销量排名

  1. 建表语句
sql
drop table if exists ads_product_sale_topN;
create external table ads_product_sale_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品id',
    `payment_amount` bigint COMMENT '销量'
) COMMENT '商品销量排名表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_sale_topN';
  1. 数据加载
sql
insert into table ads_product_sale_topN
select
    '2020-03-10' dt,
    sku_id,
    payment_amount
from
    dws_sku_action_daycount
where
    dt='2020-03-10'
order by payment_amount desc
limit 10;

3.3 商品收藏排名

  1. 建表语句
sql
drop table if exists ads_product_favor_topN;
create external table ads_product_favor_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品id',
    `favor_count` bigint COMMENT '收藏量'
) COMMENT '商品收藏排名表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_favor_topN';
  1. 数据加载
sql
insert into table ads_product_favor_topN
select
    '2020-03-10' dt,
    sku_id,
    favor_count
from
    dws_sku_action_daycount
where
    dt='2020-03-10'
order by favor_count desc
limit 10;

3.4 商品加入购物车排名

  1. 建表语句
sql
drop table if exists ads_product_cart_topN;
create external table ads_product_cart_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品id',
    `cart_num` bigint COMMENT '加入购物车数量'
) COMMENT '商品加入购物车排名表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_cart_topN';
  1. 数据加载
sql
insert into table ads_product_cart_topN
select
    '2020-03-10' dt,
    sku_id,
    cart_num
from
    dws_sku_action_daycount
where
    dt='2020-03-10'
order by cart_num desc
limit 10;

3.5 商品退款率排名(最近30天)

  1. 建表语句
sql
drop table if exists ads_product_refund_topN;
create external table ads_product_refund_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品id',
    `refund_ratio` decimal(10,2) COMMENT '退款率'
) COMMENT '商品退款率排名表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_product_refund_topN';
  1. 数据加载
sql
insert into table ads_product_refund_topN
select
    '2020-03-10',
    sku_id,
    refund_last_30d_count/payment_last_30d_count*100 refund_ratio
from dwt_sku_topic
order by refund_ratio desc
limit 10;

3.6 商品差评率

  1. 建表语句
sql
drop table if exists ads_appraise_bad_topN;
create external table ads_appraise_bad_topN(
    `dt` string COMMENT '统计日期',
    `sku_id` string COMMENT '商品id',
    `appraise_bad_ratio` decimal(10,2) COMMENT '差评率'
) COMMENT '商品差评率排名表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_appraise_bad_topN';
  1. 数据加载
sql
insert into table ads_appraise_bad_topN
select
    '2020-03-10' dt,
    sku_id,
appraise_bad_count/(appraise_good_count+appraise_mid_count+appraise_bad_count+appraise_default_count) appraise_bad_ratio
from
    dws_sku_action_daycount
where
    dt='2020-03-10'
order by appraise_bad_ratio desc
limit 10;

4. 营销主题(用户+商品+购买行为)

在电商领域,GMV是Gross Merchandise Volume的缩写,中文可以翻译为"商品交易总额", 计算下单金额一般不使用支付金额算,而是使用订单金额算。

4.1 下单数目统计

  1. 建表语句
sql
drop table if exists ads_order_daycount;
create external table ads_order_daycount(
    dt string comment '统计日期',
    order_count bigint comment '每日下单笔数',
    order_amount bigint comment '每日下单金额',
    order_users bigint comment '每日下单用户数'
) comment '每日订单总计表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_order_daycount';
  1. 数据加载
sql
insert into table ads_order_daycount
select
    '2020-03-10',
    sum(order_count),
    sum(order_amount),
    sum(if(order_count>0,1,0))
from dws_user_action_daycount
where dt='2020-03-10';

4.2 支付信息统计

  1. 建表语句
sql
drop table if exists ads_payment_daycount;
create external table ads_payment_daycount(
    dt string comment '统计日期',
    payment_count bigint comment '单日支付笔数',
    payment_amount bigint comment '单日支付金额',
    payment_user_count bigint comment '单日支付人数',
    payment_sku_count bigint comment '单日支付商品数',
    payment_avg_time double comment '下单到支付的平均时长,取分钟数'
) comment '每日支付总计表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_payment_daycount';
  1. 数据加载
sql
insert into table ads_payment_daycount
select
    tmp_payment.dt,
    tmp_payment.payment_count,
    tmp_payment.payment_amount,
    tmp_payment.payment_user_count,
    tmp_skucount.payment_sku_count,
    tmp_time.payment_avg_time
from
(
    select
        '2020-03-10' dt,
        sum(payment_count) payment_count,
        sum(payment_amount) payment_amount,
        sum(if(payment_count>0,1,0)) payment_user_count
    from dws_user_action_daycount
    where dt='2020-03-10'
)tmp_payment
join
(
    select
        '2020-03-10' dt,
        sum(if(payment_count>0,1,0)) payment_sku_count 
    from dws_sku_action_daycount
    where dt='2020-03-10'
)tmp_skucount on tmp_payment.dt=tmp_skucount.dt
join
(
    select
        '2020-03-10' dt,
        sum(unix_timestamp(payment_time)-unix_timestamp(create_time))/count(*)/60 payment_avg_time
    from dwd_fact_order_info
    where dt='2020-03-10'
    and payment_time is not null
)tmp_time on tmp_payment.dt=tmp_time.dt;

4.3 复购率

  1. 建表语句
sql
drop table ads_sale_tm_category1_stat_mn;
create external table ads_sale_tm_category1_stat_mn
(  
    tm_id string comment '品牌id',
    category1_id string comment '一级品类id ',
    category1_name string comment '一级品类名称 ',
    buycount   bigint comment  '购买人数',
    buy_twice_last bigint  comment '两次以上购买人数',
    buy_twice_last_ratio decimal(10,2)  comment  '单次复购率',
    buy_3times_last   bigint comment   '三次以上购买人数',
    buy_3times_last_ratio decimal(10,2)  comment  '多次复购率',
    stat_mn string comment '统计月份',
    stat_date string comment '统计日期' 
)   COMMENT '复购率统计表'
row format delimited fields terminated by '\t'
location '/warehouse/gmall/ads/ads_sale_tm_category1_stat_mn/';
  1. 数据加载
sql
with 
tmp_order as
(
    select
        user_id,
        order_stats_struct.sku_id sku_id,
        order_stats_struct.order_count order_count
    from dws_user_action_daycount lateral view explode(order_stats) tmp as order_stats_struct
    where date_format(dt,'yyyy-MM')=date_format('2020-03-10','yyyy-MM')
),
tmp_sku as
(
    select
        id,
        tm_id,
        category1_id,
        category1_name
    from dwd_dim_sku_info
    where dt='2020-03-10'
)
insert into table ads_sale_tm_category1_stat_mn
select
    tm_id,
    category1_id,
    category1_name,
    sum(if(order_count>=1,1,0)) buycount,
    sum(if(order_count>=2,1,0)) buyTwiceLast,
    sum(if(order_count>=2,1,0))/sum( if(order_count>=1,1,0)) buyTwiceLastRatio,
    sum(if(order_count>=3,1,0))  buy3timeLast  ,
    sum(if(order_count>=3,1,0))/sum( if(order_count>=1,1,0)) buy3timeLastRatio ,
    date_format('2020-03-10' ,'yyyy-MM') stat_mn,
    '2020-03-10' stat_date
from
(
    select 
        tmp_order.user_id,
        tmp_sku.category1_id,
        tmp_sku.category1_name,
        tmp_sku.tm_id,
        sum(order_count) order_count
    from tmp_order
    join tmp_sku
    on tmp_order.sku_id=tmp_sku.id
    group by tmp_order.user_id,tmp_sku.category1_id,tmp_sku.category1_name,tmp_
sku.tm_id
)tmp
group by tm_id, category1_id, category1_name;

5. ADS层导入脚本

  1. 在/home/jack/bin目录下创建脚本 直接下载查看

6. 编写SQL总结

  1. 先找到目标表
  2. 分析一下,需要哪些表能满足目标表的所有字段
    即准备所有的输入表
  3. 写逻辑
  • 1 insert overwrite table 目标表名称
    观察目标表是否需要分区
  • 2 固定格式,如果需要多表join, 先把整理大框写出,再具体写一个一个子查询
sql
select              
                          
from               
(              
              
) b1               
join               
(              
)b2 on
  • 遇到统计什么次数用count
  • 遇到统计什么金额用sum
  • 如果是累积表,获取旧表数据,再获取新表数据
  • 遇到统计累计值,旧的 + 新的
  • 累计表中获取首次时间
    旧的时候为null, 取当前时间,否则取旧的
  • 累计表中获取末次时间(最近时间)
    新的id不为空,取当前时间否则取旧的
  • 天数和次数的转换。
    if(new.login_count>0, 1,0)
  • 使用group by时要注意:
    查询的字段有三种: 1. 分组里面有; 2. 常量; 3. 聚合函数
  • 累计30天指标
    在新数据表new中进行累加 where最近30天
  • 如果涉及的表比较多可以采用with tmp的方法