forked from dbt-labs/segment
-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathsegment_web_page_views__sessionized.sql
136 lines (95 loc) · 3.35 KB
/
segment_web_page_views__sessionized.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
{{ config(
materialized = 'incremental',
unique_key = 'page_view_id',
sort = 'tstamp',
partition_by = {'field': 'tstamp', 'data_type': 'timestamp', 'granularity': var('segment_bigquery_partition_granularity')},
dist = 'page_view_id',
cluster_by = 'page_view_id'
)}}
{#
the initial CTE in this model is unusually complicated; its function is to
select all pageviews (for all time) for users who have pageviews since the
model was most recently run. there are many window functions in this model so
in order to appropriately calculate all of them we need each users entire
page view history, but we only want to grab that for users who have page view
events we need to calculate.
#}
with pageviews as (
select * from {{ref('segment_web_page_views')}}
{% if is_incremental() %}
where anonymous_id in (
select distinct anonymous_id
from {{ref('segment_web_page_views')}}
{{
generate_sessionization_incremental_filter( this, 'tstamp', 'tstamp', '>' )
}}
)
{% endif %}
),
numbered as (
--This CTE is responsible for assigning an all-time page view number for a
--given anonymous_id. We don't need to do this across devices because the
--whole point of this field is for sessionization, and sessions can't span
--multiple devices.
select
*,
row_number() over (
partition by source_name, anonymous_id
order by tstamp
) as page_view_number
from pageviews
),
lagged as (
--This CTE is responsible for simply grabbing the last value of `tstamp`.
--We'll use this downstream to do timestamp math--it's how we determine the
--period of inactivity.
select
*,
lag(tstamp) over (
partition by source_name, anonymous_id
order by page_view_number
) as previous_tstamp
from numbered
),
diffed as (
--This CTE simply calculates `period_of_inactivity`.
select
*,
{{ dbt.datediff('previous_tstamp', 'tstamp', 'second') }} as period_of_inactivity
from lagged
),
new_sessions as (
--This CTE calculates a single 1/0 field--if the period of inactivity prior
--to this page view was greater than 30 minutes, the value is 1, otherwise
--it's 0. We'll use this to calculate the user's session #.
select
*,
case
when period_of_inactivity <= {{var('segment_inactivity_cutoff')}} then 0
else 1
end as new_session
from diffed
),
session_numbers as (
--This CTE calculates a user's session (1, 2, 3) number from `new_session`.
--This single field is the entire point of the entire prior series of
--calculations.
select
*,
sum(new_session) over (
partition by source_name, anonymous_id
order by page_view_number
rows between unbounded preceding and current row
) as session_number
from new_sessions
),
session_ids as (
--This CTE assigns a globally unique session id based on the combination of
--`anonymous_id` and `session_number`.
select
{{dbt_utils.star(ref('segment_web_page_views'))}},
page_view_number,
{{dbt_utils.generate_surrogate_key(['anonymous_id', 'session_number', 'source_name'])}} as session_id
from session_numbers
)
select * from session_ids