Files
cdp/data-layer/infra/clickhouse/retention_cohort.sql.tmpl
2026-05-25 13:38:20 +07:00

44 lines
1.8 KiB
Cheetah

-- Retention Cohort -- of users whose first `initial_event` lands on day D,
-- what share triggered `return_event` on day D+k for k in 1..Periods.
--
-- We compute the cohort day in a CTE first, then LEFT JOIN events_track and
-- count distinct returners per (cohort_day, day_offset). Doing it this way
-- avoids ClickHouse's "aggregate inside another aggregate" restriction that
-- the older retention()-based form ran into.
--
-- Required parameters (clickhouse.Named, string-valued):
-- workspace_id : String
-- from : DateTime64(3,'UTC') (formatted as 'YYYY-MM-DD HH:MM:SS.mmm')
-- to : DateTime64(3,'UTC')
-- initial_event : String
-- return_event : String
--
-- Template inputs:
-- .Outer : []{ OffsetDay int; Last bool }
WITH cohorts AS (
SELECT
user_id,
toDate(min(timestamp)) AS cohort_day
FROM events_track
WHERE workspace_id = {workspace_id:String}
AND received_at >= {from:DateTime64(3,'UTC')}
AND received_at < {to:DateTime64(3,'UTC')}
AND user_id != ''
AND event = {initial_event:String}
GROUP BY user_id
)
SELECT
c.cohort_day AS cohort_day,
uniqExact(c.user_id) AS cohort_size,
{{- range $p := .Outer }}
uniqExactIf(c.user_id, e.event = {return_event:String} AND toDate(e.timestamp) = addDays(c.cohort_day, {{ $p.OffsetDay }})) AS retained_d{{ $p.OffsetDay }}{{ if not $p.Last }},{{ end }}
{{- end }}
FROM cohorts AS c
LEFT JOIN events_track AS e
ON e.workspace_id = {workspace_id:String}
AND e.user_id = c.user_id
AND e.received_at >= {from:DateTime64(3,'UTC')}
AND e.received_at < {to:DateTime64(3,'UTC')}
GROUP BY c.cohort_day
ORDER BY c.cohort_day