data layer
This commit is contained in:
0
data-layer/infra/clickhouse/.gitkeep
Normal file
0
data-layer/infra/clickhouse/.gitkeep
Normal file
60
data-layer/infra/clickhouse/event_explorer.sql.tmpl
Normal file
60
data-layer/infra/clickhouse/event_explorer.sql.tmpl
Normal file
@@ -0,0 +1,60 @@
|
||||
-- Event Explorer -- filter raw events for one workspace inside a time range.
|
||||
--
|
||||
-- Required parameters (clickhouse.Named):
|
||||
-- workspace_id : String
|
||||
-- from : DateTime64(3,'UTC')
|
||||
-- to : DateTime64(3,'UTC')
|
||||
-- limit : UInt32
|
||||
-- offset : UInt32
|
||||
--
|
||||
-- Optional parameters (controlled by template flags):
|
||||
-- user_id : String (when .HasUserID)
|
||||
-- anonymous_id : String (when .HasAnonymousID)
|
||||
-- event : String (when .HasEventName, events_track only)
|
||||
--
|
||||
-- Template inputs:
|
||||
-- .Table : whitelisted enum (events_track | events_identify | events_page | events_group)
|
||||
-- .HasUserID : bool
|
||||
-- .HasAnonymousID : bool
|
||||
-- .HasEventName : bool
|
||||
-- Event Explorer query body.
-- Projects the columns common to every events_* table plus the table-specific
-- ones, filters by workspace and a half-open received_at range, then pages
-- through the result newest-first.
SELECT
    workspace_id,
    source_id,
    message_id,
    anonymous_id,
    user_id,
{{- if eq .Table "events_track" }}
    event,
{{- end }}
{{- if eq .Table "events_page" }}
    name,
    category,
    path,
    url,
{{- end }}
{{- if eq .Table "events_group" }}
    group_id,
{{- end }}
    timestamp,
    received_at,
{{- if or (eq .Table "events_identify") (eq .Table "events_group") }}
    traits
{{- else }}
    properties
{{- end }}
FROM {{ .Table }}
WHERE workspace_id = {workspace_id:String}
  AND received_at >= {from:DateTime64(3,'UTC')}
  AND received_at <  {to:DateTime64(3,'UTC')}
{{- if .HasUserID }}
  AND user_id = {user_id:String}
{{- end }}
{{- if .HasAnonymousID }}
  AND anonymous_id = {anonymous_id:String}
{{- end }}
-- The event column is only projected for events_track, so the name filter is
-- emitted for that table alone; for other tables the flag is ignored instead
-- of producing a query that references a column this template never selects.
{{- if and .HasEventName (eq .Table "events_track") }}
  AND event = {event:String}
{{- end }}
-- message_id breaks ties between rows sharing a received_at value so that
-- LIMIT / OFFSET pages do not overlap or skip rows.
ORDER BY received_at DESC, message_id DESC
LIMIT {limit:UInt32}
OFFSET {offset:UInt32}
|
||||
35
data-layer/infra/clickhouse/funnel_analysis.sql.tmpl
Normal file
35
data-layer/infra/clickhouse/funnel_analysis.sql.tmpl
Normal file
@@ -0,0 +1,35 @@
|
||||
-- Funnel Analysis -- count users reaching each step in order within window.
|
||||
--
|
||||
-- Required parameters (clickhouse.Named):
|
||||
-- workspace_id : String
|
||||
-- from : DateTime64(3,'UTC')
|
||||
-- to : DateTime64(3,'UTC')
|
||||
-- window_seconds : UInt32
|
||||
-- step{i} : String for i in 0..N-1
|
||||
--
|
||||
-- Template inputs:
|
||||
-- .Steps : []struct{ Index int; Last bool }
|
||||
-- .StepCount : int
|
||||
-- Funnel Analysis query body.
-- Inner query: one row per user with `level`, the deepest funnel step reached
-- as reported by windowFunnel (0 when the first step was never matched).
-- Outer query: one row per 1-based step with the number of users at or beyond
-- it and the share relative to users who reached step 1.
SELECT
    step,
    countIf(level >= step) AS reached,
    -- Step 1 is the baseline and always reports 1.0. For later steps the
    -- denominator is guarded: with zero entrants the plain division would be
    -- 0 / 0, which is NaN in Float64 and tends to break JSON encoders.
    multiIf(
        step = 1, 1.0,
        countIf(level >= 1) = 0, 0.0,
        countIf(level >= step) / countIf(level >= 1)
    ) AS conversion_rate
FROM (
    SELECT
        user_id,
        windowFunnel({window_seconds:UInt32})(
            timestamp,
{{- range $i, $s := .Steps }}
            event = {step{{ $s.Index }}:String}{{ if not $s.Last }},{{ end }}
{{- end }}
        ) AS level
    FROM events_track
    WHERE workspace_id = {workspace_id:String}
      AND received_at >= {from:DateTime64(3,'UTC')}
      AND received_at <  {to:DateTime64(3,'UTC')}
      AND user_id != ''
    GROUP BY user_id
) AS f
-- Fan every user row out to steps 1..StepCount so each step can be counted.
ARRAY JOIN range(1, toUInt32({{ .StepCount }}) + 1) AS step
GROUP BY step
ORDER BY step
|
||||
57
data-layer/infra/clickhouse/profile_timeline.sql.tmpl
Normal file
57
data-layer/infra/clickhouse/profile_timeline.sql.tmpl
Normal file
@@ -0,0 +1,57 @@
|
||||
-- Profile timeline -- merged event stream for one user_id within a workspace.
|
||||
--
|
||||
-- Required parameters (clickhouse.Named):
|
||||
-- workspace_id : String
|
||||
-- user_id : String
|
||||
-- limit : UInt32
|
||||
-- offset : UInt32
|
||||
-- Profile timeline query body.
-- Each branch of the UNION ALL maps one events_* table onto the same five
-- columns (kind, message_id, name, received_at, payload); identify and group
-- rows have no name and report an empty string.
SELECT
    kind,
    message_id,
    name,
    received_at,
    payload
FROM (
    SELECT
        'track'    AS kind,
        message_id,
        event      AS name,
        received_at,
        properties AS payload
    FROM events_track
    WHERE workspace_id = {workspace_id:String}
      AND user_id = {user_id:String}

    UNION ALL

    SELECT
        'identify' AS kind,
        message_id,
        ''         AS name,
        received_at,
        traits     AS payload
    FROM events_identify
    WHERE workspace_id = {workspace_id:String}
      AND user_id = {user_id:String}

    UNION ALL

    SELECT
        'page'     AS kind,
        message_id,
        name,
        received_at,
        properties AS payload
    FROM events_page
    WHERE workspace_id = {workspace_id:String}
      AND user_id = {user_id:String}

    UNION ALL

    SELECT
        'group'    AS kind,
        message_id,
        ''         AS name,
        received_at,
        traits     AS payload
    FROM events_group
    WHERE workspace_id = {workspace_id:String}
      AND user_id = {user_id:String}
) AS timeline
-- Newest first; kind and message_id break ties on received_at so that
-- LIMIT / OFFSET pages stay stable between requests.
ORDER BY received_at DESC, kind ASC, message_id DESC
LIMIT {limit:UInt32}
OFFSET {offset:UInt32}
|
||||
41
data-layer/infra/clickhouse/retention_cohort.sql.tmpl
Normal file
41
data-layer/infra/clickhouse/retention_cohort.sql.tmpl
Normal file
@@ -0,0 +1,41 @@
|
||||
-- Retention Cohort -- of users whose first `initial_event` lands on day D,
|
||||
-- how many triggered `return_event` on day D+k, for each follow-up day k listed in .Outer.
|
||||
--
|
||||
-- Required parameters (clickhouse.Named):
|
||||
-- workspace_id : String
|
||||
-- from : DateTime64(3,'UTC')
|
||||
-- to : DateTime64(3,'UTC')
|
||||
-- initial_event : String
|
||||
-- return_event : String
|
||||
--
|
||||
-- Template inputs:
|
||||
-- .Outer : []{ RIndex int; OffsetDay int; Last bool }
|
||||
-- One entry per follow-up day. RIndex is the position in the retention()
|
||||
-- output array; OffsetDay is the day delta from the cohort day.
|
||||
-- Retention Cohort query body.
--
-- Structure:
--   c        : per user, the calendar day of the first initial_event inside
--              the requested window (users without one are absent, which
--              replaces the former HAVING cohort_day IS NOT NULL).
--   per_user : per user, the retention() flag array evaluated against that
--              user's cohort day.
--   outer    : per cohort day, how many users have each flag set.
--
-- The cohort day is computed in its own subquery because an alias that
-- expands to min(...) cannot be referenced inside the arguments of
-- retention(...): ClickHouse rejects an aggregate nested in another aggregate.
--
-- List separators are written as leading commas inside each range body, so an
-- empty .Outer renders valid SQL and the Last field is not needed here.
SELECT
    cohort_day,
    countIf(arrayElement(r, 1)) AS cohort_size
{{- range $p := .Outer }},
    countIf(arrayElement(r, {{ $p.RIndex }})) AS retained_d{{ $p.OffsetDay }}
{{- end }}
FROM (
    SELECT
        e.user_id    AS user_id,
        c.cohort_day AS cohort_day,
        retention(
            e.event = {initial_event:String} AND toDate(e.timestamp) = c.cohort_day
{{- range $p := .Outer }},
            e.event = {return_event:String} AND toDate(e.timestamp) = addDays(c.cohort_day, {{ $p.OffsetDay }})
{{- end }}
        ) AS r
    FROM events_track AS e
    -- Inner join: only users with at least one initial_event get a cohort.
    INNER JOIN (
        SELECT
            user_id,
            toDate(min(timestamp)) AS cohort_day
        FROM events_track
        WHERE workspace_id = {workspace_id:String}
          AND received_at >= {from:DateTime64(3,'UTC')}
          AND received_at <  {to:DateTime64(3,'UTC')}
          AND user_id != ''
          AND event = {initial_event:String}
        GROUP BY user_id
    ) AS c ON c.user_id = e.user_id
    WHERE e.workspace_id = {workspace_id:String}
      AND e.received_at >= {from:DateTime64(3,'UTC')}
      AND e.received_at <  {to:DateTime64(3,'UTC')}
      AND e.user_id != ''
      AND e.event IN ({initial_event:String}, {return_event:String})
    GROUP BY e.user_id, c.cohort_day
) AS per_user
GROUP BY cohort_day
ORDER BY cohort_day
|
||||
52
data-layer/infra/clickhouse/session_analysis.sql.tmpl
Normal file
52
data-layer/infra/clickhouse/session_analysis.sql.tmpl
Normal file
@@ -0,0 +1,52 @@
|
||||
-- Session Analysis -- split each user's event stream into sessions based on
|
||||
-- inactivity gap, then aggregate per session.
|
||||
--
|
||||
-- Required parameters (clickhouse.Named):
|
||||
-- workspace_id : String
|
||||
-- from : DateTime64(3,'UTC')
|
||||
-- to : DateTime64(3,'UTC')
|
||||
-- timeout_seconds : UInt32
|
||||
-- limit : UInt32
|
||||
-- offset : UInt32
|
||||
--
|
||||
-- Optional parameters (template-driven):
|
||||
-- user_id : String (when .HasUserID)
|
||||
-- Session Analysis query body, read from the innermost subquery outwards:
--   1. flagged  : marks an event with is_new_session = 1 when the gap to the
--                 same user's previous event exceeds timeout_seconds.
--   2. numbered : running sum of those flags per user, giving every event the
--                 index of the session it belongs to.
--   3. outer    : one row per (user, session) with start, end, event count
--                 and duration in seconds.
SELECT
    user_id,
    session_index AS session_id,
    min(timestamp) AS started_at,
    max(timestamp) AS ended_at,
    count() AS events,
    dateDiff('second', min(timestamp), max(timestamp)) AS duration_seconds
FROM (
    SELECT
        user_id,
        timestamp,
        sum(is_new_session) OVER (PARTITION BY user_id ORDER BY timestamp) AS session_index
    FROM (
        SELECT
            user_id,
            timestamp,
            -- NOTE(review): for a user's first event there is no previous row,
            -- so lagInFrame yields its default value. With a non-Nullable
            -- timestamp that is presumably the type default, which makes the
            -- gap exceed any realistic timeout and opens session 1 -- confirm.
            if(
                dateDiff(
                    'second',
                    lagInFrame(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp),
                    timestamp
                ) > {timeout_seconds:UInt32},
                1,
                0
            ) AS is_new_session
        FROM events_track
        WHERE workspace_id = {workspace_id:String}
          AND received_at >= {from:DateTime64(3,'UTC')}
          AND received_at <  {to:DateTime64(3,'UTC')}
          AND user_id != ''
{{- if .HasUserID }}
          AND user_id = {user_id:String}
{{- end }}
    ) AS flagged
) AS numbered
GROUP BY user_id, session_index
-- Newest sessions first; user_id and session_index break ties on started_at
-- so that LIMIT / OFFSET pages stay stable between requests.
ORDER BY started_at DESC, user_id ASC, session_index ASC
LIMIT {limit:UInt32}
OFFSET {offset:UInt32}
|
||||
0
data-layer/infra/docker/.gitkeep
Normal file
0
data-layer/infra/docker/.gitkeep
Normal file
5
data-layer/infra/migrations/000001_init.down.sql
Normal file
5
data-layer/infra/migrations/000001_init.down.sql
Normal file
@@ -0,0 +1,5 @@
|
||||
-- Remove every table created by 000001_init.up.sql.
-- IF EXISTS keeps the statement safe to run against a partially applied schema.
-- segment_memberships is listed before segment_definitions, the table its
-- foreign key points at.
DROP TABLE IF EXISTS
    saved_queries,
    segment_memberships,
    segment_definitions,
    profile_traits,
    trait_definitions;
|
||||
95
data-layer/infra/migrations/000001_init.up.sql
Normal file
95
data-layer/infra/migrations/000001_init.up.sql
Normal file
@@ -0,0 +1,95 @@
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- Initial schema for CDP Analytics (data-layer).
|
||||
--
|
||||
-- Tables owned by this service. Read-only access to ingestion-owned tables
|
||||
-- (workspaces, profiles, sources, destinations, schema_fields) is assumed.
|
||||
-- ---------------------------------------------------------------------------
|
||||
|
||||
-- pgcrypto provides gen_random_uuid() on PostgreSQL versions that do not ship
-- the function in core.
CREATE EXTENSION IF NOT EXISTS pgcrypto;

-- ===========================================================================
-- trait_definitions
--   Declarative computed-trait specs, maintained per workspace.
--   A key is unique within its workspace.
-- ===========================================================================
CREATE TABLE trait_definitions (
    id             UUID         NOT NULL DEFAULT gen_random_uuid(),
    workspace_id   UUID         NOT NULL,
    -- Trait identifier. Presumably the value stored in
    -- profile_traits.trait_key -- TODO confirm against the worker.
    key            TEXT         NOT NULL,
    name           TEXT         NOT NULL,
    description    TEXT,
    -- Drives how the worker computes the trait (e.g. an aggregation over
    -- ClickHouse events). The format is still open during prototyping.
    spec           JSONB        NOT NULL,
    refresh_every  INTERVAL     NOT NULL DEFAULT INTERVAL '1 hour',
    enabled        BOOLEAN      NOT NULL DEFAULT true,
    created_at     TIMESTAMPTZ  NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at     TIMESTAMPTZ  NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id),
    UNIQUE (workspace_id, key)
);

-- Partial index: covers only the enabled definitions of a workspace.
CREATE INDEX idx_trait_definitions_workspace
    ON trait_definitions USING btree (workspace_id)
    WHERE enabled;
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- profile_traits -- computed values per profile, refreshed by the worker.
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- One computed value per (workspace, profile, trait key); computed_at records
-- when the row was written and defaults to the insertion time.
CREATE TABLE profile_traits (
    workspace_id  UUID         NOT NULL,
    profile_id    UUID         NOT NULL,
    trait_key     TEXT         NOT NULL,
    trait_value   JSONB        NOT NULL,
    computed_at   TIMESTAMPTZ  NOT NULL DEFAULT CURRENT_TIMESTAMP,
    CONSTRAINT profile_traits_pkey
        PRIMARY KEY (workspace_id, profile_id, trait_key)
);

-- Supports lookups by trait key across all profiles of a workspace; the
-- primary key cannot serve these because profile_id sits between the columns.
CREATE INDEX idx_profile_traits_workspace_key
    ON profile_traits USING btree (workspace_id, trait_key);
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- segment_definitions -- audience segment specs.
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- Audience segment specs. A slug is unique within its workspace;
-- last_refreshed_at is nullable and has no default.
CREATE TABLE segment_definitions (
    id                 UUID         NOT NULL DEFAULT gen_random_uuid(),
    workspace_id       UUID         NOT NULL,
    slug               TEXT         NOT NULL,
    name               TEXT         NOT NULL,
    description        TEXT,
    -- Filter tree evaluated against profiles + events + traits.
    spec               JSONB        NOT NULL,
    refresh_every      INTERVAL     NOT NULL DEFAULT INTERVAL '1 hour',
    enabled            BOOLEAN      NOT NULL DEFAULT true,
    created_at         TIMESTAMPTZ  NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at         TIMESTAMPTZ  NOT NULL DEFAULT CURRENT_TIMESTAMP,
    last_refreshed_at  TIMESTAMPTZ,
    PRIMARY KEY (id),
    UNIQUE (workspace_id, slug)
);

-- Partial index: covers only the enabled segments of a workspace.
CREATE INDEX idx_segment_definitions_workspace
    ON segment_definitions USING btree (workspace_id)
    WHERE enabled;
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- segment_memberships -- history table powering delta Reverse ETL.
|
||||
-- exited_at NULL means the profile is currently a member.
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- Membership history: one row per stay of a profile in a segment.
-- A row with exited_at IS NULL is an open stay (the profile is currently a
-- member); closing the stay sets exited_at. Rows are removed together with
-- their segment definition (ON DELETE CASCADE).
CREATE TABLE segment_memberships (
    segment_id  UUID        NOT NULL REFERENCES segment_definitions (id) ON DELETE CASCADE,
    profile_id  UUID        NOT NULL,
    entered_at  TIMESTAMPTZ NOT NULL DEFAULT now(),
    exited_at   TIMESTAMPTZ
);

-- UNIQUE on the open rows: a profile can have at most one current membership
-- per segment. Without it, two concurrent refreshes could both insert an open
-- row and the enter/exit delta would be reported twice. Closed (historical)
-- rows are outside the predicate and stay unrestricted.
CREATE UNIQUE INDEX idx_segment_memberships_active
    ON segment_memberships (segment_id, profile_id)
    WHERE exited_at IS NULL;

-- Supports listing all memberships, open or closed, of a single profile.
CREATE INDEX idx_segment_memberships_profile
    ON segment_memberships (profile_id);
|
||||
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- saved_queries -- user-saved query specs from the Explore / SQL UI.
|
||||
-- ---------------------------------------------------------------------------
|
||||
-- Query specs saved by users. owner_id is optional; kind is restricted to the
-- values listed in the CHECK constraint.
CREATE TABLE saved_queries (
    id            UUID         NOT NULL DEFAULT gen_random_uuid(),
    workspace_id  UUID         NOT NULL,
    owner_id      UUID,
    name          TEXT         NOT NULL,
    kind          TEXT         NOT NULL
                  CHECK (kind IN ('events', 'sql', 'funnel', 'retention', 'session')),
    spec          JSONB        NOT NULL,
    created_at    TIMESTAMPTZ  NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updated_at    TIMESTAMPTZ  NOT NULL DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (id)
);

-- Supports listing a workspace's saved queries, optionally narrowed by kind.
CREATE INDEX idx_saved_queries_workspace
    ON saved_queries USING btree (workspace_id, kind);
|
||||
67
data-layer/infra/scripts/clickhouse_apply.sh
Executable file
67
data-layer/infra/scripts/clickhouse_apply.sh
Executable file
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env bash
|
||||
# Apply / drop ClickHouse DDL files in alphabetical order.
|
||||
#
|
||||
# Usage:
|
||||
# clickhouse_apply.sh up apply *.up.sql in infra/clickhouse/
|
||||
# clickhouse_apply.sh down apply *.down.sql in REVERSE order
|
||||
#
|
||||
# Env:
|
||||
# CLICKHOUSE_ADDR (default localhost:9000)
|
||||
# CLICKHOUSE_DB (default cdp)
|
||||
# CLICKHOUSE_USER (default default)
|
||||
# CLICKHOUSE_PASSWORD (default empty)
|
||||
|
||||
# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline.
set -euo pipefail

# Directory holding the ClickHouse SQL files: <directory of this script>/../clickhouse
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/clickhouse"

# Connection settings, each overridable through the environment.
ADDR="${CLICKHOUSE_ADDR:-localhost:9000}"
DB="${CLICKHOUSE_DB:-cdp}"
USER="${CLICKHOUSE_USER:-default}"
PASS="${CLICKHOUSE_PASSWORD:-}"

# First positional argument selects the mode; "up" when omitted.
MODE="${1:-up}"

# Split ADDR into host and port at the colon. An address without a colon
# (e.g. "clickhouse") previously made the port equal to the host name; a
# missing or empty part now falls back to the documented default instead.
# NOTE: bracketed IPv6 literals are not handled by this split.
host="${ADDR%%:*}"
if [[ "$ADDR" == *:* ]]; then
  port="${ADDR##*:}"
else
  port=""
fi
host="${host:-localhost}"
port="${port:-9000}"
|
||||
|
||||
# run_sql FILE
# Announce FILE's base name, then execute all statements in FILE through
# clickhouse-client against database $DB. The --password flag is passed only
# when $PASS is non-empty.
run_sql() {
  local sql_path="$1"
  local -a client_args=(--host "$host" --port "$port" --user "$USER")

  if [[ -n "$PASS" ]]; then
    client_args+=(--password "$PASS")
  fi
  client_args+=(--database "$DB" --multiquery --queries-file "$sql_path")

  echo ">>> applying $(basename "$sql_path")"
  clickhouse-client "${client_args[@]}"
}
|
||||
|
||||
# ensure_db
# Create database $DB if it does not exist yet.
#
# $DB comes from the environment and is spliced into the SQL text, so it is
# first checked against a plain-identifier pattern and then backtick-quoted.
# Anything else (spaces, quotes, semicolons, ...) is rejected with exit
# status 1 before clickhouse-client is invoked. The --password flag is passed
# only when $PASS is non-empty.
ensure_db() {
  if [[ ! "$DB" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]]; then
    echo "invalid CLICKHOUSE_DB name: '$DB' (expected letters, digits, underscore)" >&2
    return 1
  fi

  local -a client_args=(--host "$host" --port "$port" --user "$USER")
  if [[ -n "$PASS" ]]; then
    client_args+=(--password "$PASS")
  fi

  clickhouse-client "${client_args[@]}" \
    --query "CREATE DATABASE IF NOT EXISTS \`$DB\`"
}
|
||||
|
||||
# Dispatch on the requested mode.
#
# Files are collected with a glob instead of parsing `ls` output, so paths
# containing spaces stay intact. Bash expands globs in sorted order, which
# keeps the alphabetical apply order. A pattern that matches nothing is left
# as a literal string by bash; the -e test skips that case.
case "$MODE" in
  up)
    ensure_db
    for f in "$DIR"/*.up.sql; do
      [[ -e "$f" ]] || continue
      run_sql "$f"
    done
    ;;
  down)
    # Walk the sorted list backwards so files are applied in reverse order.
    down_files=("$DIR"/*.down.sql)
    for (( i = ${#down_files[@]} - 1; i >= 0; i-- )); do
      [[ -e "${down_files[i]}" ]] || continue
      run_sql "${down_files[i]}"
    done
    ;;
  *)
    # Usage errors go to stderr so they do not mix with normal output.
    echo "usage: $0 {up|down}" >&2
    exit 1
    ;;
esac

echo "done."
|
||||
Reference in New Issue
Block a user