data layer

This commit is contained in:
2026-05-25 08:38:26 +07:00
parent 4e8c11d545
commit a428170fef
81 changed files with 3941 additions and 0 deletions

View File

View File

@@ -0,0 +1,60 @@
-- Event Explorer -- filter raw events for one workspace inside a time range.
--
-- Required parameters (clickhouse.Named):
--   workspace_id : String
--   from         : DateTime64(3,'UTC')   inclusive lower bound on received_at
--   to           : DateTime64(3,'UTC')   exclusive upper bound on received_at
--   limit        : UInt32
--   offset       : UInt32
--
-- Optional parameters (controlled by template flags):
--   user_id      : String (when .HasUserID)
--   anonymous_id : String (when .HasAnonymousID)
--   event        : String (when .HasEventName, events_track only)
--
-- Template inputs:
--   .Table          : whitelisted enum (events_track | events_identify | events_page | events_group)
--   .HasUserID      : bool
--   .HasAnonymousID : bool
--   .HasEventName   : bool
--
-- The projected columns depend on .Table: track adds `event`, page adds
-- name/category/path/url, group adds `group_id`; identify and group expose
-- `traits`, the other tables expose `properties`.
--
-- Rows are returned newest-first. message_id is a secondary sort key so that
-- LIMIT/OFFSET pages stay stable when several events share one received_at
-- (assumes message_id is unique per event -- TODO confirm against ingestion).
SELECT
    workspace_id,
    source_id,
    message_id,
    anonymous_id,
    user_id,
{{- if eq .Table "events_track" }}
    event,
{{- end }}
{{- if eq .Table "events_page" }}
    name,
    category,
    path,
    url,
{{- end }}
{{- if eq .Table "events_group" }}
    group_id,
{{- end }}
    timestamp,
    received_at,
{{- if or (eq .Table "events_identify") (eq .Table "events_group") }}
    traits
{{- else }}
    properties
{{- end }}
FROM {{ .Table }}
WHERE workspace_id = {workspace_id:String}
  AND received_at >= {from:DateTime64(3,'UTC')}
  AND received_at < {to:DateTime64(3,'UTC')}
{{- if .HasUserID }}
  AND user_id = {user_id:String}
{{- end }}
{{- if .HasAnonymousID }}
  AND anonymous_id = {anonymous_id:String}
{{- end }}
{{- if .HasEventName }}
  AND event = {event:String}
{{- end }}
ORDER BY received_at DESC, message_id DESC
LIMIT {limit:UInt32}
OFFSET {offset:UInt32}

View File

@@ -0,0 +1,35 @@
-- Funnel Analysis -- count users reaching each step in order within window.
--
-- Required parameters (clickhouse.Named):
--   workspace_id   : String
--   from           : DateTime64(3,'UTC')   inclusive lower bound on received_at
--   to             : DateTime64(3,'UTC')   exclusive upper bound on received_at
--   window_seconds : UInt32
--   step{i}        : String for i in 0..N-1
--
-- Template inputs:
--   .Steps     : []struct{ Index int; Last bool }
--   .StepCount : int
--
-- Output: one row per step (1..StepCount) with
--   reached         : users whose funnel level is at least that step
--   conversion_rate : reached / users who reached step 1 (step 1 is always 1.0)
SELECT
    step,
    countIf(level >= step) AS reached,
    multiIf(
        step = 1, 1.0,
        -- Nobody entered the funnel: report 0.0 instead of dividing 0 by 0,
        -- which would yield NaN.
        countIf(level >= 1) = 0, 0.0,
        countIf(level >= step) / countIf(level >= 1)
    ) AS conversion_rate
FROM (
    -- Per identified user: the deepest step reached inside the window.
    SELECT
        user_id,
        windowFunnel({window_seconds:UInt32})(
            timestamp,
{{- range $i, $s := .Steps }}
            event = {step{{ $s.Index }}:String}{{ if not $s.Last }},{{ end }}
{{- end }}
        ) AS level
    FROM events_track
    WHERE workspace_id = {workspace_id:String}
      AND received_at >= {from:DateTime64(3,'UTC')}
      AND received_at < {to:DateTime64(3,'UTC')}
      AND user_id != ''
    GROUP BY user_id
) AS f
-- Fan every user row out across the step numbers 1..StepCount.
ARRAY JOIN range(1, toUInt32({{ .StepCount }}) + 1) AS step
GROUP BY step
ORDER BY step

View File

@@ -0,0 +1,57 @@
-- Profile timeline -- merged event stream for one user_id within a workspace.
--
-- Required parameters (clickhouse.Named):
--   workspace_id : String
--   user_id      : String
--   limit        : UInt32
--   offset       : UInt32
--
-- Output columns: kind, message_id, name, received_at, payload.
--   kind    : 'track' | 'identify' | 'page' | 'group' (source table)
--   name    : event name for track, page name for page, '' otherwise
--   payload : properties for track/page, traits for identify/group
--
-- Rows are returned newest-first; message_id and kind break ties so that
-- LIMIT/OFFSET pages are stable when events share one received_at.
SELECT
    kind,
    message_id,
    name,
    received_at,
    payload
FROM (
    SELECT
        'track' AS kind,
        message_id,
        event AS name,
        received_at,
        properties AS payload
    FROM events_track
    WHERE workspace_id = {workspace_id:String}
      AND user_id = {user_id:String}
    UNION ALL
    SELECT
        'identify' AS kind,
        message_id,
        '' AS name,
        received_at,
        traits AS payload
    FROM events_identify
    WHERE workspace_id = {workspace_id:String}
      AND user_id = {user_id:String}
    UNION ALL
    SELECT
        'page' AS kind,
        message_id,
        name,
        received_at,
        properties AS payload
    FROM events_page
    WHERE workspace_id = {workspace_id:String}
      AND user_id = {user_id:String}
    UNION ALL
    SELECT
        'group' AS kind,
        message_id,
        '' AS name,
        received_at,
        traits AS payload
    FROM events_group
    WHERE workspace_id = {workspace_id:String}
      AND user_id = {user_id:String}
)
ORDER BY received_at DESC, message_id DESC, kind
LIMIT {limit:UInt32}
OFFSET {offset:UInt32}

View File

@@ -0,0 +1,41 @@
-- Retention Cohort -- of users whose first `initial_event` lands on day D,
-- what share triggered `return_event` on day D+k for k in 1..Periods.
--
-- Required parameters (clickhouse.Named):
--   workspace_id  : String
--   from          : DateTime64(3,'UTC')   inclusive lower bound on received_at
--   to            : DateTime64(3,'UTC')   exclusive upper bound on received_at
--   initial_event : String
--   return_event  : String
--
-- Template inputs:
--   .Outer : []{ RIndex int; OffsetDay int; Last bool }
--     One entry per follow-up day. OffsetDay is the day delta from the cohort
--     day. RIndex is still accepted but is not referenced by this query.
--
-- Output: one row per cohort_day with cohort_size and one retained_d{k}
-- column per .Outer entry.
--
-- Implementation note: the cohort day is itself an aggregate (first initial
-- event per user), so it cannot be referenced inside another aggregate at the
-- same GROUP BY level. The inner query therefore only collects, per user, the
-- cohort day and the set of days with a return event; the day-offset checks
-- run in the outer query against those per-user values.
SELECT
    cohort_day,
    count() AS cohort_size,
{{- range $p := .Outer }}
    countIf(has(return_days, addDays(cohort_day, {{ $p.OffsetDay }}))) AS retained_d{{ $p.OffsetDay }}{{ if not $p.Last }},{{ end }}
{{- end }}
FROM (
    SELECT
        user_id,
        -- Day of the user's first initial_event inside the range.
        toDate(minIf(timestamp, event = {initial_event:String})) AS cohort_day,
        -- Distinct days on which the user fired return_event.
        groupUniqArrayIf(toDate(timestamp), event = {return_event:String}) AS return_days
    FROM events_track
    WHERE workspace_id = {workspace_id:String}
      AND received_at >= {from:DateTime64(3,'UTC')}
      AND received_at < {to:DateTime64(3,'UTC')}
      AND user_id != ''
      AND event IN ({initial_event:String}, {return_event:String})
    GROUP BY user_id
    -- Only users with at least one initial_event belong to a cohort.
    HAVING countIf(event = {initial_event:String}) > 0
)
GROUP BY cohort_day
ORDER BY cohort_day

View File

@@ -0,0 +1,52 @@
-- Session Analysis -- split each user's event stream into sessions based on
-- inactivity gap, then aggregate per session.
--
-- Required parameters (clickhouse.Named):
--   workspace_id    : String
--   from            : DateTime64(3,'UTC')   inclusive lower bound on received_at
--   to              : DateTime64(3,'UTC')   exclusive upper bound on received_at
--   timeout_seconds : UInt32
--   limit           : UInt32
--   offset          : UInt32
--
-- Optional parameters (template-driven):
--   user_id : String (when .HasUserID)
--
-- Output: one row per (user_id, session) with start/end timestamps, event
-- count and duration in seconds. session_id is a per-user running counter,
-- not a globally unique identifier.
--
-- Rows are returned newest session first; user_id and session_index break
-- ties so that LIMIT/OFFSET pages are stable when sessions share a start time.
SELECT
    user_id,
    session_index AS session_id,
    min(timestamp) AS started_at,
    max(timestamp) AS ended_at,
    count() AS events,
    dateDiff('second', min(timestamp), max(timestamp)) AS duration_seconds
FROM (
    -- Running count of session starts = index of the session each event is in.
    SELECT
        user_id,
        timestamp,
        sum(is_new_session) OVER (PARTITION BY user_id ORDER BY timestamp) AS session_index
    FROM (
        -- Flag an event as a session start when the gap to the user's previous
        -- event exceeds the timeout.
        -- NOTE(review): the first event per user has no predecessor; this
        -- presumably compares against the type's default value and so is also
        -- flagged as a start -- confirm against ClickHouse lagInFrame semantics.
        SELECT
            user_id,
            timestamp,
            if(
                dateDiff(
                    'second',
                    lagInFrame(timestamp) OVER (PARTITION BY user_id ORDER BY timestamp),
                    timestamp
                ) > {timeout_seconds:UInt32},
                1,
                0
            ) AS is_new_session
        FROM events_track
        WHERE workspace_id = {workspace_id:String}
          AND received_at >= {from:DateTime64(3,'UTC')}
          AND received_at < {to:DateTime64(3,'UTC')}
          AND user_id != ''
{{- if .HasUserID }}
          AND user_id = {user_id:String}
{{- end }}
    )
)
GROUP BY user_id, session_index
ORDER BY started_at DESC, user_id, session_index
LIMIT {limit:UInt32}
OFFSET {offset:UInt32}

View File

View File

@@ -0,0 +1,5 @@
-- Down migration: remove every table created by the initial analytics schema.
-- IF EXISTS keeps the script safe to re-run on a partially rolled-back database.
DROP TABLE IF EXISTS saved_queries;
-- segment_memberships goes before segment_definitions because it holds a
-- foreign key to segment_definitions (id).
DROP TABLE IF EXISTS segment_memberships;
DROP TABLE IF EXISTS segment_definitions;
DROP TABLE IF EXISTS profile_traits;
DROP TABLE IF EXISTS trait_definitions;

View File

@@ -0,0 +1,95 @@
-- ---------------------------------------------------------------------------
-- Initial schema for CDP Analytics (data-layer).
--
-- Tables owned by this service. Read-only access to ingestion-owned tables
-- (workspaces, profiles, sources, destinations, schema_fields) is assumed.
-- ---------------------------------------------------------------------------
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
-- ---------------------------------------------------------------------------
-- trait_definitions -- declarative computed-trait specs maintained per workspace.
-- ---------------------------------------------------------------------------
CREATE TABLE trait_definitions (
    id            UUID        NOT NULL DEFAULT gen_random_uuid(),
    workspace_id  UUID        NOT NULL,
    -- `key` identifies the trait inside a workspace (see the uniqueness
    -- constraint below); the original note ties it to profile_traits.
    key           TEXT        NOT NULL,
    name          TEXT        NOT NULL,
    description   TEXT,
    -- JSON spec telling the worker how to compute the trait (for example an
    -- aggregation over ClickHouse events). The format is still open while
    -- prototyping.
    spec          JSONB       NOT NULL,
    refresh_every INTERVAL    NOT NULL DEFAULT INTERVAL '1 hour',
    enabled       BOOLEAN     NOT NULL DEFAULT TRUE,
    created_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (id),
    UNIQUE (workspace_id, key)
);

-- Partial index: only enabled definitions, looked up by workspace.
CREATE INDEX idx_trait_definitions_workspace
    ON trait_definitions (workspace_id)
    WHERE enabled;
-- ---------------------------------------------------------------------------
-- profile_traits -- computed values per profile, refreshed by the worker.
-- ---------------------------------------------------------------------------
CREATE TABLE profile_traits (
    workspace_id UUID        NOT NULL,
    profile_id   UUID        NOT NULL,
    trait_key    TEXT        NOT NULL,
    trait_value  JSONB       NOT NULL,
    -- Time the value was computed; defaults to the insert time.
    computed_at  TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- One value per trait per profile per workspace.
    PRIMARY KEY (workspace_id, profile_id, trait_key)
);

-- Supports scans of one trait across all profiles of a workspace.
CREATE INDEX idx_profile_traits_workspace_key
    ON profile_traits (workspace_id, trait_key);
-- ---------------------------------------------------------------------------
-- segment_definitions -- audience segment specs.
-- ---------------------------------------------------------------------------
CREATE TABLE segment_definitions (
    id                UUID        NOT NULL DEFAULT gen_random_uuid(),
    workspace_id      UUID        NOT NULL,
    slug              TEXT        NOT NULL,
    name              TEXT        NOT NULL,
    description       TEXT,
    -- Filter tree evaluated against profiles + events + traits.
    spec              JSONB       NOT NULL,
    refresh_every     INTERVAL    NOT NULL DEFAULT INTERVAL '1 hour',
    enabled           BOOLEAN     NOT NULL DEFAULT TRUE,
    created_at        TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at        TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- Nullable: stays NULL until a refresh timestamp is recorded.
    last_refreshed_at TIMESTAMPTZ,
    PRIMARY KEY (id),
    -- Slugs are unique within a workspace.
    UNIQUE (workspace_id, slug)
);

-- Partial index: only enabled segments, looked up by workspace.
CREATE INDEX idx_segment_definitions_workspace
    ON segment_definitions (workspace_id)
    WHERE enabled;
-- ---------------------------------------------------------------------------
-- segment_memberships -- history table powering delta Reverse ETL.
-- exited_at NULL means the profile is currently a member.
-- ---------------------------------------------------------------------------
CREATE TABLE segment_memberships (
    -- Deleting a segment definition removes its membership history as well.
    segment_id UUID        NOT NULL REFERENCES segment_definitions (id) ON DELETE CASCADE,
    profile_id UUID        NOT NULL,
    entered_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- NULL while the profile is still a member of the segment.
    exited_at  TIMESTAMPTZ
);

-- A profile can have many historical rows per segment but at most one open
-- (exited_at IS NULL) row. Enforcing that here keeps delta computation from
-- seeing the same profile as "currently a member" twice.
CREATE UNIQUE INDEX idx_segment_memberships_active
    ON segment_memberships (segment_id, profile_id)
    WHERE exited_at IS NULL;

-- Lookup of every segment a profile is or was in.
CREATE INDEX idx_segment_memberships_profile
    ON segment_memberships (profile_id);
-- ---------------------------------------------------------------------------
-- saved_queries -- user-saved query specs from the Explore / SQL UI.
-- ---------------------------------------------------------------------------
CREATE TABLE saved_queries (
    id           UUID        NOT NULL DEFAULT gen_random_uuid(),
    workspace_id UUID        NOT NULL,
    -- Nullable: a saved query may have no owner recorded.
    owner_id     UUID,
    name         TEXT        NOT NULL,
    -- Which query builder produced the spec; restricted by the CHECK below.
    kind         TEXT        NOT NULL,
    spec         JSONB       NOT NULL,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    PRIMARY KEY (id),
    CONSTRAINT saved_queries_kind_check
        CHECK (kind IN ('events', 'sql', 'funnel', 'retention', 'session'))
);

-- Listing saved queries of one kind inside a workspace.
CREATE INDEX idx_saved_queries_workspace
    ON saved_queries (workspace_id, kind);

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Apply / drop ClickHouse DDL files in alphabetical order.
#
# Usage:
# clickhouse_apply.sh up apply *.up.sql in infra/clickhouse/
# clickhouse_apply.sh down apply *.down.sql in REVERSE order
#
# Env:
# CLICKHOUSE_ADDR (default localhost:9000)
# CLICKHOUSE_DB (default cdp)
# CLICKHOUSE_USER (default default)
# CLICKHOUSE_PASSWORD (default empty)
# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere inside a pipeline.
set -euo pipefail

# DDL files live in the `clickhouse` directory next to this script's directory.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/clickhouse"

# Connection settings, each overridable through the environment.
ADDR="${CLICKHOUSE_ADDR:-localhost:9000}"
DB="${CLICKHOUSE_DB:-cdp}"
USER="${CLICKHOUSE_USER:-default}"
PASS="${CLICKHOUSE_PASSWORD:-}"

# First positional argument selects the direction; defaults to "up".
MODE="${1:-up}"

# Split ADDR into host and port. When ADDR carries no ":port" suffix, fall back
# to the default port instead of reusing the hostname as the port.
host="${ADDR%%:*}"
if [[ "$ADDR" == *:* ]]; then
  port="${ADDR##*:}"
else
  port=9000
fi
# Run every statement in one DDL file against the configured database.
#   $1 - path to the .sql file to execute
run_sql() {
  local sql_path="$1"
  # Connection flags shared by both the password and the no-password case.
  local -a client_args=(--host "$host" --port "$port" --user "$USER")

  echo ">>> applying $(basename "$sql_path")"

  # Only pass --password when one is configured.
  if [[ -n "$PASS" ]]; then
    client_args+=(--password "$PASS")
  fi

  clickhouse-client "${client_args[@]}" \
    --database "$DB" --multiquery --queries-file "$sql_path"
}
# Create the target database when it does not exist yet. Connects without
# --database because the database may not be there on the first run.
ensure_db() {
  # Quote the database name as an identifier so names containing characters
  # such as "-" parse correctly; embedded backticks are backslash-escaped.
  local bt='`'
  local quoted_db="${bt}${DB//${bt}/\\${bt}}${bt}"
  local -a client_args=(--host "$host" --port "$port" --user "$USER")

  # Only pass --password when one is configured.
  if [[ -n "$PASS" ]]; then
    client_args+=(--password "$PASS")
  fi

  clickhouse-client "${client_args[@]}" \
    --query "CREATE DATABASE IF NOT EXISTS ${quoted_db}"
}
# Let a glob that matches nothing expand to an empty list, so a directory
# without migration files is a no-op rather than a literal "*.up.sql" path.
shopt -s nullglob

case "$MODE" in
  up)
    ensure_db
    # Glob results come back sorted, so files apply in alphabetical order.
    # Iterating the glob directly (instead of parsing `ls`) keeps paths that
    # contain whitespace intact.
    for f in "$DIR"/*.up.sql; do
      run_sql "$f"
    done
    ;;
  down)
    # Same sorted list, walked from the end to undo in reverse order.
    down_files=("$DIR"/*.down.sql)
    for ((i = ${#down_files[@]} - 1; i >= 0; i--)); do
      run_sql "${down_files[i]}"
    done
    ;;
  *)
    # Unknown mode: report usage on stderr and fail.
    echo "usage: $0 {up|down}" >&2
    exit 1
    ;;
esac
echo "done."