init ingestion

This commit is contained in:
2026-05-24 22:59:24 +07:00
commit 4e8c11d545
80 changed files with 5639 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
-- Down migration for the ClickHouse event store: removes the dead-letter
-- table and the four event-family tables. IF EXISTS lets each statement
-- succeed when the table is already absent.
DROP TABLE IF EXISTS events_dlq;
DROP TABLE IF EXISTS events_group;
DROP TABLE IF EXISTS events_page;
DROP TABLE IF EXISTS events_identify;
DROP TABLE IF EXISTS events_track;

View File

@@ -0,0 +1,117 @@
-- ---------------------------------------------------------------------------
-- ClickHouse event store.
--
-- One wide table per event family (track / identify / page / group).
-- All share the same key columns. Custom fields are flattened into the
-- properties / traits Map.
-- ---------------------------------------------------------------------------
-- Table for the `track` event family.
--   * partitioned by calendar month of received_at
--   * sorted by (workspace_id, source_id, received_at, message_id)
--   * rows expire 18 months after received_at
CREATE TABLE IF NOT EXISTS events_track
(
    -- Key columns (same set on every events_* family table).
    workspace_id    String,
    source_id       String,
    message_id      String,
    anonymous_id    String,
    user_id         String,

    -- Event name.
    event           String,

    -- Millisecond-precision UTC instants.
    timestamp       DateTime64(3, 'UTC'),
    sent_at         DateTime64(3, 'UTC'),
    received_at     DateTime64(3, 'UTC'),

    -- String-to-string maps holding the flattened payload.
    properties      Map(String, String),
    context         Map(String, String),

    ip              String,
    user_agent      String,
    library_name    String,
    library_version String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(received_at)
ORDER BY (workspace_id, source_id, received_at, message_id)
-- TTL expression is built from a DateTime, hence the toDateTime() cast.
TTL toDateTime(received_at) + INTERVAL 18 MONTH
SETTINGS index_granularity = 8192;
-- Table for the `identify` event family.
--   * partitioned by calendar month of received_at
--   * sorted by (workspace_id, source_id, received_at, message_id)
--   * rows expire 18 months after received_at
CREATE TABLE IF NOT EXISTS events_identify
(
    -- Key columns (same set on every events_* family table).
    workspace_id String,
    source_id    String,
    message_id   String,
    anonymous_id String,
    user_id      String,

    -- Millisecond-precision UTC instants.
    timestamp    DateTime64(3, 'UTC'),
    sent_at      DateTime64(3, 'UTC'),
    received_at  DateTime64(3, 'UTC'),

    -- String-to-string maps holding the flattened payload.
    traits       Map(String, String),
    context      Map(String, String),

    ip           String,
    user_agent   String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(received_at)
ORDER BY (workspace_id, source_id, received_at, message_id)
-- TTL expression is built from a DateTime, hence the toDateTime() cast.
TTL toDateTime(received_at) + INTERVAL 18 MONTH
SETTINGS index_granularity = 8192;
-- Table for the `page` event family.
--   * partitioned by calendar month of received_at
--   * sorted by (workspace_id, source_id, received_at, message_id)
--   * rows expire 18 months after received_at
CREATE TABLE IF NOT EXISTS events_page
(
    -- Key columns (same set on every events_* family table).
    workspace_id String,
    source_id    String,
    message_id   String,
    anonymous_id String,
    user_id      String,

    -- Page name and category.
    name         String,
    category     String,

    -- Millisecond-precision UTC instants.
    timestamp    DateTime64(3, 'UTC'),
    sent_at      DateTime64(3, 'UTC'),
    received_at  DateTime64(3, 'UTC'),

    -- String-to-string maps holding the flattened payload.
    properties   Map(String, String),
    context      Map(String, String),

    ip           String,
    user_agent   String,

    -- Page-specific columns kept outside the properties map.
    referrer     String,
    path         String,
    url          String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(received_at)
ORDER BY (workspace_id, source_id, received_at, message_id)
-- TTL expression is built from a DateTime, hence the toDateTime() cast.
TTL toDateTime(received_at) + INTERVAL 18 MONTH
SETTINGS index_granularity = 8192;
-- Table for the `group` event family.
--   * partitioned by calendar month of received_at
--   * sorted by (workspace_id, source_id, received_at, message_id)
--   * rows expire 18 months after received_at
CREATE TABLE IF NOT EXISTS events_group
(
    -- Key columns (same set on every events_* family table).
    workspace_id String,
    source_id    String,
    message_id   String,
    anonymous_id String,
    user_id      String,

    -- Group the event refers to.
    group_id     String,

    -- Millisecond-precision UTC instants.
    timestamp    DateTime64(3, 'UTC'),
    sent_at      DateTime64(3, 'UTC'),
    received_at  DateTime64(3, 'UTC'),

    -- String-to-string maps holding the flattened payload.
    traits       Map(String, String),
    context      Map(String, String),

    ip           String,
    user_agent   String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(received_at)
ORDER BY (workspace_id, source_id, received_at, message_id)
-- TTL expression is built from a DateTime, hence the toDateTime() cast.
TTL toDateTime(received_at) + INTERVAL 18 MONTH
SETTINGS index_granularity = 8192;
-- Dead-letter queue for events that failed validation / transformation.
--   * partitioned by calendar month of received_at
--   * sorted by (workspace_id, received_at)
--   * rows expire 30 days after received_at
CREATE TABLE IF NOT EXISTS events_dlq
(
    workspace_id String,
    source_id    String,
    message_id   String,
    received_at  DateTime64(3, 'UTC'),

    -- Failure description and the field it relates to.
    reason       String,
    field        String,

    -- The rejected payload, stored as a single string.
    raw_payload  String
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(received_at)
ORDER BY (workspace_id, received_at)
-- TTL expression is built from a DateTime, hence the toDateTime() cast.
TTL toDateTime(received_at) + INTERVAL 30 DAY
SETTINGS index_granularity = 8192;

View File

@@ -0,0 +1,19 @@
<?xml version="1.0"?>
<clickhouse>
<!-- Server overrides for local development only. -->
<!-- Log at "information" level and also write log output to the console. -->
<logger>
<level>information</level>
<console>1</console>
</logger>
<!-- Listen on all IPv4 interfaces so the server is reachable from outside
     the container (dev only). -->
<listen_host>0.0.0.0</listen_host>
<!-- Settings for the "default" profile: max_memory_usage of 4000000000 bytes
     (about 4 GB), uncompressed cache off, random load balancing.
     NOTE(review): ClickHouse normally reads <profiles> from the users config
     (users.xml / users.d), not from a server config.d override; confirm
     these values are actually applied where this file is mounted. -->
<profiles>
<default>
<max_memory_usage>4000000000</max_memory_usage>
<use_uncompressed_cache>0</use_uncompressed_cache>
<load_balancing>random</load_balancing>
</default>
</profiles>
</clickhouse>

View File

@@ -0,0 +1,113 @@
version: "3.9"
# ---------------------------------------------------------------------------
# CDP Ingestion - local development infrastructure
#
# Brings up: PostgreSQL, Redis, Redpanda (Kafka), ClickHouse, Redpanda Console
# ---------------------------------------------------------------------------
services:
  # PostgreSQL; user, password and database are all "cdp".
  postgres:
    image: postgres:16-alpine
    container_name: cdp-postgres
    restart: unless-stopped
    environment:
      POSTGRES_USER: cdp
      POSTGRES_PASSWORD: cdp
      POSTGRES_DB: cdp
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U cdp -d cdp"]
      interval: 5s
      timeout: 3s
      retries: 10

  # Redis with append-only persistence plus a snapshot rule ("--save 60 1").
  redis:
    image: redis:7-alpine
    container_name: cdp-redis
    restart: unless-stopped
    command: ["redis-server", "--appendonly", "yes", "--save", "60", "1"]
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 3s
      retries: 10

  # Single-node Redpanda broker (node id 0, one core, 1G of memory).
  # Each API has two listeners: "internal" is advertised under the compose
  # service name (redpanda:<port>), "external" under localhost:<port>.
  redpanda:
    image: redpandadata/redpanda:v24.2.7
    container_name: cdp-redpanda
    restart: unless-stopped
    command:
      - redpanda
      - start
      - --kafka-addr=internal://0.0.0.0:9092,external://0.0.0.0:19092
      - --advertise-kafka-addr=internal://redpanda:9092,external://localhost:19092
      - --pandaproxy-addr=internal://0.0.0.0:8082,external://0.0.0.0:18082
      - --advertise-pandaproxy-addr=internal://redpanda:8082,external://localhost:18082
      - --schema-registry-addr=internal://0.0.0.0:8081,external://0.0.0.0:18081
      - --rpc-addr=0.0.0.0:33145
      - --advertise-rpc-addr=redpanda:33145
      - --smp=1
      - --memory=1G
      - --overprovisioned
      - --node-id=0
      - --check=false
    ports:
      - "9092:9092"
      - "19092:19092"
      - "9644:9644"
    volumes:
      - redpanda_data:/var/lib/redpanda/data
    healthcheck:
      # Healthy once `rpk cluster health` reports "Healthy: true".
      test: ["CMD-SHELL", "rpk cluster health | grep -E 'Healthy:.+true' || exit 1"]
      interval: 10s
      timeout: 5s
      retries: 10

  # Web console for the broker, reached over the internal Kafka listener.
  redpanda-console:
    image: redpandadata/console:v2.7.2
    container_name: cdp-redpanda-console
    restart: unless-stopped
    # Wait for the broker's healthcheck to pass, not merely for its container
    # to start; the short list form of depends_on only orders start-up.
    depends_on:
      redpanda:
        condition: service_healthy
    environment:
      KAFKA_BROKERS: redpanda:9092
    ports:
      - "8080:8080"

  # ClickHouse server; database "cdp", user "default".
  clickhouse:
    image: clickhouse/clickhouse-server:24.8
    container_name: cdp-clickhouse
    restart: unless-stopped
    ulimits:
      nofile:
        soft: 262144
        hard: 262144
    environment:
      CLICKHOUSE_DB: cdp
      CLICKHOUSE_USER: default
      CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT: "1"
    ports:
      - "8123:8123" # HTTP
      - "9000:9000" # Native
    volumes:
      - clickhouse_data:/var/lib/clickhouse
      # Local override file, mounted read-only into config.d.
      - ./clickhouse-config.xml:/etc/clickhouse-server/config.d/local.xml:ro
    healthcheck:
      test: ["CMD-SHELL", "wget -qO- http://localhost:8123/ping | grep -q Ok"]
      interval: 5s
      timeout: 3s
      retries: 10

# Named volumes backing each stateful service.
volumes:
  postgres_data:
  redis_data:
  redpanda_data:
  clickhouse_data:

View File

@@ -0,0 +1,12 @@
-- Down migration for the control-plane schema.
-- Tables are dropped before the tables they reference (for example
-- write_keys before sources, and sources before workspaces), so no DROP is
-- blocked by a foreign key. The pgcrypto extension is removed last.
DROP TABLE IF EXISTS audit_log;
DROP TABLE IF EXISTS schema_fields;
DROP TABLE IF EXISTS function_attachments;
DROP TABLE IF EXISTS functions;
DROP TABLE IF EXISTS source_destination_links;
DROP TABLE IF EXISTS destinations;
DROP TABLE IF EXISTS write_keys;
DROP TABLE IF EXISTS sources;
DROP TABLE IF EXISTS workspace_members;
DROP TABLE IF EXISTS users;
DROP TABLE IF EXISTS workspaces;
DROP EXTENSION IF EXISTS "pgcrypto";

View File

@@ -0,0 +1,178 @@
-- ---------------------------------------------------------------------------
-- Initial schema for CDP Ingestion control plane.
--
-- This database stores configuration, not events. Events live in ClickHouse.
-- ---------------------------------------------------------------------------
-- pgcrypto provides digest(), which the dev seed uses to hash the write key.
-- NOTE(review): the UUID defaults below call gen_random_uuid(); presumably
-- that is built in on the postgres:16 image, so pgcrypto may only be needed
-- for digest() -- confirm before removing.
CREATE EXTENSION IF NOT EXISTS "pgcrypto";
-- ---------------------------------------------------------------------------
-- workspaces
-- ---------------------------------------------------------------------------
CREATE TABLE workspaces (
    id          UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    slug        TEXT        NOT NULL UNIQUE,
    name        TEXT        NOT NULL,
    -- One of 'default', 'pro', 'enterprise'.
    tier        TEXT        NOT NULL DEFAULT 'default'
                            CHECK (tier IN ('default', 'pro', 'enterprise')),
    created_at  TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at  TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- Nullable; the partial index below covers rows where this is NULL.
    deleted_at  TIMESTAMPTZ
);

-- NOTE(review): UNIQUE (slug) already creates an index on slug; confirm this
-- partial index is still wanted.
CREATE INDEX idx_workspaces_slug
    ON workspaces (slug)
    WHERE deleted_at IS NULL;
-- ---------------------------------------------------------------------------
-- users (console operators)
-- ---------------------------------------------------------------------------
CREATE TABLE users (
    id            UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Unique across all users.
    email         TEXT        NOT NULL UNIQUE,
    password_hash TEXT        NOT NULL,
    -- Optional display name.
    name          TEXT,
    created_at    TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at    TIMESTAMPTZ NOT NULL DEFAULT now()
);
-- Membership of a user in a workspace, at most one row per pair.
-- Rows are removed when either the workspace or the user is deleted.
CREATE TABLE workspace_members (
    workspace_id UUID        NOT NULL
                             REFERENCES workspaces (id) ON DELETE CASCADE,
    user_id      UUID        NOT NULL
                             REFERENCES users (id) ON DELETE CASCADE,
    -- One of 'owner', 'admin', 'member', 'viewer'.
    role         TEXT        NOT NULL DEFAULT 'member'
                             CHECK (role IN ('owner', 'admin', 'member', 'viewer')),
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now(),

    PRIMARY KEY (workspace_id, user_id)
);
-- ---------------------------------------------------------------------------
-- sources -- each source is something that pushes events (web, mobile, server)
-- ---------------------------------------------------------------------------
CREATE TABLE sources (
    id           UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    workspace_id UUID        NOT NULL
                             REFERENCES workspaces (id) ON DELETE CASCADE,
    -- Unique within a workspace (see the table constraint below).
    slug         TEXT        NOT NULL,
    name         TEXT        NOT NULL,
    -- One of 'web', 'mobile', 'server', 'segment', 'webhook'.
    kind         TEXT        NOT NULL
                             CHECK (kind IN ('web', 'mobile', 'server', 'segment', 'webhook')),
    enabled      BOOLEAN     NOT NULL DEFAULT TRUE,
    -- JSON settings; defaults to an empty object.
    settings     JSONB       NOT NULL DEFAULT '{}',
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    deleted_at   TIMESTAMPTZ,

    UNIQUE (workspace_id, slug)
);

-- Lookup by workspace, limited to rows where deleted_at IS NULL.
CREATE INDEX idx_sources_workspace
    ON sources (workspace_id)
    WHERE deleted_at IS NULL;
-- ---------------------------------------------------------------------------
-- write_keys -- API auth tokens, scoped to a source
-- ---------------------------------------------------------------------------
CREATE TABLE write_keys (
    id           UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    workspace_id UUID        NOT NULL
                             REFERENCES workspaces (id) ON DELETE CASCADE,
    source_id    UUID        NOT NULL
                             REFERENCES sources (id) ON DELETE CASCADE,
    key_hash     TEXT        NOT NULL UNIQUE,  -- store hash, never raw
    key_prefix   TEXT        NOT NULL,         -- first ~8 chars for display
    label        TEXT,
    -- Nullable; the partial indexes below cover rows where this is NULL.
    revoked_at   TIMESTAMPTZ,
    last_used_at TIMESTAMPTZ,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Lookups by workspace and by source, limited to rows where revoked_at IS NULL.
CREATE INDEX idx_write_keys_workspace
    ON write_keys (workspace_id)
    WHERE revoked_at IS NULL;
CREATE INDEX idx_write_keys_source
    ON write_keys (source_id)
    WHERE revoked_at IS NULL;
-- ---------------------------------------------------------------------------
-- destinations -- where events are forwarded (clickhouse, snowflake, bq, s3...)
-- ---------------------------------------------------------------------------
CREATE TABLE destinations (
    id           UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    workspace_id UUID        NOT NULL
                             REFERENCES workspaces (id) ON DELETE CASCADE,
    -- Unique within a workspace (see the table constraint below).
    slug         TEXT        NOT NULL,
    name         TEXT        NOT NULL,
    -- Restricted to the destination kinds listed in the CHECK.
    kind         TEXT        NOT NULL
                             CHECK (kind IN ('clickhouse', 'postgres', 'snowflake', 'bigquery',
                                             'redshift', 's3', 'webhook')),
    enabled      BOOLEAN     NOT NULL DEFAULT TRUE,
    config       JSONB       NOT NULL DEFAULT '{}',  -- credentials encrypted at rest
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    deleted_at   TIMESTAMPTZ,

    UNIQUE (workspace_id, slug)
);

-- Lookup by workspace, limited to rows where deleted_at IS NULL.
CREATE INDEX idx_destinations_workspace
    ON destinations (workspace_id)
    WHERE deleted_at IS NULL;
-- source -> destination wiring
-- At most one row per (source, destination) pair; rows are removed when
-- either side is deleted.
CREATE TABLE source_destination_links (
    source_id      UUID        NOT NULL
                               REFERENCES sources (id) ON DELETE CASCADE,
    destination_id UUID        NOT NULL
                               REFERENCES destinations (id) ON DELETE CASCADE,
    enabled        BOOLEAN     NOT NULL DEFAULT TRUE,
    created_at     TIMESTAMPTZ NOT NULL DEFAULT now(),

    PRIMARY KEY (source_id, destination_id)
);
-- ---------------------------------------------------------------------------
-- functions -- JS transformation code run by rotor
-- ---------------------------------------------------------------------------
CREATE TABLE functions (
    id           UUID        PRIMARY KEY DEFAULT gen_random_uuid(),
    workspace_id UUID        NOT NULL
                             REFERENCES workspaces (id) ON DELETE CASCADE,
    -- Unique within a workspace (see the table constraint below).
    slug         TEXT        NOT NULL,
    name         TEXT        NOT NULL,
    -- Function source text.
    code         TEXT        NOT NULL,
    enabled      BOOLEAN     NOT NULL DEFAULT TRUE,
    -- Starts at 1.
    version      INTEGER     NOT NULL DEFAULT 1,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at   TIMESTAMPTZ NOT NULL DEFAULT now(),
    deleted_at   TIMESTAMPTZ,

    UNIQUE (workspace_id, slug)
);
-- Attaches a function to exactly one target: a source or a destination.
-- Rows are removed when the function or its target is deleted.
CREATE TABLE function_attachments (
    source_id      UUID    REFERENCES sources (id) ON DELETE CASCADE,
    destination_id UUID    REFERENCES destinations (id) ON DELETE CASCADE,
    function_id    UUID    NOT NULL
                           REFERENCES functions (id) ON DELETE CASCADE,
    position       INTEGER NOT NULL DEFAULT 0,
    enabled        BOOLEAN NOT NULL DEFAULT TRUE,

    -- Exactly one of source_id / destination_id must be set.
    CHECK (num_nonnulls(source_id, destination_id) = 1)
);

CREATE INDEX idx_function_attachments_source
    ON function_attachments (source_id);
CREATE INDEX idx_function_attachments_destination
    ON function_attachments (destination_id);
-- ---------------------------------------------------------------------------
-- schema_fields -- discovered field types per (workspace, event_type, field)
-- ---------------------------------------------------------------------------
CREATE TABLE schema_fields (
    workspace_id  UUID        NOT NULL
                              REFERENCES workspaces (id) ON DELETE CASCADE,
    event_type    TEXT        NOT NULL,
    field         TEXT        NOT NULL,
    -- Restricted to the type names listed in the CHECK.
    data_type     TEXT        NOT NULL
                              CHECK (data_type IN ('string', 'number', 'boolean',
                                                   'object', 'array', 'timestamp', 'null')),
    first_seen_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    last_seen_at  TIMESTAMPTZ NOT NULL DEFAULT now(),
    -- Starts at 1.
    sample_count  BIGINT      NOT NULL DEFAULT 1,

    PRIMARY KEY (workspace_id, event_type, field)
);

-- NOTE(review): these columns are the leading columns of the primary key;
-- confirm a separate index is still wanted.
CREATE INDEX idx_schema_fields_event
    ON schema_fields (workspace_id, event_type);
-- ---------------------------------------------------------------------------
-- audit_log -- security-relevant operations
-- ---------------------------------------------------------------------------
CREATE TABLE audit_log (
    id           BIGSERIAL   PRIMARY KEY,
    -- Both references are nulled rather than cascaded, so a log row outlives
    -- the workspace or user it points at.
    workspace_id UUID        REFERENCES workspaces (id) ON DELETE SET NULL,
    actor_id     UUID        REFERENCES users (id) ON DELETE SET NULL,
    action       TEXT        NOT NULL,
    -- Optional free-text target description and JSON metadata.
    target_type  TEXT,
    target_id    TEXT,
    metadata     JSONB,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Per-workspace listing, newest first.
CREATE INDEX idx_audit_log_workspace
    ON audit_log (workspace_id, created_at DESC);

View File

@@ -0,0 +1,3 @@
-- Removes the local dev seed rows by their fixed IDs, child rows first:
-- the write key, then its source, then the workspace.
DELETE FROM write_keys WHERE id = '00000000-0000-0000-0000-000000000100';
DELETE FROM sources WHERE id = '00000000-0000-0000-0000-000000000010';
DELETE FROM workspaces WHERE id = '00000000-0000-0000-0000-000000000001';

View File

@@ -0,0 +1,24 @@
-- ---------------------------------------------------------------------------
-- Local dev seed -- a default workspace + source + write key.
-- The plaintext write key for development is: cdp_dev_writekey_1234567890
-- key_hash below is sha256 of that string.
-- ---------------------------------------------------------------------------
-- Every row uses a fixed ID, so each INSERT can collide on the primary key as
-- well as on the table's other unique constraint. A target-less
-- ON CONFLICT DO NOTHING skips the row whichever constraint collides, so the
-- seed can be re-applied even if a seeded row's slug or key hash was changed.

-- Dev workspace.
INSERT INTO workspaces (id, slug, name, tier)
VALUES ('00000000-0000-0000-0000-000000000001', 'dev', 'Dev Workspace', 'default')
ON CONFLICT DO NOTHING;

-- Web source inside the dev workspace.
INSERT INTO sources (id, workspace_id, slug, name, kind)
VALUES ('00000000-0000-0000-0000-000000000010',
        '00000000-0000-0000-0000-000000000001',
        'web', 'Dev Web Source', 'web')
ON CONFLICT DO NOTHING;

-- Write key for that source; only the hex-encoded SHA-256 of the key is stored.
-- digest() comes from the pgcrypto extension.
INSERT INTO write_keys (id, workspace_id, source_id, key_hash, key_prefix, label)
VALUES ('00000000-0000-0000-0000-000000000100',
        '00000000-0000-0000-0000-000000000001',
        '00000000-0000-0000-0000-000000000010',
        encode(digest('cdp_dev_writekey_1234567890', 'sha256'), 'hex'),
        'cdp_dev_',
        'dev key')
ON CONFLICT DO NOTHING;

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env bash
# Apply / drop ClickHouse DDL files in alphabetical order.
#
# Usage:
# clickhouse_apply.sh up apply *.up.sql in infra/clickhouse/
# clickhouse_apply.sh down apply *.down.sql in REVERSE order
#
# Env:
# CLICKHOUSE_ADDR (default localhost:9000)
# CLICKHOUSE_DB (default cdp)
# CLICKHOUSE_USER (default default)
# CLICKHOUSE_PASSWORD (default empty)
# Exit on command failure, on use of an unset variable, and when any stage of
# a pipeline fails.
set -euo pipefail
# DDL directory: "clickhouse" under the parent of this script's directory.
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)/clickhouse"
# Connection settings; each falls back to the default shown when the
# corresponding environment variable is unset or empty.
ADDR="${CLICKHOUSE_ADDR:-localhost:9000}"
DB="${CLICKHOUSE_DB:-cdp}"
# NOTE(review): USER is also a common login environment variable; assigning it
# here replaces that value for the rest of the script -- confirm intended.
USER="${CLICKHOUSE_USER:-default}"
PASS="${CLICKHOUSE_PASSWORD:-}"
# Operation to run: first argument, "up" when omitted.
MODE="${1:-up}"
# Split ADDR on ':' -- host is everything before the first colon, port is
# everything after the last one.
host="${ADDR%%:*}"
port="${ADDR##*:}"
# run_sql FILE
# Prints a progress line, then executes every statement in FILE with
# clickhouse-client against database $DB. --password is passed only when
# $PASS is non-empty. Returns clickhouse-client's exit status.
run_sql() {
  local sql_path="$1"
  local -a client_args=(--host "$host" --port "$port" --user "$USER")

  echo ">>> applying $(basename "$sql_path")"

  if [[ -n "$PASS" ]]; then
    client_args+=(--password "$PASS")
  fi
  client_args+=(--database "$DB" --multiquery --queries-file "$sql_path")

  clickhouse-client "${client_args[@]}"
}
# ensure_db
# Creates database $DB if it does not exist yet. No --database flag is passed,
# so the connection does not depend on $DB already existing. --password is
# passed only when $PASS is non-empty.
ensure_db() {
  local -a client_args=(--host "$host" --port "$port" --user "$USER")

  if [[ -n "$PASS" ]]; then
    client_args+=(--password "$PASS")
  fi
  client_args+=(--query "CREATE DATABASE IF NOT EXISTS $DB")

  clickhouse-client "${client_args[@]}"
}
# Dispatch on MODE.
#   up    create the database, then apply every *.up.sql in glob (sorted) order
#   down  apply every *.down.sql in reverse of that order
# Files are collected with a glob into an array rather than by parsing `ls`
# output, so paths containing whitespace stay intact. nullglob makes a pattern
# with no matches expand to an empty array, in which case nothing is applied.
shopt -s nullglob
case "$MODE" in
  up)
    ensure_db
    files=("$DIR"/*.up.sql)
    # Index loop: safe with an empty array under `set -u` on older bash.
    for ((i = 0; i < ${#files[@]}; i++)); do
      run_sql "${files[i]}"
    done
    ;;
  down)
    files=("$DIR"/*.down.sql)
    for ((i = ${#files[@]} - 1; i >= 0; i--)); do
      run_sql "${files[i]}"
    done
    ;;
  *)
    # Usage errors go to stderr so they are not mistaken for normal output.
    echo "usage: $0 {up|down}" >&2
    exit 1
    ;;
esac
echo "done."