data layer

This commit is contained in:
2026-05-25 08:38:26 +07:00
parent 4e8c11d545
commit a428170fef
81 changed files with 3941 additions and 0 deletions

View File

View File

@@ -0,0 +1,167 @@
package repo
import (
"context"
"fmt"
"time"
"github.com/ClickHouse/clickhouse-go/v2"
"github.com/ClickHouse/clickhouse-go/v2/lib/driver"
"github.com/dbiz/cdp/data-layer/api/internal/model"
"github.com/dbiz/cdp/data-layer/api/internal/templates"
)
// AnalyticsRepo runs the higher-level P1 query templates (funnel, retention,
// session) against ClickHouse. It shares the read connection with EventRepo
// but lives in its own file because the templates need their own data shapes.
type AnalyticsRepo struct {
// ch is the ClickHouse connection every query in this repo is executed on.
ch driver.Conn
// tpl renders the named *.sql.tmpl templates into the SQL text sent to ch.
tpl *templates.Store
}
// NewAnalyticsRepo builds an AnalyticsRepo that executes queries on ch and
// renders its SQL through tpl. Neither argument is validated here.
func NewAnalyticsRepo(ch driver.Conn, tpl *templates.Store) *AnalyticsRepo {
	repo := new(AnalyticsRepo)
	repo.ch = ch
	repo.tpl = tpl
	return repo
}
// ---------------------------------------------------------------------------
// Funnel
// ---------------------------------------------------------------------------
// FunnelQuery carries the inputs for AnalyticsRepo.Funnel.
type FunnelQuery struct {
// WorkspaceID is bound to the workspace_id query parameter.
WorkspaceID string
// Steps lists the funnel steps in order; entry i is bound as the query
// parameter "step<i>". Funnel rejects fewer than two steps.
Steps []string
// From and To are bound as the from/to query parameters with millisecond
// precision.
From time.Time
To time.Time
// WindowSeconds is bound to the window_seconds query parameter.
WindowSeconds uint32
}
// Funnel renders funnel_analysis.sql.tmpl for the steps in q, executes it on
// ClickHouse and returns the generic columns+rows result built by ScanRows.
//
// The template receives one entry per step (its index and whether it is the
// last one) plus the total step count. The step values themselves never reach
// the template: they are bound as the named parameters step0, step1, ...
// alongside workspace_id, from, to and window_seconds.
//
// An error is returned when fewer than two steps are supplied, when the
// template fails to render, or when the query fails.
func (r *AnalyticsRepo) Funnel(ctx context.Context, q FunnelQuery) (*model.QueryResult, error) {
	stepCount := len(q.Steps)
	if stepCount < 2 {
		return nil, fmt.Errorf("funnel requires at least 2 steps")
	}

	type stepTpl struct {
		Index int
		Last  bool
	}

	// Build the template view and the step bindings in a single pass.
	tplSteps := make([]stepTpl, 0, stepCount)
	bindings := []any{
		clickhouse.Named("workspace_id", q.WorkspaceID),
		clickhouse.DateNamed("from", q.From, clickhouse.MilliSeconds),
		clickhouse.DateNamed("to", q.To, clickhouse.MilliSeconds),
		clickhouse.Named("window_seconds", q.WindowSeconds),
	}
	for idx, stepName := range q.Steps {
		tplSteps = append(tplSteps, stepTpl{Index: idx, Last: idx == stepCount-1})
		bindings = append(bindings, clickhouse.Named(fmt.Sprintf("step%d", idx), stepName))
	}

	query, err := r.tpl.Render("funnel_analysis.sql.tmpl", map[string]any{
		"Steps":     tplSteps,
		"StepCount": stepCount,
	})
	if err != nil {
		return nil, err
	}

	rows, err := r.ch.Query(ctx, query, bindings...)
	if err != nil {
		return nil, fmt.Errorf("clickhouse funnel: %w", err)
	}
	defer rows.Close()

	return ScanRows(rows)
}
// ---------------------------------------------------------------------------
// Retention
// ---------------------------------------------------------------------------
// RetentionQuery carries the inputs for AnalyticsRepo.Retention.
type RetentionQuery struct {
// WorkspaceID is bound to the workspace_id query parameter.
WorkspaceID string
// InitialEvent and ReturnEvent are bound to the initial_event and
// return_event query parameters.
InitialEvent string
ReturnEvent string
// From and To are bound as the from/to query parameters with millisecond
// precision.
From time.Time
To time.Time
// Periods is the number of period columns rendered into the template.
// Retention substitutes 14 when the value is below 1.
Periods int // e.g. 14 => D0..D13
}
// Retention renders retention_cohort.sql.tmpl with one template entry per
// period, executes it on ClickHouse and returns the generic columns+rows
// result built by ScanRows.
//
// A Periods value below 1 falls back to 14. Entry k (1-based) is rendered with
// OffsetDay k and RIndex k+1, and the final entry is flagged Last.
//
// NOTE(review): RetentionQuery documents Periods as "14 => D0..D13" while the
// offsets generated here run 1..Periods; confirm against the template which
// of the two is intended.
func (r *AnalyticsRepo) Retention(ctx context.Context, q RetentionQuery) (*model.QueryResult, error) {
	periods := q.Periods
	if periods < 1 {
		periods = 14
	}

	type periodTpl struct {
		RIndex    int
		OffsetDay int
		Last      bool
	}
	columns := make([]periodTpl, 0, periods)
	for day := 1; day <= periods; day++ {
		columns = append(columns, periodTpl{
			RIndex:    day + 1,
			OffsetDay: day,
			Last:      day == periods,
		})
	}

	query, err := r.tpl.Render("retention_cohort.sql.tmpl", map[string]any{"Outer": columns})
	if err != nil {
		return nil, err
	}

	bindings := []any{
		clickhouse.Named("workspace_id", q.WorkspaceID),
		clickhouse.DateNamed("from", q.From, clickhouse.MilliSeconds),
		clickhouse.DateNamed("to", q.To, clickhouse.MilliSeconds),
		clickhouse.Named("initial_event", q.InitialEvent),
		clickhouse.Named("return_event", q.ReturnEvent),
	}
	rows, err := r.ch.Query(ctx, query, bindings...)
	if err != nil {
		return nil, fmt.Errorf("clickhouse retention: %w", err)
	}
	defer rows.Close()

	return ScanRows(rows)
}
// ---------------------------------------------------------------------------
// Session
// ---------------------------------------------------------------------------
// SessionQuery carries the inputs for AnalyticsRepo.Sessions.
type SessionQuery struct {
// WorkspaceID is bound to the workspace_id query parameter.
WorkspaceID string
// UserID, when non-empty, turns on the template's HasUserID branch and is
// bound to the user_id query parameter.
UserID string // optional
// From and To are bound as the from/to query parameters with millisecond
// precision.
From time.Time
To time.Time
// TimeoutSeconds is bound to the timeout_seconds query parameter.
TimeoutSeconds uint32
// Limit and Offset are converted to uint32 and bound to the limit/offset
// query parameters.
Limit int
Offset int
}
// Sessions renders session_analysis.sql.tmpl, executes it on ClickHouse and
// returns the generic columns+rows result built by ScanRows.
//
// The template is rendered with HasUserID set when q.UserID is non-empty, and
// the user_id binding is only supplied in that case. All other inputs are
// passed as named parameters (workspace_id, from, to, timeout_seconds, limit,
// offset).
//
// Limit and Offset are bound as uint32. Values that are negative or do not fit
// in a uint32 are rejected with an error rather than being silently wrapped by
// the integer conversion (for example -1 would otherwise become 4294967295).
func (r *AnalyticsRepo) Sessions(ctx context.Context, q SessionQuery) (*model.QueryResult, error) {
	const maxUint32 = 1<<32 - 1
	if q.Limit < 0 || int64(q.Limit) > maxUint32 {
		return nil, fmt.Errorf("invalid limit: %d", q.Limit)
	}
	if q.Offset < 0 || int64(q.Offset) > maxUint32 {
		return nil, fmt.Errorf("invalid offset: %d", q.Offset)
	}

	hasUserID := q.UserID != ""
	sql, err := r.tpl.Render("session_analysis.sql.tmpl", map[string]any{
		"HasUserID": hasUserID,
	})
	if err != nil {
		return nil, err
	}

	args := []any{
		clickhouse.Named("workspace_id", q.WorkspaceID),
		clickhouse.DateNamed("from", q.From, clickhouse.MilliSeconds),
		clickhouse.DateNamed("to", q.To, clickhouse.MilliSeconds),
		clickhouse.Named("timeout_seconds", q.TimeoutSeconds),
		clickhouse.Named("limit", uint32(q.Limit)),
		clickhouse.Named("offset", uint32(q.Offset)),
	}
	if hasUserID {
		args = append(args, clickhouse.Named("user_id", q.UserID))
	}

	rows, err := r.ch.Query(ctx, sql, args...)
	if err != nil {
		return nil, fmt.Errorf("clickhouse session: %w", err)
	}
	defer rows.Close()

	return ScanRows(rows)
}

View File

@@ -0,0 +1,58 @@
package repo
import (
"context"
"fmt"
"github.com/ClickHouse/clickhouse-go/v2"
"github.com/ClickHouse/clickhouse-go/v2/lib/driver"
)
// NewClickHouse opens a ClickHouse connection to addr, authenticating against
// database db as user/password, and verifies it with a ping before returning.
// The connection is opened with the "readonly" setting at 0. Per the upstream
// driver contract the returned driver.Conn is safe for concurrent use; the
// caller owns Close().
//
// If the ping fails the connection is closed again and the ping error is
// returned.
func NewClickHouse(ctx context.Context, addr, db, user, password string) (driver.Conn, error) {
	opts := &clickhouse.Options{
		Addr: []string{addr},
		Auth: clickhouse.Auth{Database: db, Username: user, Password: password},
		Settings: clickhouse.Settings{
			"readonly": 0, // analytics queries; per-user read-only enforced for /query/sql separately
		},
	}

	conn, err := clickhouse.Open(opts)
	if err != nil {
		return nil, fmt.Errorf("open clickhouse: %w", err)
	}

	if pingErr := conn.Ping(ctx); pingErr != nil {
		// Best-effort cleanup: the ping failure is the error worth reporting.
		_ = conn.Close()
		return nil, fmt.Errorf("ping clickhouse: %w", pingErr)
	}
	return conn, nil
}
// NewClickHouseReadOnly opens a ClickHouse connection intended for the
// /query/sql sandbox. It is meant to be used with a SELECT-only account so
// that DDL/DML are rejected at the database level even if the app-level
// keyword guard is bypassed; on top of that the connection itself is opened
// with the "readonly" setting at 2.
//
// The connection is verified with a ping; if the ping fails the connection is
// closed again and the ping error is returned. The caller owns Close().
func NewClickHouseReadOnly(ctx context.Context, addr, db, user, password string) (driver.Conn, error) {
	opts := &clickhouse.Options{
		Addr: []string{addr},
		Auth: clickhouse.Auth{Database: db, Username: user, Password: password},
		Settings: clickhouse.Settings{
			"readonly": 2, // belt-and-braces: server-side enforce read-only
		},
	}

	conn, err := clickhouse.Open(opts)
	if err != nil {
		return nil, fmt.Errorf("open clickhouse (ro): %w", err)
	}

	if pingErr := conn.Ping(ctx); pingErr != nil {
		// Best-effort cleanup: the ping failure is the error worth reporting.
		_ = conn.Close()
		return nil, fmt.Errorf("ping clickhouse (ro): %w", pingErr)
	}
	return conn, nil
}

View File

@@ -0,0 +1,194 @@
package repo
import (
	"context"
	"fmt"
	"time"

	"github.com/ClickHouse/clickhouse-go/v2"
	"github.com/ClickHouse/clickhouse-go/v2/lib/driver"
	"github.com/dbiz/cdp/data-layer/api/internal/model"
	"github.com/dbiz/cdp/data-layer/api/internal/templates"
)
// EventRepo runs the event-level query templates (event explorer, profile
// timeline) against ClickHouse.
type EventRepo struct {
// ch is the ClickHouse connection every query in this repo is executed on.
ch driver.Conn
// tpl renders the named *.sql.tmpl templates into the SQL text sent to ch.
tpl *templates.Store
}
// NewEventRepo builds an EventRepo that executes queries on ch and renders
// its SQL through tpl. Neither argument is validated here.
func NewEventRepo(ch driver.Conn, tpl *templates.Store) *EventRepo {
	repo := new(EventRepo)
	repo.ch = ch
	repo.tpl = tpl
	return repo
}
// QueryEvents renders the event_explorer template for q.Table, executes it on
// ClickHouse and returns columns+rows via ScanRows.
//
// Filter values never land in the SQL string; they are passed as
// clickhouse.Named bindings. The one exception is the table name, which is
// handed to the template and is therefore checked with q.Table.Valid() first.
// The optional user_id / anonymous_id filters are enabled when the matching
// field is non-empty; the event-name filter additionally requires the track
// table.
//
// Limit and Offset are bound as uint32. Values that are negative or do not fit
// in a uint32 are rejected with an error rather than being silently wrapped by
// the integer conversion.
func (r *EventRepo) QueryEvents(ctx context.Context, q model.EventQuery) (*model.QueryResult, error) {
	if !q.Table.Valid() {
		return nil, fmt.Errorf("invalid event table: %q", q.Table)
	}

	const maxUint32 = 1<<32 - 1
	if limit := int64(q.Limit); limit < 0 || limit > maxUint32 {
		return nil, fmt.Errorf("invalid limit: %d", q.Limit)
	}
	if offset := int64(q.Offset); offset < 0 || offset > maxUint32 {
		return nil, fmt.Errorf("invalid offset: %d", q.Offset)
	}

	// Computed once so the template flags and the bindings cannot drift apart.
	hasUserID := q.UserID != ""
	hasAnonymousID := q.AnonymousID != ""
	hasEventName := q.EventName != "" && q.Table == model.EventTableTrack

	sql, err := r.tpl.Render("event_explorer.sql.tmpl", map[string]any{
		"Table":          string(q.Table),
		"HasUserID":      hasUserID,
		"HasAnonymousID": hasAnonymousID,
		"HasEventName":   hasEventName,
	})
	if err != nil {
		return nil, err
	}

	args := []any{
		clickhouse.Named("workspace_id", q.WorkspaceID),
		clickhouse.DateNamed("from", q.From, clickhouse.MilliSeconds),
		clickhouse.DateNamed("to", q.To, clickhouse.MilliSeconds),
		clickhouse.Named("limit", uint32(q.Limit)),
		clickhouse.Named("offset", uint32(q.Offset)),
	}
	if hasUserID {
		args = append(args, clickhouse.Named("user_id", q.UserID))
	}
	if hasAnonymousID {
		args = append(args, clickhouse.Named("anonymous_id", q.AnonymousID))
	}
	if hasEventName {
		args = append(args, clickhouse.Named("event", q.EventName))
	}

	rows, err := r.ch.Query(ctx, sql, args...)
	if err != nil {
		return nil, fmt.Errorf("clickhouse query: %w", err)
	}
	defer rows.Close()

	return ScanRows(rows)
}
// QueryProfileTimeline renders profile_timeline.sql.tmpl (no template data)
// and executes it for the given workspace and user_id, returning columns+rows
// via ScanRows. Per the template's contract it returns recent events for a
// profile across all four event tables, ordered by received_at desc.
//
// limit and offset are bound as uint32. Values that are negative or do not fit
// in a uint32 are rejected with an error rather than being silently wrapped by
// the integer conversion.
func (r *EventRepo) QueryProfileTimeline(ctx context.Context, workspaceID, userID string, limit, offset int) (*model.QueryResult, error) {
	const maxUint32 = 1<<32 - 1
	if limit < 0 || int64(limit) > maxUint32 {
		return nil, fmt.Errorf("invalid limit: %d", limit)
	}
	if offset < 0 || int64(offset) > maxUint32 {
		return nil, fmt.Errorf("invalid offset: %d", offset)
	}

	sql, err := r.tpl.Render("profile_timeline.sql.tmpl", nil)
	if err != nil {
		return nil, err
	}

	rows, err := r.ch.Query(ctx, sql,
		clickhouse.Named("workspace_id", workspaceID),
		clickhouse.Named("user_id", userID),
		clickhouse.Named("limit", uint32(limit)),
		clickhouse.Named("offset", uint32(offset)),
	)
	if err != nil {
		return nil, fmt.Errorf("clickhouse query: %w", err)
	}
	defer rows.Close()

	return ScanRows(rows)
}
// ScanRows drains a driver.Rows iterator into a generic QueryResult.
//
// For every row a fresh set of scan targets is allocated from the Go scan
// type each column reports (see newScanTarget), the row is scanned into them,
// and the targets are unwrapped into plain values (see derefScanTarget).
// Rows is always non-nil in the result, even when no row was read, and
// RowCount is set to the number of rows collected.
//
// A scan failure is returned wrapped as "scan row"; an iteration error from
// rows.Err() is returned as-is. The caller remains responsible for closing
// rows.
func ScanRows(rows driver.Rows) (*model.QueryResult, error) {
	names := rows.Columns()
	columnTypes := rows.ColumnTypes()
	width := len(columnTypes)

	result := &model.QueryResult{
		Columns: names,
		Rows:    [][]any{},
	}

	for rows.Next() {
		targets := make([]any, width)
		for idx := range columnTypes {
			targets[idx] = newScanTarget(columnTypes[idx].ScanType().String())
		}
		if err := rows.Scan(targets...); err != nil {
			return nil, fmt.Errorf("scan row: %w", err)
		}

		values := make([]any, 0, len(targets))
		for _, target := range targets {
			values = append(values, derefScanTarget(target))
		}
		result.Rows = append(result.Rows, values)
	}
	if err := rows.Err(); err != nil {
		return nil, err
	}

	result.RowCount = len(result.Rows)
	return result, nil
}
// newScanTarget returns a freshly allocated pointer to scan one column value
// into, chosen from the column's reported Go scan type name (the String() form
// of its reflect.Type, e.g. "uint32" or "time.Time").
//
// Listed types get a pointer of exactly that type. time.Time columns get a
// *time.Time rather than an untyped *any: the native ClickHouse driver is
// expected to require a typed destination for date/time columns. Any type
// name not listed falls back to *any and leaves the decision to the driver.
// We keep this list small -- the analytics tables share a handful of types.
func newScanTarget(typeName string) any {
	switch typeName {
	case "string":
		return new(string)
	case "uint8":
		return new(uint8)
	case "uint16":
		return new(uint16)
	case "uint32":
		return new(uint32)
	case "uint64":
		return new(uint64)
	case "int8":
		return new(int8)
	case "int16":
		return new(int16)
	case "int32":
		return new(int32)
	case "int64":
		return new(int64)
	case "float32":
		return new(float32)
	case "float64":
		return new(float64)
	case "bool":
		return new(bool)
	case "time.Time":
		return new(time.Time)
	case "map[string]string":
		return new(map[string]string)
	case "[]string":
		return new([]string)
	default:
		// Fallback: untyped pointer; driver decides.
		return new(any)
	}
}
// derefScanTarget unwraps a scan-target pointer into the value it points to,
// so result rows carry plain values rather than pointers.
//
// It handles pointers to the basic scalar types, time.Time, map[string]string,
// []string and the untyped *any target. Anything else is returned unchanged.
func derefScanTarget(p any) any {
	switch v := p.(type) {
	case *string:
		return *v
	case *uint8:
		return *v
	case *uint16:
		return *v
	case *uint32:
		return *v
	case *uint64:
		return *v
	case *int8:
		return *v
	case *int16:
		return *v
	case *int32:
		return *v
	case *int64:
		return *v
	case *float32:
		return *v
	case *float64:
		return *v
	case *bool:
		return *v
	case *time.Time:
		return *v
	case *map[string]string:
		return *v
	case *[]string:
		return *v
	case *any:
		return *v
	default:
		// Unknown target kind: hand it back untouched rather than guessing.
		return v
	}
}

View File

@@ -0,0 +1,28 @@
// Package repo holds data-access code. PostgreSQL handles owned tables
// (trait_definitions, profile_traits, segment_*, saved_queries) and read-only
// joins onto ingestion-owned tables (workspaces, profiles, sources, ...).
package repo
import (
"context"
"fmt"
"github.com/jackc/pgx/v5/pgxpool"
)
// NewPool parses dsn, opens a pgxpool with the resulting configuration and
// verifies it with a ping before returning it. Caller owns Close().
//
// If the ping fails the pool is closed again and the ping error is returned.
func NewPool(ctx context.Context, dsn string) (*pgxpool.Pool, error) {
	poolCfg, parseErr := pgxpool.ParseConfig(dsn)
	if parseErr != nil {
		return nil, fmt.Errorf("parse pg dsn: %w", parseErr)
	}

	pool, openErr := pgxpool.NewWithConfig(ctx, poolCfg)
	if openErr != nil {
		return nil, fmt.Errorf("open pg pool: %w", openErr)
	}

	pingErr := pool.Ping(ctx)
	if pingErr == nil {
		return pool, nil
	}
	// Do not hand back a pool that cannot reach the server.
	pool.Close()
	return nil, fmt.Errorf("ping pg: %w", pingErr)
}

View File

@@ -0,0 +1,70 @@
package repo
import (
"context"
"encoding/json"
"errors"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/dbiz/cdp/data-layer/api/internal/apperr"
"github.com/dbiz/cdp/data-layer/api/internal/model"
)
// ProfileRepo reads the unified-profile table owned by cdp-ingestion.
//
// Assumed schema (TODO: align with cdp-ingestion once that migration lands):
//
//	profiles (
//	    id            UUID,
//	    workspace_id  UUID,
//	    user_id       TEXT,
//	    anonymous_ids TEXT[],
//	    traits        JSONB,
//	    first_seen_at TIMESTAMPTZ,
//	    last_seen_at  TIMESTAMPTZ
//	)
type ProfileRepo struct {
// pg is the PostgreSQL pool all profile lookups run on.
pg *pgxpool.Pool
}
// NewProfileRepo builds a ProfileRepo backed by the given PostgreSQL pool.
func NewProfileRepo(pg *pgxpool.Pool) *ProfileRepo {
	repo := new(ProfileRepo)
	repo.pg = pg
	return repo
}
// selectProfileByID fetches one profile row by workspace id ($1) and profile
// id ($2). The column order must stay in sync with the Scan call in GetByID.
const selectProfileByID = `
SELECT id, workspace_id, user_id, anonymous_ids, traits, first_seen_at, last_seen_at
FROM profiles
WHERE workspace_id = $1 AND id = $2
`
// GetByID loads the profile with the given id inside workspaceID.
//
// The traits column is read as raw bytes and JSON-decoded into the profile's
// Traits only when it is non-empty. A missing row yields apperr.NotFound; any
// other scan failure or a traits decode failure yields apperr.Internal.
func (r *ProfileRepo) GetByID(ctx context.Context, workspaceID, profileID string) (*model.Profile, error) {
	var (
		profile   model.Profile
		rawTraits []byte
	)

	err := r.pg.QueryRow(ctx, selectProfileByID, workspaceID, profileID).Scan(
		&profile.ID,
		&profile.WorkspaceID,
		&profile.UserID,
		&profile.AnonymousIDs,
		&rawTraits,
		&profile.FirstSeenAt,
		&profile.LastSeenAt,
	)
	switch {
	case err == nil:
		// Row found; continue with trait decoding below.
	case errors.Is(err, pgx.ErrNoRows):
		return nil, apperr.NotFound("profile not found")
	default:
		return nil, apperr.Internal(err)
	}

	if len(rawTraits) == 0 {
		return &profile, nil
	}
	if decodeErr := json.Unmarshal(rawTraits, &profile.Traits); decodeErr != nil {
		return nil, apperr.Internal(decodeErr)
	}
	return &profile, nil
}
// GetUserIDForProfile resolves a profile UUID back to its primary user_id so
// the timeline query can target ClickHouse events on that key.
//
// A missing row yields apperr.NotFound; any other failure yields
// apperr.Internal.
//
// NOTE(review): user_id is scanned into a plain string; if the column can be
// NULL (e.g. anonymous-only profiles) the scan presumably fails and surfaces
// as an internal error -- confirm against the cdp-ingestion schema.
func (r *ProfileRepo) GetUserIDForProfile(ctx context.Context, workspaceID, profileID string) (string, error) {
	const query = `SELECT user_id FROM profiles WHERE workspace_id = $1 AND id = $2`

	var userID string
	err := r.pg.QueryRow(ctx, query, workspaceID, profileID).Scan(&userID)
	switch {
	case err == nil:
		return userID, nil
	case errors.Is(err, pgx.ErrNoRows):
		return "", apperr.NotFound("profile not found")
	default:
		return "", apperr.Internal(err)
	}
}

View File

@@ -0,0 +1,120 @@
package repo
import (
"context"
"encoding/json"
"errors"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/dbiz/cdp/data-layer/api/internal/apperr"
"github.com/dbiz/cdp/data-layer/api/internal/model"
)
// SavedQueryRepo provides CRUD access to the saved_queries table. Every
// statement it runs is scoped by workspace_id.
type SavedQueryRepo struct {
// pg is the PostgreSQL pool all saved-query statements run on.
pg *pgxpool.Pool
}
// NewSavedQueryRepo builds a SavedQueryRepo backed by the given PostgreSQL
// pool.
func NewSavedQueryRepo(pg *pgxpool.Pool) *SavedQueryRepo {
	repo := new(SavedQueryRepo)
	repo.pg = pg
	return repo
}
// SQL statements used by SavedQueryRepo. Every statement that returns a row
// selects the same eight columns in the same order, which must stay in sync
// with the Scan call in scanSavedQuery. owner_id is stored as NULL when the
// supplied owner is the empty string and is read back as '' when NULL.
const (
// insertSavedQuery creates a row from workspace_id ($1), owner_id ($2),
// name ($3), kind ($4) and spec ($5) and returns the stored row.
insertSavedQuery = `
INSERT INTO saved_queries (workspace_id, owner_id, name, kind, spec)
VALUES ($1, NULLIF($2, '')::uuid, $3, $4, $5)
RETURNING id, workspace_id, COALESCE(owner_id::text, '') AS owner_id, name, kind, spec, created_at, updated_at
`
// selectSavedQueries lists a workspace's ($1) saved queries, most recently
// updated first, paged by LIMIT ($2) and OFFSET ($3).
selectSavedQueries = `
SELECT id, workspace_id, COALESCE(owner_id::text, '') AS owner_id, name, kind, spec, created_at, updated_at
FROM saved_queries
WHERE workspace_id = $1
ORDER BY updated_at DESC
LIMIT $2 OFFSET $3
`
// selectSavedQuery fetches one saved query by workspace_id ($1) and id ($2).
selectSavedQuery = `
SELECT id, workspace_id, COALESCE(owner_id::text, '') AS owner_id, name, kind, spec, created_at, updated_at
FROM saved_queries
WHERE workspace_id = $1 AND id = $2
`
// updateSavedQuery replaces name ($3) and spec ($4) of the row identified by
// workspace_id ($1) and id ($2), bumps updated_at and returns the new row.
updateSavedQuery = `
UPDATE saved_queries
SET name = $3, spec = $4, updated_at = now()
WHERE workspace_id = $1 AND id = $2
RETURNING id, workspace_id, COALESCE(owner_id::text, '') AS owner_id, name, kind, spec, created_at, updated_at
`
// deleteSavedQuery removes the row identified by workspace_id ($1) and id ($2).
deleteSavedQuery = `DELETE FROM saved_queries WHERE workspace_id = $1 AND id = $2`
)
// Create inserts q into saved_queries and returns the stored row as read back
// from the database.
//
// q.Spec is JSON-encoded before the insert; an encoding failure is reported as
// apperr.BadRequest on the "spec" field. Database errors are mapped by
// scanSavedQuery.
func (r *SavedQueryRepo) Create(ctx context.Context, q model.SavedQuery) (*model.SavedQuery, error) {
	encodedSpec, marshalErr := json.Marshal(q.Spec)
	if marshalErr != nil {
		return nil, apperr.BadRequest("spec must be valid json", "spec", marshalErr)
	}

	return scanSavedQuery(
		r.pg.QueryRow(ctx, insertSavedQuery, q.WorkspaceID, q.OwnerID, q.Name, q.Kind, encodedSpec),
	)
}
// List returns one page of the workspace's saved queries, most recently
// updated first, using limit and offset for paging.
//
// The returned slice is non-nil even when the page is empty. A query failure
// or an iteration error reported by rows.Err() is returned as apperr.Internal
// with a nil slice; row decoding errors are returned as produced by
// scanSavedQuery.
func (r *SavedQueryRepo) List(ctx context.Context, workspaceID string, limit, offset int) ([]model.SavedQuery, error) {
	rows, err := r.pg.Query(ctx, selectSavedQueries, workspaceID, limit, offset)
	if err != nil {
		return nil, apperr.Internal(err)
	}
	defer rows.Close()

	out := []model.SavedQuery{}
	for rows.Next() {
		q, err := scanSavedQuery(rows)
		if err != nil {
			return nil, err
		}
		out = append(out, *q)
	}
	// An iteration error means the page may be incomplete: report it the same
	// way as every other database failure instead of returning partial data.
	if err := rows.Err(); err != nil {
		return nil, apperr.Internal(err)
	}
	return out, nil
}
// Get loads the saved query with the given id inside workspaceID. Errors,
// including the not-found case, are mapped by scanSavedQuery.
func (r *SavedQueryRepo) Get(ctx context.Context, workspaceID, id string) (*model.SavedQuery, error) {
	return scanSavedQuery(r.pg.QueryRow(ctx, selectSavedQuery, workspaceID, id))
}
// Update replaces the name and spec of the saved query identified by
// workspaceID and id and returns the updated row.
//
// spec is JSON-encoded before the update; an encoding failure is reported as
// apperr.BadRequest on the "spec" field. Database errors, including the
// not-found case, are mapped by scanSavedQuery.
func (r *SavedQueryRepo) Update(ctx context.Context, workspaceID, id, name string, spec map[string]any) (*model.SavedQuery, error) {
	encodedSpec, marshalErr := json.Marshal(spec)
	if marshalErr != nil {
		return nil, apperr.BadRequest("spec must be valid json", "spec", marshalErr)
	}

	return scanSavedQuery(
		r.pg.QueryRow(ctx, updateSavedQuery, workspaceID, id, name, encodedSpec),
	)
}
// Delete removes the saved query identified by workspaceID and id.
//
// It returns apperr.Internal when the statement fails and apperr.NotFound when
// no row was affected.
func (r *SavedQueryRepo) Delete(ctx context.Context, workspaceID, id string) error {
	tag, err := r.pg.Exec(ctx, deleteSavedQuery, workspaceID, id)
	switch {
	case err != nil:
		return apperr.Internal(err)
	case tag.RowsAffected() == 0:
		return apperr.NotFound("saved query not found")
	default:
		return nil
	}
}
// scanner is the single Scan method that pgx.Row and pgx.Rows share, so
// scanSavedQuery can decode a row from either of them.
type scanner interface {
Scan(dest ...any) error
}
// scanSavedQuery decodes one saved_queries row from s into a model.SavedQuery.
//
// The spec column is read as raw bytes and JSON-decoded only when it is
// non-empty. pgx.ErrNoRows is mapped to apperr.NotFound; any other scan
// failure or a spec decode failure is mapped to apperr.Internal.
func scanSavedQuery(s scanner) (*model.SavedQuery, error) {
	var (
		saved   model.SavedQuery
		rawSpec []byte
	)

	err := s.Scan(
		&saved.ID,
		&saved.WorkspaceID,
		&saved.OwnerID,
		&saved.Name,
		&saved.Kind,
		&rawSpec,
		&saved.CreatedAt,
		&saved.UpdatedAt,
	)
	switch {
	case err == nil:
		// Row decoded; continue with the spec below.
	case errors.Is(err, pgx.ErrNoRows):
		return nil, apperr.NotFound("saved query not found")
	default:
		return nil, apperr.Internal(err)
	}

	if len(rawSpec) == 0 {
		return &saved, nil
	}
	if decodeErr := json.Unmarshal(rawSpec, &saved.Spec); decodeErr != nil {
		return nil, apperr.Internal(decodeErr)
	}
	return &saved, nil
}