package repo import ( "context" "errors" "fmt" "sync" "github.com/jackc/pgx/v5" "github.com/jackc/pgx/v5/pgxpool" "github.com/dbiz/cdp/ingestion/ingest/internal/apperr" ) // SchemaRepo records the data type observed for each (workspace, event_type, field) // triple. The ingest hot path calls GetType per field to detect type conflicts, // so we wrap PG with an in-memory cache. Cache misses fall through to PG; the // resolved type (including the "not seen yet" empty string) is memoised. // // Cache invalidation: UpsertField writes through, so the writer also refreshes. // Other ingest instances are eventually consistent -- a tier-1 PG conflict will // surface on the next request that re-fetches. Acceptable for an append-only // schema registry. type SchemaRepo interface { // GetType returns the recorded type, or "" if the field has never been seen. GetType(ctx context.Context, workspaceID, eventType, field string) (string, error) // UpsertField records a new (or re-confirmed) field type. UpsertField(ctx context.Context, workspaceID, eventType, field, dataType string) error } type schemaRepo struct { db *pgxpool.Pool cache *schemaCache } func NewSchemaRepo(db *pgxpool.Pool) SchemaRepo { return &schemaRepo{ db: db, cache: newSchemaCache(), } } // --------------------------------------------------------------------------- // cache // --------------------------------------------------------------------------- type schemaCache struct { mu sync.RWMutex // "" means "looked up, never seen" -- distinct from "absent from cache". data map[string]string } func newSchemaCache() *schemaCache { return &schemaCache{data: make(map[string]string, 256)} } func (c *schemaCache) key(ws, et, field string) string { return ws + "|" + et + "|" + field } func (c *schemaCache) get(ws, et, field string) (string, bool) { c.mu.RLock() defer c.mu.RUnlock() v, ok := c.data[c.key(ws, et, field)] return v, ok } func (c *schemaCache) set(ws, et, field, dataType string) { c.mu.Lock() defer c.mu.Unlock() c.data[c.key(ws, et, field)] = dataType } // --------------------------------------------------------------------------- // repo methods // --------------------------------------------------------------------------- func (r *schemaRepo) GetType(ctx context.Context, workspaceID, eventType, field string) (string, error) { if v, ok := r.cache.get(workspaceID, eventType, field); ok { return v, nil } const q = ` SELECT data_type FROM schema_fields WHERE workspace_id = $1::uuid AND event_type = $2 AND field = $3` var t string err := r.db.QueryRow(ctx, q, workspaceID, eventType, field).Scan(&t) if errors.Is(err, pgx.ErrNoRows) { // negative cache: avoid hammering PG for fields that don't exist yet. r.cache.set(workspaceID, eventType, field, "") return "", nil } if err != nil { return "", apperr.Internal(fmt.Errorf("schema get: %w", err)) } r.cache.set(workspaceID, eventType, field, t) return t, nil } func (r *schemaRepo) UpsertField(ctx context.Context, workspaceID, eventType, field, dataType string) error { const q = ` INSERT INTO schema_fields (workspace_id, event_type, field, data_type) VALUES ($1::uuid, $2, $3, $4) ON CONFLICT (workspace_id, event_type, field) DO UPDATE SET last_seen_at = now(), sample_count = schema_fields.sample_count + 1` if _, err := r.db.Exec(ctx, q, workspaceID, eventType, field, dataType); err != nil { return apperr.Internal(fmt.Errorf("schema upsert: %w", err)) } // Write-through: keep the local cache consistent with what we just stored. r.cache.set(workspaceID, eventType, field, dataType) return nil }