This commit is contained in:
2026-05-25 11:00:13 +07:00
parent c5e980aa52
commit 81ba67f346
12 changed files with 1534 additions and 77 deletions

View File

@@ -2,19 +2,25 @@ package repo
import (
"context"
"errors"
"fmt"
"sync"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
)
// SchemaRepo records the data type observed for each (workspace, event_type, field)
// triple. The bulker / analytics layer uses this to detect type conflicts.
// triple. The ingest hot path calls GetType per field to detect type conflicts,
// so we wrap PG with an in-memory cache. Cache misses fall through to PG; the
// resolved type (including the "not seen yet" empty string) is memoised.
//
// In the ingest hot path we only *check* for conflict via UpsertField; the
// rebuild of the cached map is left to a background loader. We do not block
// the request waiting for upsert -- it is fire-and-forget.
// Cache invalidation: UpsertField writes through, so the writer also refreshes.
// Other ingest instances are eventually consistent -- a tier-1 PG conflict will
// surface on the next request that re-fetches. Acceptable for an append-only
// schema registry.
type SchemaRepo interface {
// GetType returns the recorded type, or "" if the field has never been seen.
GetType(ctx context.Context, workspaceID, eventType, field string) (string, error)
@@ -23,26 +29,71 @@ type SchemaRepo interface {
}
// schemaRepo is the SchemaRepo implementation returned by NewSchemaRepo: a
// pgx connection pool with a per-process in-memory cache in front of it.
type schemaRepo struct {
	db    *pgxpool.Pool // pool used for schema_fields queries
	cache *schemaCache  // memoised field types; filled by GetType and UpsertField
}
// NewSchemaRepo returns a SchemaRepo that queries through db and starts with
// an empty in-memory cache.
func NewSchemaRepo(db *pgxpool.Pool) SchemaRepo {
	// The cache must always be initialised here: GetType consults r.cache
	// before it touches PG, so a nil cache would panic on the first lookup.
	return &schemaRepo{
		db:    db,
		cache: newSchemaCache(),
	}
}
// ---------------------------------------------------------------------------
// cache
// ---------------------------------------------------------------------------
// schemaKey identifies one (workspace, event_type, field) triple.
//
// A struct key keeps the three components separate. A joined string such as
// ws + "|" + et + "|" + field is ambiguous when a component itself contains
// the separator: ("a|b", "c") and ("a", "b|c") would share one cache slot.
type schemaKey struct {
	ws, et, field string
}

// schemaCache is a mutex-guarded map from schemaKey to the recorded data type.
//
// NOTE(review): nothing in this type evicts or expires entries, so the map
// grows with the number of distinct triples looked up or stored.
type schemaCache struct {
	mu sync.RWMutex
	// "" means "looked up, never seen" -- distinct from "absent from cache".
	data map[schemaKey]string
}

// newSchemaCache returns an empty cache with room pre-allocated for 256 entries.
func newSchemaCache() *schemaCache {
	return &schemaCache{data: make(map[schemaKey]string, 256)}
}

// key builds the map key for a (workspace, event_type, field) triple.
func (c *schemaCache) key(ws, et, field string) schemaKey {
	return schemaKey{ws: ws, et: et, field: field}
}

// get returns the cached type for the triple. The boolean reports whether the
// triple is present at all; a present entry may hold "" (negative result).
func (c *schemaCache) get(ws, et, field string) (string, bool) {
	c.mu.RLock()
	defer c.mu.RUnlock()
	v, ok := c.data[c.key(ws, et, field)]
	return v, ok
}

// set stores dataType for the triple, overwriting any previous entry.
func (c *schemaCache) set(ws, et, field, dataType string) {
	c.mu.Lock()
	defer c.mu.Unlock()
	c.data[c.key(ws, et, field)] = dataType
}
// ---------------------------------------------------------------------------
// repo methods
// ---------------------------------------------------------------------------
// GetType returns the data type recorded for (workspaceID, eventType, field),
// or "" with a nil error when the field has never been seen.
//
// Lookups are answered from the in-memory cache when the triple is present.
// On a miss the value is read from schema_fields and memoised, including the
// "no row" result. Any other query error is wrapped with apperr.Internal and
// is not cached, so the next call retries PG.
//
// NOTE(review): the cache shown in this file never expires entries, so a
// memoised "" appears to persist until this process stores a type for the
// same triple -- confirm that is acceptable for multi-instance deployments.
func (r *schemaRepo) GetType(ctx context.Context, workspaceID, eventType, field string) (string, error) {
	if v, ok := r.cache.get(workspaceID, eventType, field); ok {
		return v, nil
	}

	const q = `
SELECT data_type FROM schema_fields
WHERE workspace_id = $1::uuid AND event_type = $2 AND field = $3`

	var t string
	err := r.db.QueryRow(ctx, q, workspaceID, eventType, field).Scan(&t)
	switch {
	case errors.Is(err, pgx.ErrNoRows):
		// Negative cache: avoid hammering PG for fields that don't exist yet.
		// errors.Is is the only "no rows" check; matching on the error text
		// is brittle and would skip this memoisation.
		r.cache.set(workspaceID, eventType, field, "")
		return "", nil
	case err != nil:
		return "", apperr.Internal(fmt.Errorf("schema get: %w", err))
	}

	r.cache.set(workspaceID, eventType, field, t)
	return t, nil
}
@@ -53,9 +104,10 @@ func (r *schemaRepo) UpsertField(ctx context.Context, workspaceID, eventType, fi
ON CONFLICT (workspace_id, event_type, field) DO UPDATE
SET last_seen_at = now(),
sample_count = schema_fields.sample_count + 1`
_, err := r.db.Exec(ctx, q, workspaceID, eventType, field, dataType)
if err != nil {
if _, err := r.db.Exec(ctx, q, workspaceID, eventType, field, dataType); err != nil {
return apperr.Internal(fmt.Errorf("schema upsert: %w", err))
}
// Write-through: keep the local cache consistent with what we just stored.
r.cache.set(workspaceID, eventType, field, dataType)
return nil
}