testable

2026-05-25 11:00:13 +07:00
parent c5e980aa52
commit 81ba67f346
12 changed files with 1534 additions and 77 deletions
--- a/ingestion/ingest/internal/repo/schema_repo.go
+++ b/ingestion/ingest/internal/repo/schema_repo.go
@@ -2,19 +2,25 @@ package repo

 import (
 	"context"
+	"errors"
 	"fmt"
+	"sync"

+	"github.com/jackc/pgx/v5"
 	"github.com/jackc/pgx/v5/pgxpool"

 	"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
 )

 // SchemaRepo records the data type observed for each (workspace, event_type, field)
-// triple. The bulker / analytics layer uses this to detect type conflicts.
+// triple. The ingest hot path calls GetType per field to detect type conflicts,
+// so we wrap PG with an in-memory cache. Cache misses fall through to PG; the
+// resolved type (including the "not seen yet" empty string) is memoised.
 //
-// In the ingest hot path we only *check* for conflict via UpsertField; the
-// rebuild of the cached map is left to a background loader. We do not block
-// the request waiting for upsert -- it is fire-and-forget.
+// Cache invalidation: UpsertField writes through, so the writing instance sees
+// its own update immediately. Other ingest instances converge only eventually:
+// a tier-1 PG conflict surfaces on the next request that re-fetches from PG, i.e.
+// for keys not yet cached locally. Acceptable for an append-only schema registry.
 type SchemaRepo interface {
 	// GetType returns the recorded type, or "" if the field has never been seen.
 	GetType(ctx context.Context, workspaceID, eventType, field string) (string, error)
@@ -23,26 +29,71 @@ type SchemaRepo interface {
 }

 type schemaRepo struct {
-	db *pgxpool.Pool
+	db    *pgxpool.Pool // PG pool used for the schema_fields queries
+	cache *schemaCache  // in-process memo consulted by GetType before PG
 }

 func NewSchemaRepo(db *pgxpool.Pool) SchemaRepo {
-	return &schemaRepo{db: db}
+	return &schemaRepo{
+		db:    db,
+		cache: newSchemaCache(), // starts empty; GetType and UpsertField fill it
+	}
 }

+// ---------------------------------------------------------------------------
+// cache
+// ---------------------------------------------------------------------------
+
+type schemaCache struct {
+	mu sync.RWMutex // guards data: get takes the read lock, set the write lock
+	// "" means "looked up, never seen" -- distinct from "absent from cache".
+	data map[string]string
+}
+
+func newSchemaCache() *schemaCache {
+	return &schemaCache{data: make(map[string]string, 256)} // 256 is only an initial size hint; the map grows as needed
+}
+
+func (c *schemaCache) key(ws, et, field string) string {
+	return ws + "\x00" + et + "\x00" + field // NUL, unlike "|", cannot occur in a PG text value, so distinct triples never share a key
+}
+
+func (c *schemaCache) get(ws, et, field string) (string, bool) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	v, ok := c.data[c.key(ws, et, field)] // ok reports presence; v may be "" for a cached "never seen"
+	return v, ok
+}
+
+func (c *schemaCache) set(ws, et, field, dataType string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.data[c.key(ws, et, field)] = dataType // overwrites any earlier entry, including a cached ""
+}
+
+// ---------------------------------------------------------------------------
+// repo methods
+// ---------------------------------------------------------------------------
+
 func (r *schemaRepo) GetType(ctx context.Context, workspaceID, eventType, field string) (string, error) {
+	if v, ok := r.cache.get(workspaceID, eventType, field); ok {
+		return v, nil
+	}
+
 	const q = `
        SELECT data_type FROM schema_fields
        WHERE workspace_id = $1::uuid AND event_type = $2 AND field = $3`
 	var t string
 	err := r.db.QueryRow(ctx, q, workspaceID, eventType, field).Scan(&t)
+	if errors.Is(err, pgx.ErrNoRows) {
+		// Negative cache: skip PG for unseen fields. NOTE(review): also hides rows other instances insert later.
+		r.cache.set(workspaceID, eventType, field, "")
+		return "", nil
+	}
 	if err != nil {
-		// pgx.ErrNoRows → return "" with nil error so caller treats as new field
-		if err.Error() == "no rows in result set" {
-			return "", nil
-		}
 		return "", apperr.Internal(fmt.Errorf("schema get: %w", err))
 	}
+	r.cache.set(workspaceID, eventType, field, t)
 	return t, nil
 }

@@ -53,9 +104,10 @@ func (r *schemaRepo) UpsertField(ctx context.Context, workspaceID, eventType, fi
        ON CONFLICT (workspace_id, event_type, field) DO UPDATE
        SET last_seen_at = now(),
            sample_count = schema_fields.sample_count + 1`
-	_, err := r.db.Exec(ctx, q, workspaceID, eventType, field, dataType)
-	if err != nil {
+	if _, err := r.db.Exec(ctx, q, workspaceID, eventType, field, dataType); err != nil {
 		return apperr.Internal(fmt.Errorf("schema upsert: %w", err))
 	}
+	// Write-through. NOTE(review): on conflict the row keeps its old data_type, so this may cache a type PG does not hold.
+	r.cache.set(workspaceID, eventType, field, dataType)
 	return nil
 }