init ingestion

This commit is contained in:
2026-05-24 22:59:24 +07:00
commit 4e8c11d545
80 changed files with 5639 additions and 0 deletions

View File

@@ -0,0 +1,79 @@
// Package apperr defines AppError, the single error type returned by every
// service/repo function. Handlers translate AppError into HTTP responses.
package apperr
import (
"errors"
"fmt"
"net/http"
)
// AppError is the structured error carried between layers. It bundles the
// HTTP status and client-safe message with the underlying cause, which is
// kept separately for logging.
type AppError struct {
	Code       int    // HTTP status to return
	Message    string // user-facing message (safe to expose)
	Field      string // optional: which field caused the error
	RetryAfter int    // seconds, for 429
	Err        error  // original error for logging (never exposed)
}

// Error implements the error interface. When a cause is attached the result
// is "<Message>: <cause>"; otherwise it is just Message. Because the cause is
// included, this string is meant for logs, not for response bodies.
func (e *AppError) Error() string {
	if e.Err == nil {
		return e.Message
	}
	return fmt.Sprintf("%s: %v", e.Message, e.Err)
}

// Unwrap exposes the underlying cause so errors.Is / errors.As can walk the
// chain. It returns nil when no cause was recorded.
func (e *AppError) Unwrap() error {
	return e.Err
}
// As looks for an *AppError anywhere in err's chain. It returns the first
// match and true, or (nil, false) when err is nil or wraps no *AppError.
func As(err error) (*AppError, bool) {
	var target *AppError
	if found := errors.As(err, &target); !found {
		return nil, false
	}
	return target, true
}
// ---------------------------------------------------------------------------
// Constructors
// ---------------------------------------------------------------------------
// BadRequest builds a 400 Bad Request AppError. field names the offending
// input (may be empty) and err is the underlying cause stored in Err.
func BadRequest(msg, field string, err error) *AppError {
return &AppError{Code: http.StatusBadRequest, Message: msg, Field: field, Err: err}
}
// Unauthorized builds a 401 Unauthorized AppError with no underlying cause.
func Unauthorized(msg string) *AppError {
return &AppError{Code: http.StatusUnauthorized, Message: msg}
}
// Forbidden builds a 403 Forbidden AppError with no underlying cause.
func Forbidden(msg string) *AppError {
return &AppError{Code: http.StatusForbidden, Message: msg}
}
// NotFound builds a 404 Not Found AppError with no underlying cause.
func NotFound(msg string) *AppError {
return &AppError{Code: http.StatusNotFound, Message: msg}
}
// Conflict builds a 409 Conflict AppError; err is the underlying cause
// stored in Err.
func Conflict(msg string, err error) *AppError {
return &AppError{Code: http.StatusConflict, Message: msg, Err: err}
}
// PayloadTooLarge builds a 413 Request Entity Too Large AppError with no
// underlying cause.
func PayloadTooLarge(msg string) *AppError {
return &AppError{Code: http.StatusRequestEntityTooLarge, Message: msg}
}
// UnprocessableEntity builds a 422 Unprocessable Entity AppError with no
// underlying cause.
func UnprocessableEntity(msg string) *AppError {
return &AppError{Code: http.StatusUnprocessableEntity, Message: msg}
}
// TooManyRequests builds a 429 Too Many Requests AppError with the fixed
// message "rate limit exceeded". retryAfterSeconds is stored in RetryAfter.
func TooManyRequests(retryAfterSeconds int) *AppError {
return &AppError{
Code: http.StatusTooManyRequests,
Message: "rate limit exceeded",
RetryAfter: retryAfterSeconds,
}
}
// Internal builds a 500 Internal Server Error AppError. The client-facing
// message is always the generic "internal server error"; err is kept in Err.
func Internal(err error) *AppError {
return &AppError{Code: http.StatusInternalServerError, Message: "internal server error", Err: err}
}

View File

@@ -0,0 +1,41 @@
// Package config loads runtime configuration from environment variables.
//
// All env vars are prefixed with INGEST_ except shared infra ones
// (POSTGRES_DSN, REDIS_ADDR, KAFKA_BROKERS).
package config
import (
"fmt"
"time"
"github.com/caarlos0/env/v11"
)
// Config is the full runtime configuration, populated from environment
// variables according to the `env` struct tags. Every field except
// PostgresDSN (tagged `required`) has a default.
//
// NOTE(review): WriteKeyCacheTTL and ShutdownTimeout are time.Duration fields
// whose defaults are duration strings ("45s", "30s") even though their env
// names end in _SECONDS. Presumably a bare integer such as "45" would not
// parse as a duration -- confirm against the env library and consider
// renaming the variables.
type Config struct {
HTTPAddr string `env:"INGEST_HTTP_ADDR" envDefault:":3049"`
LogLevel string `env:"INGEST_LOG_LEVEL" envDefault:"info"`
PayloadLimitKB int `env:"INGEST_PAYLOAD_LIMIT_KB" envDefault:"100"`
BatchLimitKB int `env:"INGEST_BATCH_LIMIT_KB" envDefault:"4000"`
LateEventHours int `env:"INGEST_LATE_EVENT_HOURS" envDefault:"24"`
DedupTTLHours int `env:"INGEST_DEDUP_TTL_HOURS" envDefault:"24"`
WriteKeyCacheTTL time.Duration `env:"INGEST_WRITE_KEY_CACHE_TTL_SECONDS" envDefault:"45s"`
ShutdownTimeout time.Duration `env:"INGEST_SHUTDOWN_TIMEOUT_SECONDS" envDefault:"30s"`
LogPayloadOnSuccess bool `env:"INGEST_LOG_PAYLOAD_ON_SUCCESS" envDefault:"false"`
LogPayloadOnError bool `env:"INGEST_LOG_PAYLOAD_ON_ERROR" envDefault:"true"`
// Shared infrastructure settings (no INGEST_ prefix).
PostgresDSN string `env:"POSTGRES_DSN,required"`
RedisAddr string `env:"REDIS_ADDR" envDefault:"localhost:6379"`
KafkaBrokers []string `env:"KAFKA_BROKERS" envSeparator:"," envDefault:"localhost:9092"`
KafkaTopicIngest string `env:"KAFKA_TOPIC_INGEST" envDefault:"events.ingest"`
KafkaTopicDLQ string `env:"KAFKA_TOPIC_DLQ" envDefault:"events.dlq"`
KafkaTopicRetry string `env:"KAFKA_TOPIC_RETRY" envDefault:"events.retry"`
}
// Load reads the process environment into a fresh Config. Any parse failure
// (including a missing required variable) is returned wrapped with the
// "config load" prefix, and no Config is returned in that case.
func Load() (*Config, error) {
	var cfg Config
	err := env.Parse(&cfg)
	if err != nil {
		return nil, fmt.Errorf("config load: %w", err)
	}
	return &cfg, nil
}

View File

@@ -0,0 +1,50 @@
// Package dedup provides idempotent event acceptance via Redis SETNX.
//
// Key shape: dedup:{workspace_id}:{message_id}
// TTL: 24h by default (configurable)
//
// CheckAndSet returns true when the message_id is new (first time seen).
// If it returns false the caller MUST drop the event silently and return 200.
package dedup
import (
"context"
"fmt"
"time"
"github.com/redis/rueidis"
)
// Dedup decides whether an event has been seen before.
type Dedup interface {
// CheckAndSet records (workspaceID, messageID) and reports true when the
// pair was not previously recorded, false when it is a duplicate.
CheckAndSet(ctx context.Context, workspaceID, messageID string) (bool, error)
}
// redisDedup is the Redis-backed Dedup implementation.
type redisDedup struct {
client rueidis.Client // Redis connection used for the SET NX command
ttl time.Duration // expiry applied to every dedup key
}
// New returns a Dedup backed by client whose keys expire after ttl.
func New(client rueidis.Client, ttl time.Duration) Dedup {
return &redisDedup{client: client, ttl: ttl}
}
// key builds the Redis key for a (workspace, message) pair in the shape
// dedup:{workspace_id}:{message_id}.
func key(workspaceID, messageID string) string {
	// All operands are strings, so fmt.Sprint concatenates them without
	// inserting spaces.
	return fmt.Sprint("dedup:", workspaceID, ":", messageID)
}
// CheckAndSet atomically claims the dedup key for (workspaceID, messageID)
// using SET ... NX EX ttl.
//
// It returns (true, nil) when the key was created (first time the message is
// seen), (false, nil) when the key already existed (duplicate), and
// (false, err) on any other Redis failure.
func (d *redisDedup) CheckAndSet(ctx context.Context, workspaceID, messageID string) (bool, error) {
	cmd := d.client.B().Set().Key(key(workspaceID, messageID)).Value("1").
		Nx().
		Ex(d.ttl).
		Build()

	// SET with NX replies "OK" when the key was set and a nil reply when it
	// already exists. rueidis surfaces a nil reply as the Redis-nil error, so
	// it has to be separated from real failures rather than treated as one.
	err := d.client.Do(ctx, cmd).Error()
	switch {
	case err == nil:
		return true, nil
	case rueidis.IsRedisNil(err):
		return false, nil
	default:
		return false, fmt.Errorf("dedup setnx: %w", err)
	}
}

View File

@@ -0,0 +1,209 @@
package handler
import (
	"bytes"
	"encoding/json"
	"errors"
	"io"
	"net/http"
	"strconv"
	"strings"

	"github.com/go-playground/validator/v10"
	"go.uber.org/zap"

	"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
	"github.com/dbiz/cdp/ingestion/ingest/internal/middleware"
	"github.com/dbiz/cdp/ingestion/ingest/internal/model"
	"github.com/dbiz/cdp/ingestion/ingest/internal/service"
)
// EventHandler holds the dependencies shared by all ingest HTTP endpoints.
type EventHandler struct {
svc *service.IngestService // performs the actual ingestion
val *validator.Validate // validates decoded request bodies via struct tags
log *zap.Logger // destination for request-error logs
}
// NewEventHandler wires an EventHandler around svc and log, creating a fresh
// validator instance for request-body validation.
func NewEventHandler(svc *service.IngestService, log *zap.Logger) *EventHandler {
	h := new(EventHandler)
	h.svc = svc
	h.val = validator.New()
	h.log = log
	return h
}
// ---------------------------------------------------------------------------
// Routes
// ---------------------------------------------------------------------------
// Single-event endpoints. They differ only in the default `type` applied
// when the body omits one, so they all funnel into one handler.

// Track ingests a single event, defaulting its type to "track".
func (h *EventHandler) Track(w http.ResponseWriter, r *http.Request) {
h.handleSingle(w, r, model.EventTypeTrack)
}
// Identify ingests a single event, defaulting its type to "identify".
func (h *EventHandler) Identify(w http.ResponseWriter, r *http.Request) {
h.handleSingle(w, r, model.EventTypeIdentify)
}
// Page ingests a single event, defaulting its type to "page".
func (h *EventHandler) Page(w http.ResponseWriter, r *http.Request) {
h.handleSingle(w, r, model.EventTypePage)
}
// Group ingests a single event, defaulting its type to "group".
func (h *EventHandler) Group(w http.ResponseWriter, r *http.Request) {
h.handleSingle(w, r, model.EventTypeGroup)
}
// Alias ingests a single event, defaulting its type to "alias".
func (h *EventHandler) Alias(w http.ResponseWriter, r *http.Request) {
h.handleSingle(w, r, model.EventTypeAlias)
}
// Screen ingests a single event, defaulting its type to "screen".
func (h *EventHandler) Screen(w http.ResponseWriter, r *http.Request) {
h.handleSingle(w, r, model.EventTypeScreen)
}
// Batch ingests a batch envelope and replies 200 with one result entry per
// submitted event.
//
// Whole-request failures are reported through writeErr: 413 when the body
// exceeded the configured limit, 400 when the body cannot be read, is not
// valid JSON, or fails validation, and 500 when the service returns a result
// set that does not line up with the batch.
func (h *EventHandler) Batch(w http.ResponseWriter, r *http.Request) {
	body, err := io.ReadAll(r.Body)
	if err != nil {
		// Only a tripped http.MaxBytesReader means "too large"; any other
		// read failure (e.g. the client disconnected) is a bad request.
		var tooLarge *http.MaxBytesError
		if errors.As(err, &tooLarge) {
			h.writeErr(w, apperr.PayloadTooLarge("payload too large"))
			return
		}
		h.writeErr(w, apperr.BadRequest("unable to read request body", "", err))
		return
	}

	var env model.BatchEnvelope
	if err := json.Unmarshal(body, &env); err != nil {
		h.writeErr(w, apperr.BadRequest("invalid json body", "", err))
		return
	}
	if err := h.val.Struct(&env); err != nil {
		h.writeErr(w, apperr.BadRequest("validation failed", firstField(err), err))
		return
	}

	ictx := h.ingestCtx(r, body)
	results := h.svc.IngestBatch(r.Context(), ictx, env.Batch)
	// results is indexed in lockstep with env.Batch below; refuse to guess
	// (or panic on an out-of-range index) if the service breaks that contract.
	if len(results) != len(env.Batch) {
		h.writeErr(w, apperr.Internal(errors.New("ingest batch: result count does not match batch size")))
		return
	}

	// Per-event status -- 200 OK, with an array of {messageId, ok, error, field}.
	type item struct {
		MessageID string `json:"messageId"`
		OK        bool   `json:"ok"`
		Error     string `json:"error,omitempty"`
		Field     string `json:"field,omitempty"`
	}
	out := make([]item, len(env.Batch))
	for i := range env.Batch {
		it := item{MessageID: env.Batch[i].MessageID, OK: true}
		if results[i] != nil {
			it.OK = false
			if ae, ok := apperr.As(results[i]); ok {
				it.Error = ae.Message
				it.Field = ae.Field
			} else {
				// Never leak the text of an unclassified error to the client.
				it.Error = "internal error"
			}
		}
		out[i] = it
	}
	writeJSON(w, http.StatusOK, map[string]any{"results": out})
}
// ---------------------------------------------------------------------------
// Health / Ready
// ---------------------------------------------------------------------------
// Health always replies 200 with {"status":"ok"}; it checks no dependencies.
func (h *EventHandler) Health(w http.ResponseWriter, r *http.Request) {
writeJSON(w, http.StatusOK, map[string]string{"status": "ok"})
}
// Ready always replies 200 with {"status":"ready"}. No dependency is probed
// yet, so this currently signals only that the process is serving HTTP.
func (h *EventHandler) Ready(w http.ResponseWriter, r *http.Request) {
// Liveness is enough for k8s readiness in this scaffold. Wire in real
// dependency checks (PG ping, Kafka ping) when needed.
writeJSON(w, http.StatusOK, map[string]string{"status": "ready"})
}
// ---------------------------------------------------------------------------
// shared helpers
// ---------------------------------------------------------------------------
// handleSingle is the shared implementation behind every single-event route.
//
// It reads and decodes the body into a RawEvent, applies t as the event type
// when the body does not specify one, validates the event and hands it to the
// ingest service. Success replies 200 {"ok":true}; failures go through
// writeErr (413 for an oversized body, 400 for unreadable / malformed /
// invalid bodies, otherwise whatever the service returned).
func (h *EventHandler) handleSingle(w http.ResponseWriter, r *http.Request, t model.EventType) {
	body, err := io.ReadAll(r.Body)
	if err != nil {
		// Only a tripped http.MaxBytesReader means "too large"; any other
		// read failure (e.g. the client disconnected) is a bad request.
		var tooLarge *http.MaxBytesError
		if errors.As(err, &tooLarge) {
			h.writeErr(w, apperr.PayloadTooLarge("payload too large"))
			return
		}
		h.writeErr(w, apperr.BadRequest("unable to read request body", "", err))
		return
	}

	var raw model.RawEvent
	if err := json.NewDecoder(bytes.NewReader(body)).Decode(&raw); err != nil {
		h.writeErr(w, apperr.BadRequest("invalid json body", "", err))
		return
	}
	// The route supplies only a default: a type present in the body is kept.
	// NOTE(review): confirm whether the route should override the body type.
	if raw.Type == "" {
		raw.Type = t
	}
	if err := h.val.Struct(&raw); err != nil {
		h.writeErr(w, apperr.BadRequest("validation failed", firstField(err), err))
		return
	}

	ictx := h.ingestCtx(r, body)
	if err := h.svc.Ingest(r.Context(), ictx, &raw); err != nil {
		h.writeErr(w, err)
		return
	}
	writeJSON(w, http.StatusOK, map[string]bool{"ok": true})
}
// ingestCtx assembles the per-request metadata handed to the ingest service:
// workspace and source from the write key stored in the request context, the
// client IP, the User-Agent header and the raw request body.
//
// NOTE(review): WriteKeyFromCtx returns nil when no write key is stored in
// the context, in which case this dereference panics -- presumably every
// route using this handler sits behind the Auth middleware; confirm.
func (h *EventHandler) ingestCtx(r *http.Request, body []byte) service.IngestContext {
	writeKey := middleware.WriteKeyFromCtx(r.Context())

	var ictx service.IngestContext
	ictx.WorkspaceID = writeKey.WorkspaceID
	ictx.SourceID = writeKey.SourceID
	ictx.IP = clientIP(r)
	ictx.UserAgent = r.UserAgent()
	ictx.RawBody = body
	return ictx
}
// writeErr renders err as a JSON error response.
//
// An *AppError (anywhere in the chain) decides the status code, message and
// field, sets Retry-After when RetryAfter is positive, and is logged at warn
// level only when it carries an underlying cause. Anything else is logged at
// error level and answered with a generic 500.
func (h *EventHandler) writeErr(w http.ResponseWriter, err error) {
	appErr, isAppErr := apperr.As(err)
	if !isAppErr {
		h.log.Error("unhandled error", zap.Error(err))
		writeJSON(w, http.StatusInternalServerError, errorResponse{Error: "internal server error"})
		return
	}

	if appErr.RetryAfter > 0 {
		w.Header().Set("Retry-After", strconv.Itoa(appErr.RetryAfter))
	}
	writeJSON(w, appErr.Code, errorResponse{Error: appErr.Message, Field: appErr.Field})

	if appErr.Err == nil {
		return
	}
	h.log.Warn("request error",
		zap.Int("code", appErr.Code),
		zap.String("msg", appErr.Message),
		zap.Error(appErr.Err))
}
// errorResponse is the JSON body sent for failed requests. Field is omitted
// from the output when empty.
type errorResponse struct {
Error string `json:"error"`
Field string `json:"field,omitempty"`
}
// writeJSON sends body as a JSON response with the given status code and an
// application/json Content-Type.
func writeJSON(w http.ResponseWriter, status int, body any) {
	headers := w.Header()
	headers.Set("Content-Type", "application/json")
	w.WriteHeader(status)

	// The status line is already written, so an encode failure cannot be
	// reported to the client; it is deliberately ignored.
	encoder := json.NewEncoder(w)
	_ = encoder.Encode(body)
}
// firstField returns the field name of the first validation failure in err,
// or "" when err is not a non-empty validator.ValidationErrors.
func firstField(err error) string {
	var failures validator.ValidationErrors
	if !errors.As(err, &failures) || len(failures) == 0 {
		return ""
	}
	return failures[0].Field()
}
// clientIP duplicates middleware.clientIP -- intentionally small, no shared types.
//
// Resolution order:
//  1. the first (left-most) entry of X-Forwarded-For, whitespace-trimmed;
//  2. the X-Real-Ip header;
//  3. r.RemoteAddr as reported by net/http.
//
// X-Forwarded-For may carry a comma-separated list; returning the header
// verbatim would record the whole list as the event's IP.
func clientIP(r *http.Request) string {
	if xff := r.Header.Get("X-Forwarded-For"); xff != "" {
		if i := strings.Index(xff, ","); i >= 0 {
			return strings.TrimSpace(xff[:i])
		}
		return strings.TrimSpace(xff)
	}
	if rip := r.Header.Get("X-Real-Ip"); rip != "" {
		return rip
	}
	return r.RemoteAddr
}

View File

@@ -0,0 +1,110 @@
// Package kafka wraps franz-go for the ingest producer.
//
// Design notes:
// - We use ProduceSync only for DLQ writes (rare; correctness > latency).
// - Happy-path Produce is fire-and-forget: we return 200 OK before the
// ack lands. franz-go buffers internally and retries.
// - Partition key = anonymous_id for the happy topic so that all events
// for a single visitor land on the same partition (ordering for stitching).
package kafka
import (
"context"
"encoding/json"
"fmt"
"github.com/twmb/franz-go/pkg/kgo"
"go.uber.org/zap"
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
)
// Producer publishes ingest events and dead-letter records to Kafka through
// a single shared franz-go client.
type Producer struct {
client *kgo.Client
log *zap.Logger
topicIngest string // happy-path topic used by Produce
topicDLQ string // dead-letter topic used by ProduceDLQ
topicRetry string // stored at construction; not used by the methods visible in this file
}
// NewProducer creates the Kafka client, verifies broker connectivity with a
// ping and returns a ready Producer. On a failed ping the client is closed
// before the error is returned.
//
// NOTE(review): the ping runs on context.Background(), so no deadline is
// applied here.
func NewProducer(brokers []string, topicIngest, topicDLQ, topicRetry string, log *zap.Logger) (*Producer, error) {
	opts := []kgo.Opt{
		kgo.SeedBrokers(brokers...),
		kgo.ProducerLinger(5_000_000), // 5ms linger -> batch small bursts
		kgo.ProducerBatchCompression(kgo.ZstdCompression()),
		kgo.MaxBufferedRecords(100_000),
		kgo.RequiredAcks(kgo.LeaderAck()),
		kgo.ClientID("cdp-ingest"),
	}

	client, err := kgo.NewClient(opts...)
	if err != nil {
		return nil, fmt.Errorf("kafka client: %w", err)
	}
	if pingErr := client.Ping(context.Background()); pingErr != nil {
		client.Close()
		return nil, fmt.Errorf("kafka ping: %w", pingErr)
	}

	producer := &Producer{
		client:      client,
		log:         log,
		topicIngest: topicIngest,
		topicDLQ:    topicDLQ,
		topicRetry:  topicRetry,
	}
	return producer, nil
}
// Close closes the underlying Kafka client.
func (p *Producer) Close() {
p.client.Close()
}
// Produce sends an event to the happy-path topic. Fire-and-forget.
//
// The record is keyed by ev.PartitionKey() and carries workspace_id,
// source_id and type headers. The only error returned is a marshalling
// failure; delivery failures are reported asynchronously through the log.
func (p *Producer) Produce(ctx context.Context, ev *model.IngestedEvent) error {
	payload, err := json.Marshal(ev)
	if err != nil {
		return fmt.Errorf("marshal event: %w", err)
	}
	rec := &kgo.Record{
		Topic: p.topicIngest,
		Key:   []byte(ev.PartitionKey()),
		Value: payload,
		Headers: []kgo.RecordHeader{
			{Key: "workspace_id", Value: []byte(ev.WorkspaceID)},
			{Key: "source_id", Value: []byte(ev.SourceID)},
			{Key: "type", Value: []byte(ev.Type)},
		},
	}

	// Copy what the callback needs so it does not depend on ev staying
	// unchanged after this function returns.
	messageID := ev.MessageID

	// The caller answers its request before the ack lands, and a request
	// context is cancelled as soon as the handler returns. franz-go aborts
	// buffered records whose context is done, so delivery must be detached
	// from the caller's cancellation while still carrying its values.
	// Trade-off: when the client's record buffer is full, this call now waits
	// for space instead of giving up when the caller's context ends.
	p.client.Produce(context.WithoutCancel(ctx), rec, func(r *kgo.Record, err error) {
		if err != nil {
			p.log.Error("kafka produce failed",
				zap.String("topic", r.Topic),
				zap.String("message_id", messageID),
				zap.Error(err))
		}
	})
	return nil
}
// ProduceDLQ writes a failed event to the DLQ topic synchronously so we know
// it landed before responding to the user with the error.
//
// The record is keyed by workspaceID, carries the reason as a header, and its
// value is a JSON envelope holding the identifiers, reason, field and the raw
// payload (as a string). It returns an error when the envelope cannot be
// marshalled or the synchronous produce fails.
func (p *Producer) ProduceDLQ(ctx context.Context, workspaceID, sourceID, messageID, reason, field string, raw []byte) error {
	envelope := map[string]any{
		"workspace_id": workspaceID,
		"source_id":    sourceID,
		"message_id":   messageID,
		"reason":       reason,
		"field":        field,
		"raw_payload":  string(raw),
	}
	payload, err := json.Marshal(envelope)
	if err != nil {
		// Do not publish an empty DLQ record if the envelope cannot be encoded.
		return fmt.Errorf("marshal dlq envelope: %w", err)
	}
	rec := &kgo.Record{
		Topic: p.topicDLQ,
		Key:   []byte(workspaceID),
		Value: payload,
		Headers: []kgo.RecordHeader{
			{Key: "reason", Value: []byte(reason)},
		},
	}
	if err := p.client.ProduceSync(ctx, rec).FirstErr(); err != nil {
		return fmt.Errorf("dlq produce: %w", err)
	}
	return nil
}

View File

@@ -0,0 +1,193 @@
// Package middleware provides chi-compatible HTTP middleware: auth, logging,
// payload-limit, request-id, panic recovery, CORS.
package middleware
import (
"context"
"encoding/base64"
"net/http"
"runtime/debug"
"strings"
"time"
"github.com/google/uuid"
"go.uber.org/zap"
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
"github.com/dbiz/cdp/ingestion/ingest/internal/service"
)
// ctxKey is a private type for context keys so values stored by this package
// cannot collide with keys defined by other packages.
type ctxKey string
const (
ctxKeyRequestID ctxKey = "request_id" // request id string (set by RequestID)
ctxKeyWriteKey ctxKey = "write_key" // *model.WriteKey (set by Auth)
)
// RequestID makes sure every request carries an id: an incoming X-Request-Id
// header is reused as-is, otherwise a new UUID string is generated. The id is
// echoed in the X-Request-Id response header and stored in the request
// context for RequestIDFromCtx.
func RequestID(next http.Handler) http.Handler {
	serve := func(w http.ResponseWriter, r *http.Request) {
		reqID := r.Header.Get("X-Request-Id")
		if reqID == "" {
			reqID = uuid.NewString()
		}
		w.Header().Set("X-Request-Id", reqID)
		withID := context.WithValue(r.Context(), ctxKeyRequestID, reqID)
		next.ServeHTTP(w, r.WithContext(withID))
	}
	return http.HandlerFunc(serve)
}
// RequestIDFromCtx returns the request id stored in ctx by RequestID, or ""
// when none is present.
func RequestIDFromCtx(ctx context.Context) string {
v, _ := ctx.Value(ctxKeyRequestID).(string)
return v
}
// Recover converts a panic raised by a downstream handler into a logged
// error (panic value, request path, stack trace) and a 500 response, so the
// panic does not propagate further up the handler chain.
func Recover(log *zap.Logger) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		guarded := func(w http.ResponseWriter, r *http.Request) {
			defer func() {
				// recover must be called directly inside the deferred function.
				rec := recover()
				if rec == nil {
					return
				}
				log.Error("panic in handler",
					zap.Any("panic", rec),
					zap.String("path", r.URL.Path),
					zap.ByteString("stack", debug.Stack()))
				http.Error(w, `{"error":"internal server error"}`, http.StatusInternalServerError)
			}()
			next.ServeHTTP(w, r)
		}
		return http.HandlerFunc(guarded)
	}
}
// PayloadLimit returns middleware that wraps each request body in an
// http.MaxBytesReader capped at limitKB * 1024 bytes. The cap takes effect
// when a downstream handler reads the body past the limit.
func PayloadLimit(limitKB int) func(http.Handler) http.Handler {
	limitBytes := int64(limitKB) * 1024
	wrap := func(next http.Handler) http.Handler {
		capped := func(w http.ResponseWriter, r *http.Request) {
			r.Body = http.MaxBytesReader(w, r.Body, limitBytes)
			next.ServeHTTP(w, r)
		}
		return http.HandlerFunc(capped)
	}
	return wrap
}
// Logger emits a single info-level "http" log line after each request with
// method, path, response status, duration in milliseconds, request id and
// client IP. The status is 200 unless the handler calls WriteHeader.
//
// NOTE(review): the request id is read from the context of the request as
// this middleware received it -- presumably RequestID must be installed
// before Logger for the field to be populated; confirm the chain order.
func Logger(log *zap.Logger) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		logged := func(w http.ResponseWriter, r *http.Request) {
			began := time.Now()
			recorder := &statusRecorder{ResponseWriter: w, status: 200}

			next.ServeHTTP(recorder, r)

			elapsedMS := time.Since(began).Milliseconds()
			log.Info("http",
				zap.String("method", r.Method),
				zap.String("path", r.URL.Path),
				zap.Int("status", recorder.status),
				zap.Int64("duration_ms", elapsedMS),
				zap.String("request_id", RequestIDFromCtx(r.Context())),
				zap.String("ip", clientIP(r)))
		}
		return http.HandlerFunc(logged)
	}
}
// CORS adds permissive cross-origin headers to every response (browser SDKs
// such as the web tracker need them). OPTIONS requests are answered directly
// with 204 No Content and never reach next.
func CORS(next http.Handler) http.Handler {
	corsHeaders := [...][2]string{
		{"Access-Control-Allow-Origin", "*"},
		{"Access-Control-Allow-Methods", "GET, POST, OPTIONS"},
		{"Access-Control-Allow-Headers", "Authorization, Content-Type, X-Request-Id"},
		{"Access-Control-Max-Age", "86400"},
	}
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		for _, kv := range corsHeaders {
			w.Header().Set(kv[0], kv[1])
		}
		if r.Method != http.MethodOptions {
			next.ServeHTTP(w, r)
			return
		}
		w.WriteHeader(http.StatusNoContent)
	})
}
// Auth authenticates a request by its write key. The key is taken from the
// Authorization header -- either `Basic <base64(key:)>` (Segment-style) or
// `Bearer <key>` -- and resolved through s. On success the resolved
// *model.WriteKey is stored in the request context for WriteKeyFromCtx; on
// any failure an auth error response is written and next is not called.
func Auth(s *service.AuthService) func(http.Handler) http.Handler {
	return func(next http.Handler) http.Handler {
		authenticate := func(w http.ResponseWriter, r *http.Request) {
			plaintext, err := extractWriteKey(r)
			if err != nil {
				writeAuthError(w, err)
				return
			}

			resolved, err := s.Resolve(r.Context(), plaintext)
			if err != nil {
				writeAuthError(w, err)
				return
			}

			authed := r.WithContext(context.WithValue(r.Context(), ctxKeyWriteKey, resolved))
			next.ServeHTTP(w, authed)
		}
		return http.HandlerFunc(authenticate)
	}
}
// WriteKeyFromCtx returns the resolved key set by Auth middleware.
// It returns nil when ctx holds no *model.WriteKey under the write-key
// context key, so callers must not dereference the result blindly.
func WriteKeyFromCtx(ctx context.Context) *model.WriteKey {
v, _ := ctx.Value(ctxKeyWriteKey).(*model.WriteKey)
return v
}
// ---------------------------------------------------------------------------
// helpers
// ---------------------------------------------------------------------------
// extractWriteKey pulls the plaintext write key out of the Authorization
// header.
//
//   - "Bearer <key>": everything after the prefix is the key.
//   - "Basic <b64>": the payload is base64-decoded and the part before the
//     first colon is the key (Segment sends `key:` with no password); with no
//     colon the whole decoded value is the key.
//
// A missing header, undecodable Basic payload or any other scheme yields a
// 401 AppError. Prefix matching is case-sensitive.
func extractWriteKey(r *http.Request) (string, error) {
	const (
		bearerPrefix = "Bearer "
		basicPrefix  = "Basic "
	)
	header := r.Header.Get("Authorization")
	switch {
	case header == "":
		return "", apperr.Unauthorized("missing Authorization header")
	case strings.HasPrefix(header, bearerPrefix):
		return header[len(bearerPrefix):], nil
	case strings.HasPrefix(header, basicPrefix):
		decoded, err := base64.StdEncoding.DecodeString(header[len(basicPrefix):])
		if err != nil {
			return "", apperr.Unauthorized("invalid basic auth")
		}
		credentials := string(decoded)
		colon := strings.Index(credentials, ":")
		if colon < 0 {
			return credentials, nil
		}
		return credentials[:colon], nil
	default:
		return "", apperr.Unauthorized("unsupported auth scheme")
	}
}
// authErrEscaper escapes the characters that would otherwise terminate or
// corrupt the JSON string literal the error message is spliced into.
var authErrEscaper = strings.NewReplacer(
	`\`, `\\`,
	`"`, `\"`,
	"\n", `\n`,
	"\r", `\r`,
	"\t", `\t`,
)

// writeAuthError renders an authentication failure as a small JSON document
// {"error":"<message>"}.
//
// When err is (or wraps) an *apperr.AppError its Code and Message are used;
// any other error becomes a generic 401 "unauthorized". The body is sent with
// Content-Type application/json; http.Error would label it text/plain.
func writeAuthError(w http.ResponseWriter, err error) {
	msg, code := "unauthorized", http.StatusUnauthorized
	if ae, ok := apperr.As(err); ok {
		msg, code = ae.Message, ae.Code
	}

	hdr := w.Header()
	hdr.Set("Content-Type", "application/json")
	hdr.Set("X-Content-Type-Options", "nosniff")
	w.WriteHeader(code)
	// Nothing useful can be done if the client has already gone away.
	_, _ = w.Write([]byte(`{"error":"` + authErrEscaper.Replace(msg) + `"}` + "\n"))
}
// clientIP picks the caller's address for logging: the first entry of
// X-Forwarded-For (whitespace-trimmed) when that header is set, otherwise
// X-Real-Ip, otherwise r.RemoteAddr.
func clientIP(r *http.Request) string {
	forwarded := r.Header.Get("X-Forwarded-For")
	realIP := r.Header.Get("X-Real-Ip")
	switch {
	case forwarded != "":
		first := forwarded
		if comma := strings.Index(forwarded, ","); comma >= 0 {
			first = forwarded[:comma]
		}
		return strings.TrimSpace(first)
	case realIP != "":
		return realIP
	default:
		return r.RemoteAddr
	}
}
// statusRecorder wraps an http.ResponseWriter and remembers the status code
// passed to WriteHeader so Logger can report it. Logger initialises status to
// 200, which covers handlers that never call WriteHeader explicitly.
type statusRecorder struct {
http.ResponseWriter
status int
}
// WriteHeader records code and forwards it to the wrapped writer.
func (s *statusRecorder) WriteHeader(code int) {
s.status = code
s.ResponseWriter.WriteHeader(code)
}

View File

@@ -0,0 +1,81 @@
// Package model holds the wire and domain types passed between layers.
package model
import (
"encoding/json"
"time"
)
// EventType is the Segment-compatible call kind.
type EventType string
// The supported call kinds. These values match the `oneof` list in RawEvent's
// validation tag.
const (
EventTypeTrack EventType = "track"
EventTypeIdentify EventType = "identify"
EventTypePage EventType = "page"
EventTypeGroup EventType = "group"
EventTypeAlias EventType = "alias"
EventTypeScreen EventType = "screen"
)
// RawEvent is the parsed-but-not-yet-validated payload from a client.
// We keep Properties / Traits / Context as json.RawMessage so the handler can
// pass them through to the service untouched; flattening happens in service.
//
// The `validate` tags require Type and MessageID and cap the length of the
// string fields; the raw JSON blobs and the timestamps carry no validation
// tags.
type RawEvent struct {
Type EventType `json:"type" validate:"required,oneof=track identify page group alias screen"`
MessageID string `json:"messageId" validate:"required,max=128"`
AnonymousID string `json:"anonymousId" validate:"max=128"`
UserID string `json:"userId" validate:"max=128"`
GroupID string `json:"groupId" validate:"max=128"`
Event string `json:"event" validate:"max=255"`
Name string `json:"name" validate:"max=255"`
Category string `json:"category" validate:"max=255"`
Properties json.RawMessage `json:"properties"`
Traits json.RawMessage `json:"traits"`
Context json.RawMessage `json:"context"`
Timestamp *time.Time `json:"timestamp"`
SentAt *time.Time `json:"sentAt"`
}
// BatchEnvelope is the body of /batch — Segment-compatible.
// Batch must hold between 1 and 1000 events, and `dive` applies RawEvent's
// own validation rules to every element.
type BatchEnvelope struct {
Batch []RawEvent `json:"batch" validate:"required,min=1,max=1000,dive"`
SentAt *time.Time `json:"sentAt"`
Context json.RawMessage `json:"context"`
}
// IngestedEvent is the canonical record we push onto Kafka. Flat fields,
// timestamps already normalized, payload sanitized.
//
// JSON field names are snake_case. The optional identifiers, names, maps, IP
// and user agent are omitted from the output when empty.
type IngestedEvent struct {
WorkspaceID string `json:"workspace_id"`
SourceID string `json:"source_id"`
MessageID string `json:"message_id"`
Type EventType `json:"type"`
AnonymousID string `json:"anonymous_id,omitempty"`
UserID string `json:"user_id,omitempty"`
GroupID string `json:"group_id,omitempty"`
Event string `json:"event,omitempty"`
Name string `json:"name,omitempty"`
Category string `json:"category,omitempty"`
Properties map[string]any `json:"properties,omitempty"`
Traits map[string]any `json:"traits,omitempty"`
Context map[string]any `json:"context,omitempty"`
IP string `json:"ip,omitempty"`
UserAgent string `json:"user_agent,omitempty"`
Timestamp time.Time `json:"timestamp"`
SentAt time.Time `json:"sent_at"`
ReceivedAt time.Time `json:"received_at"`
}
// PartitionKey chooses the Kafka partitioning key for the event. The
// anonymous id is preferred so a visitor's events stay ordered for identity
// stitching; the user id is the fallback, and the message id is used when
// both are empty.
func (e *IngestedEvent) PartitionKey() string {
	switch {
	case e.AnonymousID != "":
		return e.AnonymousID
	case e.UserID != "":
		return e.UserID
	default:
		return e.MessageID
	}
}

View File

@@ -0,0 +1,19 @@
package model
import "time"
// WriteKey is the auth credential supplied via Authorization header.
// We never store the raw value — only its sha256 hash and a short prefix
// for display in the console.
type WriteKey struct {
ID string
WorkspaceID string
SourceID string
KeyPrefix string
Label string
RevokedAt *time.Time // nil while the key is active
LastUsedAt *time.Time
CreatedAt time.Time
}
// Revoked reports whether the key has a revocation timestamp.
func (k *WriteKey) Revoked() bool { return k.RevokedAt != nil }

View File

@@ -0,0 +1,102 @@
// Package ratelimit implements a Redis-backed sliding-window limiter.
//
// We use a sorted-set per workspace where the score is the unix-millisecond
// timestamp of the request. On each request we:
// 1. ZREMRANGEBYSCORE -- evict entries older than window
// 2. ZCARD -- count current
// 3. if count < limit : ZADD + EXPIRE, allow
// 4. else : compute retry-after from oldest entry, deny
//
// Steps 1-3/4 are wrapped in a Lua script for atomicity.
package ratelimit
import (
"context"
"fmt"
"strconv"
"time"
"github.com/redis/rueidis"
)
// Decision is the outcome of one rate-limit check.
type Decision struct {
Allowed bool // true when the request fits within the limit
Remaining int // requests left in the window after this one (0 when denied)
RetryAfterMS int // milliseconds to wait before retrying (0 when allowed)
}
// Limiter decides whether a workspace may perform one more request.
type Limiter interface {
// Allow checks (and on success records) one request for workspaceID
// against a budget of limit requests per window.
Allow(ctx context.Context, workspaceID string, limit int, window time.Duration) (Decision, error)
}
// redisLimiter is the Redis-backed Limiter implementation.
type redisLimiter struct {
client rueidis.Client
}
// New returns a Limiter that keeps its sliding-window state in Redis.
func New(client rueidis.Client) Limiter {
return &redisLimiter{client: client}
}
// Lua script: KEYS[1]=zset key, ARGV[1]=now_ms, ARGV[2]=window_ms,
// ARGV[3]=limit, ARGV[4]=member (unique per request).
//
// Returns: {allowed (1/0), remaining, retry_after_ms}
//
// The script evicts entries scored at or before now-window, counts what is
// left and, if under the limit, adds the member and refreshes the key's
// expiry to one window. When denied, retry_after_ms is derived from the
// oldest remaining entry (clamped at 0), falling back to the full window.
const slidingWindowLua = `
local key = KEYS[1]
local now = tonumber(ARGV[1])
local window = tonumber(ARGV[2])
local limit = tonumber(ARGV[3])
local member = ARGV[4]
local cutoff = now - window
redis.call('ZREMRANGEBYSCORE', key, 0, cutoff)
local count = tonumber(redis.call('ZCARD', key))
if count < limit then
redis.call('ZADD', key, now, member)
redis.call('PEXPIRE', key, window)
return {1, limit - count - 1, 0}
end
local oldest = redis.call('ZRANGE', key, 0, 0, 'WITHSCORES')
local retry = window
if oldest and oldest[2] then
retry = (tonumber(oldest[2]) + window) - now
if retry < 0 then retry = 0 end
end
return {0, 0, retry}
`
// Allow runs the sliding-window script for workspaceID and reports whether
// one more request fits within limit requests per window.
//
// The zset key is "rate:<workspaceID>". Scores are the current time in
// milliseconds. An error is returned when the script fails or its reply is
// not a three-element array of integers.
func (l *redisLimiter) Allow(ctx context.Context, workspaceID string, limit int, window time.Duration) (Decision, error) {
	key := "rate:" + workspaceID
	now := time.Now()

	// The member must be unique per request: ZADD with an existing member
	// only updates its score, so two requests sharing a member would be
	// counted once. A millisecond timestamp collides under load, so the
	// member uses nanosecond resolution (the score stays in milliseconds).
	member := strconv.FormatInt(now.UnixNano(), 10) + ":" + workspaceID

	cmd := l.client.B().Eval().Script(slidingWindowLua).
		Numkeys(1).
		Key(key).
		Arg(strconv.FormatInt(now.UnixMilli(), 10),
			strconv.FormatInt(window.Milliseconds(), 10),
			strconv.Itoa(limit),
			member).
		Build()

	res := l.client.Do(ctx, cmd)
	if err := res.Error(); err != nil {
		return Decision{}, fmt.Errorf("ratelimit eval: %w", err)
	}
	arr, err := res.ToArray()
	if err != nil {
		return Decision{}, fmt.Errorf("ratelimit bad reply: %w", err)
	}
	if len(arr) != 3 {
		return Decision{}, fmt.Errorf("ratelimit bad reply: expected 3 elements, got %d", len(arr))
	}

	// Reply layout: {allowed (1/0), remaining, retry_after_ms}.
	var vals [3]int64
	for i := range vals {
		if vals[i], err = arr[i].AsInt64(); err != nil {
			return Decision{}, fmt.Errorf("ratelimit bad reply: element %d: %w", i, err)
		}
	}
	return Decision{
		Allowed:      vals[0] == 1,
		Remaining:    int(vals[1]),
		RetryAfterMS: int(vals[2]),
	}, nil
}

View File

@@ -0,0 +1,33 @@
package repo
import (
"context"
"fmt"
"time"
"github.com/jackc/pgx/v5/pgxpool"
)
// NewPool opens a pgx connection pool for dsn and verifies it with a ping.
//
// The pool is deliberately small (2-16 connections, 5 minute idle timeout,
// 30 second health checks): ingest mostly hits caches and only touches
// Postgres on a cache miss (write key lookup, schema upsert). If the ping
// fails the pool is closed before the error is returned.
func NewPool(ctx context.Context, dsn string) (*pgxpool.Pool, error) {
	poolCfg, err := pgxpool.ParseConfig(dsn)
	if err != nil {
		return nil, fmt.Errorf("parse pg dsn: %w", err)
	}
	poolCfg.MinConns, poolCfg.MaxConns = 2, 16
	poolCfg.MaxConnIdleTime = 5 * time.Minute
	poolCfg.HealthCheckPeriod = 30 * time.Second

	pool, err := pgxpool.NewWithConfig(ctx, poolCfg)
	if err != nil {
		return nil, fmt.Errorf("pg connect: %w", err)
	}
	if pingErr := pool.Ping(ctx); pingErr != nil {
		pool.Close()
		return nil, fmt.Errorf("pg ping: %w", pingErr)
	}
	return pool, nil
}

View File

@@ -0,0 +1,61 @@
package repo
import (
	"context"
	"errors"
	"fmt"

	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgxpool"

	"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
)
// SchemaRepo records the data type observed for each (workspace, event_type, field)
// triple. The bulker / analytics layer uses this to detect type conflicts.
//
// In the ingest hot path we only *check* for conflict via UpsertField; the
// rebuild of the cached map is left to a background loader. We do not block
// the request waiting for upsert -- it is fire-and-forget.
//
// NOTE(review): the fire-and-forget behaviour described above is a caller
// convention; the implementation visible in this file runs synchronously.
type SchemaRepo interface {
// GetType returns the recorded type, or "" if the field has never been seen.
GetType(ctx context.Context, workspaceID, eventType, field string) (string, error)
// UpsertField records a new (or re-confirmed) field type.
UpsertField(ctx context.Context, workspaceID, eventType, field, dataType string) error
}
// schemaRepo is the Postgres-backed SchemaRepo implementation.
type schemaRepo struct {
db *pgxpool.Pool
}
// NewSchemaRepo returns a SchemaRepo that reads and writes through db.
func NewSchemaRepo(db *pgxpool.Pool) SchemaRepo {
return &schemaRepo{db: db}
}
// GetType looks up the data type recorded for (workspaceID, eventType, field).
//
// It returns the stored type, or "" with a nil error when no row exists so
// the caller can treat the field as new. Any other database failure is
// returned as an internal AppError.
func (r *schemaRepo) GetType(ctx context.Context, workspaceID, eventType, field string) (string, error) {
	const q = `
SELECT data_type FROM schema_fields
WHERE workspace_id = $1::uuid AND event_type = $2 AND field = $3`
	var t string
	err := r.db.QueryRow(ctx, q, workspaceID, eventType, field).Scan(&t)
	switch {
	case err == nil:
		return t, nil
	case errors.Is(err, pgx.ErrNoRows):
		// Match the sentinel rather than its message text: a string compare
		// breaks if the error is wrapped or the wording changes.
		return "", nil
	default:
		return "", apperr.Internal(fmt.Errorf("schema get: %w", err))
	}
}
// UpsertField inserts a schema_fields row for (workspaceID, eventType, field)
// with dataType. If the row already exists, only last_seen_at and
// sample_count are updated -- the stored data_type is left untouched.
// Database failures are returned as an internal AppError.
func (r *schemaRepo) UpsertField(ctx context.Context, workspaceID, eventType, field, dataType string) error {
	const q = `
INSERT INTO schema_fields (workspace_id, event_type, field, data_type)
VALUES ($1::uuid, $2, $3, $4)
ON CONFLICT (workspace_id, event_type, field) DO UPDATE
SET last_seen_at = now(),
sample_count = schema_fields.sample_count + 1`
	if _, execErr := r.db.Exec(ctx, q, workspaceID, eventType, field, dataType); execErr != nil {
		return apperr.Internal(fmt.Errorf("schema upsert: %w", execErr))
	}
	return nil
}

View File

@@ -0,0 +1,66 @@
package repo
import (
"context"
"crypto/sha256"
"encoding/hex"
"errors"
"fmt"
"github.com/jackc/pgx/v5"
"github.com/jackc/pgx/v5/pgxpool"
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
)
// WriteKeyRepo loads WriteKey records by their plaintext value.
// The plaintext is hashed before the lookup; the DB only stores hashes.
type WriteKeyRepo interface {
// FindByPlaintext returns the key whose stored hash matches plaintext.
FindByPlaintext(ctx context.Context, plaintext string) (*model.WriteKey, error)
// MarkUsed sets last_used_at to the current database time for key id.
MarkUsed(ctx context.Context, id string) error
}
// writeKeyRepo is the Postgres-backed WriteKeyRepo implementation.
type writeKeyRepo struct {
db *pgxpool.Pool
}
// NewWriteKeyRepo returns a WriteKeyRepo that queries through db.
func NewWriteKeyRepo(db *pgxpool.Pool) WriteKeyRepo {
return &writeKeyRepo{db: db}
}
// hashKey returns the lowercase hex encoding of the SHA-256 digest of
// plaintext; this is the value compared against write_keys.key_hash.
func hashKey(plaintext string) string {
	digest := sha256.New()
	digest.Write([]byte(plaintext)) // hash.Hash.Write never returns an error
	return hex.EncodeToString(digest.Sum(nil))
}
// FindByPlaintext hashes plaintext and loads the matching write_keys row.
//
// It returns a 401 AppError ("invalid write key") when no row matches and an
// internal AppError for any other database failure. Revocation is not
// checked here; RevokedAt is returned for the caller to inspect.
func (r *writeKeyRepo) FindByPlaintext(ctx context.Context, plaintext string) (*model.WriteKey, error) {
	const q = `
SELECT id::text, workspace_id::text, source_id::text,
key_prefix, COALESCE(label, ''),
revoked_at, last_used_at, created_at
FROM write_keys
WHERE key_hash = $1`
	var found model.WriteKey
	scanErr := r.db.QueryRow(ctx, q, hashKey(plaintext)).Scan(
		&found.ID, &found.WorkspaceID, &found.SourceID,
		&found.KeyPrefix, &found.Label,
		&found.RevokedAt, &found.LastUsedAt, &found.CreatedAt,
	)
	switch {
	case scanErr == nil:
		return &found, nil
	case errors.Is(scanErr, pgx.ErrNoRows):
		return nil, apperr.Unauthorized("invalid write key")
	default:
		return nil, apperr.Internal(fmt.Errorf("writekey lookup: %w", scanErr))
	}
}
// MarkUsed stamps last_used_at with the database's current time for the
// write key identified by id. Failures are returned as an internal AppError.
func (r *writeKeyRepo) MarkUsed(ctx context.Context, id string) error {
	const q = `UPDATE write_keys SET last_used_at = now() WHERE id = $1`
	if _, execErr := r.db.Exec(ctx, q, id); execErr != nil {
		return apperr.Internal(fmt.Errorf("writekey mark used: %w", execErr))
	}
	return nil
}

View File

@@ -0,0 +1,92 @@
// Package schema turns nested JSON objects into flat key/value maps and
// classifies field types for conflict detection.
//
// Rules:
// - keys are joined with "_" : {"a": {"b": 1}} -> {"a_b": 1}
// - arrays are preserved as-is and not descended into
// - keys are sanitized: lowercase, non-[a-z0-9_] replaced with "_"
package schema
import (
"strings"
"unicode"
)
// DataType names the kind of value observed for a field.
type DataType string
// The data types used for classification. Classify, as written in this file,
// never returns TypeTimestamp.
const (
TypeString DataType = "string"
TypeNumber DataType = "number"
TypeBoolean DataType = "boolean"
TypeObject DataType = "object"
TypeArray DataType = "array"
TypeTimestamp DataType = "timestamp"
TypeNull DataType = "null"
)
// Flatten produces a single-level copy of in: nested objects are expanded
// into keys joined with "_", and every key segment is sanitized. The input
// map is not modified.
func Flatten(in map[string]any) map[string]any {
	flat := make(map[string]any, len(in))
	for name, value := range in {
		flattenInto(flat, sanitize(name), value)
	}
	return flat
}
// flattenInto writes v into out under prefix. Non-empty objects are expanded
// recursively as prefix + "_" + sanitized child key; everything else
// (scalars, arrays, nil and empty objects) is stored as-is.
//
// If two paths flatten to the same key, the one written last wins, and map
// iteration order makes that non-deterministic.
func flattenInto(out map[string]any, prefix string, v any) {
	obj, isObject := v.(map[string]any)
	if !isObject || len(obj) == 0 {
		out[prefix] = v
		return
	}
	for name, child := range obj {
		flattenInto(out, prefix+"_"+sanitize(name), child)
	}
}
// sanitize replaces characters outside [a-z0-9_] with "_" and lowercases.
// Leading underscores are kept; trailing underscores are trimmed.
//
// Only ASCII letters and digits are kept: every non-ASCII rune (accented
// letters, non-Latin scripts, non-ASCII digits, invalid UTF-8) becomes "_",
// so the result always matches the documented [a-z0-9_] alphabet. A key made
// up solely of replaced characters therefore sanitizes to "".
func sanitize(k string) string {
	if k == "" {
		return k
	}
	var b strings.Builder
	b.Grow(len(k))
	for _, r := range k {
		switch {
		case r > unicode.MaxASCII:
			// unicode.IsLetter / IsDigit accept letters and digits of every
			// script, which would leak characters outside the alphabet.
			b.WriteByte('_')
		case unicode.IsLetter(r):
			b.WriteRune(unicode.ToLower(r))
		case unicode.IsDigit(r) || r == '_':
			b.WriteRune(r)
		default:
			b.WriteByte('_')
		}
	}
	return strings.TrimRight(b.String(), "_")
}
// Classify reports the DataType of a value as produced by json.Unmarshal
// into `any`. nil is null, bool is boolean, the listed float and integer
// types are number, []any is array and map[string]any is object. Strings --
// and any type not listed -- are classified as string.
func Classify(v any) DataType {
	switch v.(type) {
	case nil:
		return TypeNull
	case bool:
		return TypeBoolean
	case float64, float32, int, int32, int64, uint, uint32, uint64:
		return TypeNumber
	case []any:
		return TypeArray
	case map[string]any:
		return TypeObject
	}
	// string, plus the fallback for unrecognised types.
	return TypeString
}

View File

@@ -0,0 +1,53 @@
package schema
import (
"testing"
"github.com/stretchr/testify/assert"
)
// TestFlatten_NestedObject checks that nested objects are expanded into
// "_"-joined keys while top-level scalars are kept unchanged.
func TestFlatten_NestedObject(t *testing.T) {
in := map[string]any{
"user": map[string]any{
"id": "u_1",
"profile": map[string]any{"age": 30, "name": "Phuoc"},
},
"plan": "pro",
}
got := Flatten(in)
assert.Equal(t, "u_1", got["user_id"])
assert.Equal(t, 30, got["user_profile_age"])
assert.Equal(t, "Phuoc", got["user_profile_name"])
assert.Equal(t, "pro", got["plan"])
}
// TestFlatten_SanitizesKeys checks that spaces, dots, dashes and punctuation
// in keys become underscores, that keys are lowercased, and that a trailing
// underscore is trimmed before a nested key is appended.
func TestFlatten_SanitizesKeys(t *testing.T) {
	flat := Flatten(map[string]any{
		"User Email": "x@y",
		"price.usd":  9.99,
		"meta!":      map[string]any{"X-Y": 1},
	})

	assert.Equal(t, "x@y", flat["user_email"])
	assert.Equal(t, 9.99, flat["price_usd"])
	assert.Equal(t, 1, flat["meta_x_y"])
}
// TestFlatten_PreservesArrays checks that a slice value is stored as-is under
// its key instead of being expanded.
func TestFlatten_PreservesArrays(t *testing.T) {
	flat := Flatten(map[string]any{
		"tags": []any{"a", "b"},
	})

	tags, isSlice := flat["tags"].([]any)
	assert.True(t, isSlice)
	assert.Equal(t, 2, len(tags))
}
func TestClassify(t *testing.T) {
assert.Equal(t, TypeString, Classify("hi"))
assert.Equal(t, TypeNumber, Classify(float64(1.5)))
assert.Equal(t, TypeBoolean, Classify(true))
assert.Equal(t, TypeNull, Classify(nil))
assert.Equal(t, TypeArray, Classify([]any{1, 2}))
assert.Equal(t, TypeObject, Classify(map[string]any{}))
}

View File

@@ -0,0 +1,115 @@
package service
import (
"context"
"sync"
"time"
"github.com/redis/rueidis"
"go.uber.org/zap"
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
"github.com/dbiz/cdp/ingestion/ingest/internal/repo"
)
// AuthService resolves a plaintext Write Key into the workspace + source it
// authorizes for. Successful lookups are cached in an in-process map for ttl;
// Resolve does not read or write a Redis cache. Redis is used only as the
// pub/sub channel on which invalidations arrive, so a cached key stops being
// served once its invalidation message has been handled or its cache entry
// has expired, whichever happens first.
type AuthService struct {
repo repo.WriteKeyRepo // looked up on every cache miss
redis rueidis.Client // pub/sub subscription only; nil disables the invalidation watcher
log *zap.Logger
ttl time.Duration // lifetime of an in-process cache entry
mu sync.RWMutex // guards cache
cache map[string]cachedKey // keyed by plaintext write key
}
// cachedKey is one in-process cache entry: the resolved key plus the instant
// from which Resolve stops trusting the entry and goes back to the repo.
type cachedKey struct {
key *model.WriteKey
expires time.Time
}
// Redis names used by the write-key auth path.
const (
// redisKeyWritePrefix prefixes per-key Redis entries.
// NOTE(review): nothing visible in this section reads it (Resolve skips the
// Redis cache); confirm it is still needed.
redisKeyWritePrefix = "wk:" // wk:{plaintext} -> json
// pubsubChannel is the channel watchInvalidations subscribes to. Each
// message body is passed to Invalidate as the plaintext key to evict.
pubsubChannel = "wk:invalidate"
)
// NewAuthService builds an AuthService with an empty in-process cache and
// starts the background goroutine that listens for invalidation messages.
// That goroutine returns immediately when redis is nil.
//
// NOTE(review): the listener runs on context.Background() and nothing here
// returns a handle for stopping it.
func NewAuthService(r repo.WriteKeyRepo, redis rueidis.Client, ttl time.Duration, log *zap.Logger) *AuthService {
	svc := new(AuthService)
	svc.repo = r
	svc.redis = redis
	svc.log = log
	svc.ttl = ttl
	svc.cache = map[string]cachedKey{}

	go svc.watchInvalidations()
	return svc
}
// Resolve returns the WriteKey for a plaintext token.
//
// An empty token and a revoked key both yield an Unauthorized error. A
// non-expired cache entry is served without touching the repo; otherwise the
// key is loaded through the repo, whose error is returned unchanged, and a
// non-revoked result is cached for ttl. Revoked keys are never cached.
func (s *AuthService) Resolve(ctx context.Context, plaintext string) (*model.WriteKey, error) {
	if plaintext == "" {
		return nil, apperr.Unauthorized("missing write key")
	}

	// in-process cache
	s.mu.RLock()
	hit, found := s.cache[plaintext]
	s.mu.RUnlock()

	if found && time.Now().Before(hit.expires) {
		if hit.key.Revoked() {
			return nil, apperr.Unauthorized("write key revoked")
		}
		return hit.key, nil
	}

	// Cache miss or expired entry: go to the repo. The Redis cache is
	// intentionally skipped; Redis only carries the invalidation channel.
	wk, err := s.repo.FindByPlaintext(ctx, plaintext)
	if err != nil {
		return nil, err
	}
	if wk.Revoked() {
		return nil, apperr.Unauthorized("write key revoked")
	}

	fresh := cachedKey{key: wk, expires: time.Now().Add(s.ttl)}
	s.mu.Lock()
	s.cache[plaintext] = fresh
	s.mu.Unlock()

	return wk, nil
}
// Invalidate removes the cache entry for one plaintext key, if present. It is
// invoked by the pub/sub handler in watchInvalidations for every message
// received on the invalidation channel.
func (s *AuthService) Invalidate(plaintext string) {
	s.mu.Lock()
	defer s.mu.Unlock()
	delete(s.cache, plaintext)
}
// watchInvalidations subscribes to pubsubChannel and evicts the key named in
// each message body. It is a no-op when no Redis client is configured. The
// call blocks inside Receive; when Receive returns a non-nil error it is
// logged at Warn level and the goroutine ends.
func (s *AuthService) watchInvalidations() {
	if s.redis == nil {
		return
	}

	subscribe := s.redis.B().Subscribe().Channel(pubsubChannel).Build()
	onMessage := func(msg rueidis.PubSubMessage) {
		s.Invalidate(msg.Message)
		// Only a masked prefix of the key is logged.
		s.log.Info("write key invalidated via pubsub", zap.String("prefix", maskKey(msg.Message)))
	}

	if err := s.redis.Receive(context.Background(), subscribe, onMessage); err != nil {
		s.log.Warn("pubsub subscribe ended", zap.Error(err))
	}
}
// maskKey returns the first 8 characters of k followed by "***" for safe
// logging. Keys of 8 characters or fewer are fully masked as "***".
//
// Characters are counted as runes rather than bytes, so a key containing
// multi-byte UTF-8 is never cut in the middle of a character. For ASCII keys
// the result is the same as slicing the first 8 bytes.
func maskKey(k string) string {
	const visible = 8

	seen := 0
	// Ranging over a string yields the byte offset of each rune start.
	for offset := range k {
		if seen == visible {
			// offset is where the 9th rune begins, i.e. the end of the prefix.
			return k[:offset] + "***"
		}
		seen++
	}
	// The key has at most `visible` runes: reveal nothing.
	return "***"
}

View File

@@ -0,0 +1,223 @@
package service
import (
"context"
"encoding/json"
"time"
"go.uber.org/zap"
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
"github.com/dbiz/cdp/ingestion/ingest/internal/dedup"
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
"github.com/dbiz/cdp/ingestion/ingest/internal/ratelimit"
"github.com/dbiz/cdp/ingestion/ingest/internal/repo"
"github.com/dbiz/cdp/ingestion/ingest/internal/schema"
)
// Producer is the small surface IngestService needs from the Kafka client.
// Defined here so it can be stubbed in tests without pulling in franz-go.
type Producer interface {
// Produce publishes one fully processed event.
Produce(ctx context.Context, ev *model.IngestedEvent) error
// ProduceDLQ publishes a rejected event to the DLQ together with the
// rejection reason, the offending field (may be empty) and the raw payload.
ProduceDLQ(ctx context.Context, workspaceID, sourceID, messageID, reason, field string, raw []byte) error
}
// IngestService is the core pipeline. As implemented by Ingest the order is:
// ratelimit → timestamp normalize → late-check → messageId check → dedup →
// flatten → schema-conflict → push Kafka.
type IngestService struct {
producer Producer // destination for accepted events and DLQ copies
limiter ratelimit.Limiter // consulted once per event, keyed by workspace
dedup dedup.Dedup // reports whether a (workspace, messageId) pair is new
schema repo.SchemaRepo // recorded field types per (workspace, event type, field)
log *zap.Logger
lateAfter time.Duration // events whose sentAt is older than this are rejected
}
// IngestDeps groups dependencies for cleaner construction. Each field is
// copied verbatim into the matching IngestService field by NewIngestService.
type IngestDeps struct {
Producer Producer
Limiter ratelimit.Limiter
Dedup dedup.Dedup
Schema repo.SchemaRepo
Log *zap.Logger
LateAfter time.Duration // maximum accepted age of an event's sentAt
}
// NewIngestService wires an IngestService from its dependencies. No
// validation or defaulting is applied: every field of d is copied as given.
func NewIngestService(d IngestDeps) *IngestService {
	svc := new(IngestService)
	svc.producer = d.Producer
	svc.limiter = d.Limiter
	svc.dedup = d.Dedup
	svc.schema = d.Schema
	svc.log = d.Log
	svc.lateAfter = d.LateAfter
	return svc
}
// IngestContext carries per-request data set by middleware.
type IngestContext struct {
WorkspaceID string // rate-limit and dedup scope; stamped on the event
SourceID string // stamped on the event and on DLQ records
IP string // copied onto the produced event
UserAgent string // copied onto the produced event
RawBody []byte // original body, used for DLQ payload
}
// Ingest runs the full pipeline for a single event: rate limit, timestamp
// normalisation, late-event check, messageId validation, dedup, flattening,
// schema conflict detection and finally the Kafka produce.
//
// It returns nil when the event was produced or was dropped as a duplicate.
// Otherwise it returns an apperr error: TooManyRequests when rate limited,
// UnprocessableEntity when sentAt is older than lateAfter, BadRequest for a
// missing messageId, malformed properties/traits or a schema conflict, and
// Internal when the limiter, dedup store or producer fails.
//
// Events rejected for malformed properties/traits or for a schema conflict
// are also copied to the DLQ. That copy is best-effort: a DLQ failure is
// logged and never changes the error returned to the caller.
//
// NOTE(review): the messageId is marked as seen (step 6) before flattening,
// the schema check and the produce. If one of those later steps fails, a
// client retry with the same messageId is dropped as a duplicate. Confirm
// this is intended.
func (s *IngestService) Ingest(ctx context.Context, ictx IngestContext, raw *model.RawEvent) error {
	now := time.Now().UTC()

	// deadLetter sends the rejected event to the DLQ. The caller is already
	// getting the primary error, so a DLQ failure is only logged.
	deadLetter := func(reason, field string) {
		if dlqErr := s.toDLQ(ctx, ictx, raw, reason, field); dlqErr != nil {
			s.log.Warn("dlq produce failed",
				zap.String("reason", reason),
				zap.String("message_id", raw.MessageID),
				zap.Error(dlqErr))
		}
	}

	// 3. rate limit per workspace
	dec, err := s.limiter.Allow(ctx, ictx.WorkspaceID, defaultTierLimit, time.Second)
	if err != nil {
		return apperr.Internal(err)
	}
	if !dec.Allowed {
		// Integer division floors; add one second so the client is never
		// told to retry before the limiter would allow it.
		retry := (dec.RetryAfterMS / 1000) + 1
		return apperr.TooManyRequests(retry)
	}

	// 4-5. timestamps + late-event check. sentAt defaults to the receive
	// time, timestamp defaults to sentAt.
	sentAt := derefTime(raw.SentAt, now)
	if now.Sub(sentAt) > s.lateAfter {
		// NOTE(review): the message hard-codes 24h while the threshold is
		// the configurable lateAfter.
		return apperr.UnprocessableEntity("event too old (>24h)")
	}
	timestamp := derefTime(raw.Timestamp, sentAt)

	// 6. dedup
	if raw.MessageID == "" {
		return apperr.BadRequest("messageId required", "messageId", nil)
	}
	fresh, err := s.dedup.CheckAndSet(ctx, ictx.WorkspaceID, raw.MessageID)
	if err != nil {
		return apperr.Internal(err)
	}
	if !fresh {
		// silently drop -- duplicate message
		return nil
	}

	// 7. flatten properties / traits / context
	props, err := decodeAndFlatten(raw.Properties)
	if err != nil {
		deadLetter("properties_invalid_json", "properties")
		return apperr.BadRequest("properties is not valid JSON object", "properties", err)
	}
	traits, err := decodeAndFlatten(raw.Traits)
	if err != nil {
		deadLetter("traits_invalid_json", "traits")
		return apperr.BadRequest("traits is not valid JSON object", "traits", err)
	}
	contextMap, err := decodeAndFlatten(raw.Context)
	if err != nil {
		// context is best-effort: keep going without it, but leave a trace
		// so malformed context payloads can be diagnosed.
		s.log.Debug("context ignored: not a valid JSON object",
			zap.String("message_id", raw.MessageID),
			zap.Error(err))
		contextMap = nil
	}

	// 8. schema validation -- type conflict detection (best-effort, async upsert)
	if err := s.checkSchema(ctx, ictx.WorkspaceID, string(raw.Type), props); err != nil {
		deadLetter("schema_conflict", "")
		return err
	}

	ev := &model.IngestedEvent{
		WorkspaceID: ictx.WorkspaceID,
		SourceID:    ictx.SourceID,
		MessageID:   raw.MessageID,
		Type:        raw.Type,
		AnonymousID: raw.AnonymousID,
		UserID:      raw.UserID,
		GroupID:     raw.GroupID,
		Event:       raw.Event,
		Name:        raw.Name,
		Category:    raw.Category,
		Properties:  props,
		Traits:      traits,
		Context:     contextMap,
		IP:          ictx.IP,
		UserAgent:   ictx.UserAgent,
		Timestamp:   timestamp,
		SentAt:      sentAt,
		ReceivedAt:  now,
	}

	// 9. push Kafka -- a produce error is surfaced to the caller
	if err := s.producer.Produce(ctx, ev); err != nil {
		return apperr.Internal(err)
	}
	return nil
}
// IngestBatch runs Ingest on every event of a batch envelope, in order. A
// failing event does not stop the batch, so the good events still ship. The
// returned slice has the same length as batch: element i holds the result of
// Ingest for batch[i] and is nil when that event was accepted or dropped as a
// duplicate. It is left to the handler to turn the slice into a response.
func (s *IngestService) IngestBatch(ctx context.Context, ictx IngestContext, batch []model.RawEvent) []error {
	results := make([]error, 0, len(batch))
	for idx := range batch {
		results = append(results, s.Ingest(ctx, ictx, &batch[idx]))
	}
	return results
}
// ---------------------------------------------------------------------------
// helpers
// ---------------------------------------------------------------------------
// defaultTierLimit is the limit Ingest passes to the rate limiter for every
// workspace, together with a one-second interval.
const defaultTierLimit = 100 // rps; per-tier override comes from workspace.tier later
// derefTime returns *p converted to UTC, or fallback when p is nil or points
// at the zero time. fallback is returned as given, without conversion.
func derefTime(p *time.Time, fallback time.Time) time.Time {
	if p != nil && !p.IsZero() {
		return p.UTC()
	}
	return fallback
}
// decodeAndFlatten decodes raw as a JSON object and returns its flattened
// form. Empty input and JSON null both yield (nil, nil). Anything that does
// not decode into a map[string]any returns the json.Unmarshal error.
func decodeAndFlatten(raw json.RawMessage) (map[string]any, error) {
	if len(raw) == 0 {
		return nil, nil
	}

	var decoded map[string]any
	err := json.Unmarshal(raw, &decoded)
	switch {
	case err != nil:
		return nil, err
	case decoded == nil:
		// JSON null decodes without error and leaves the map nil.
		return nil, nil
	}
	return schema.Flatten(decoded), nil
}
// checkSchema compares the classified type of every non-null property with
// the type recorded for (workspace, event type, field). The first mismatch
// found is returned as a BadRequest naming that field. Fields with no
// recorded type are upserted in a detached goroutine so the request never
// waits on the write. A failed lookup is logged and that field is skipped.
//
// NOTE(review): props is a map, so when several fields conflict the one that
// gets reported depends on iteration order.
func (s *IngestService) checkSchema(ctx context.Context, workspaceID, eventType string, props map[string]any) error {
	for name, value := range props {
		kind := schema.Classify(value)
		if kind == schema.TypeNull {
			continue
		}
		observed := string(kind)

		recorded, err := s.schema.GetType(ctx, workspaceID, eventType, name)
		if err != nil {
			// soft-fail: don't block ingest on schema DB errors
			s.log.Warn("schema lookup failed", zap.String("field", name), zap.Error(err))
			continue
		}

		switch recorded {
		case "":
			// fire-and-forget upsert; uses its own context so it outlives
			// the request
			go func(f, t string) {
				if err := s.schema.UpsertField(context.Background(), workspaceID, eventType, f, t); err != nil {
					s.log.Warn("schema upsert failed", zap.String("field", f), zap.Error(err))
				}
			}(name, observed)
		case observed:
			// recorded type matches: nothing to do
		default:
			return apperr.BadRequest("schema type conflict", name, nil)
		}
	}
	return nil
}
// toDLQ forwards a rejected event to the producer's DLQ, using the raw
// request body from ictx as the payload, and returns the producer's error.
func (s *IngestService) toDLQ(ctx context.Context, ictx IngestContext, raw *model.RawEvent, reason, field string) error {
	var (
		workspace = ictx.WorkspaceID
		source    = ictx.SourceID
		payload   = ictx.RawBody
	)
	return s.producer.ProduceDLQ(ctx, workspace, source, raw.MessageID, reason, field, payload)
}

View File

@@ -0,0 +1,150 @@
package service
import (
"context"
"encoding/json"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/zap"
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
"github.com/dbiz/cdp/ingestion/ingest/internal/ratelimit"
)
// ---------------------------------------------------------------------------
// Stubs -- enough surface to drive the IngestService without spinning Kafka
// or Redis. We exercise the pipeline branches: late event, dedup hit, schema
// conflict, happy path.
// ---------------------------------------------------------------------------
// fakeLimiter is a rate-limiter stub whose decision is fixed by allow.
type fakeLimiter struct{ allow bool }

// Allow ignores its arguments. It reports an allowed decision with 99
// remaining when allow is set, and a denial with a 500 ms retry hint
// otherwise. It never returns an error.
func (f *fakeLimiter) Allow(_ context.Context, _ string, _ int, _ time.Duration) (ratelimit.Decision, error) {
	if !f.allow {
		return ratelimit.Decision{Allowed: false, RetryAfterMS: 500}, nil
	}
	return ratelimit.Decision{Allowed: true, Remaining: 99}, nil
}
// fakeDedup is a dedup stub that answers every check with the fixed fresh
// value and never fails.
type fakeDedup struct{ fresh bool }

// CheckAndSet ignores the workspace and message id and reports f.fresh.
func (f *fakeDedup) CheckAndSet(_ context.Context, _ string, _ string) (bool, error) {
	isFresh := f.fresh
	return isFresh, nil
}
// fakeSchema is an in-memory schema repo stub keyed by field name only; the
// workspace and event type arguments are ignored.
//
// NOTE(review): the map is not guarded by a lock, while checkSchema calls
// UpsertField from a separate goroutine. An event with several new fields
// could therefore read and write the map concurrently.
type fakeSchema struct {
	stored map[string]string
}

// GetType returns the recorded type for field, or "" when none is recorded.
func (f *fakeSchema) GetType(_ context.Context, _, _, field string) (string, error) {
	// Indexing a nil or missing entry yields "", which is the "unknown" answer.
	return f.stored[field], nil
}

// UpsertField records dt for field, allocating the map on first use.
func (f *fakeSchema) UpsertField(_ context.Context, _, _, field, dt string) error {
	if f.stored == nil {
		f.stored = make(map[string]string)
	}
	f.stored[field] = dt
	return nil
}
// fakeProducer records what the service tried to publish so tests can assert
// on side effects. Both methods always succeed.
type fakeProducer struct {
	produced []*model.IngestedEvent
	dlq      []string // reason values
}

// Produce appends ev to the list of produced events.
func (f *fakeProducer) Produce(_ context.Context, ev *model.IngestedEvent) error {
	f.produced = append(f.produced, ev)
	return nil
}

// ProduceDLQ records only the rejection reason; all other arguments are
// ignored.
func (f *fakeProducer) ProduceDLQ(_ context.Context, _, _, _, reason, _ string, _ []byte) error {
	f.dlq = append(f.dlq, reason)
	return nil
}
// ---------------------------------------------------------------------------
// newSvc builds an IngestService around the given stubs with a no-op logger
// and a 24h late-event threshold. It also returns the capturing producer so
// tests can inspect what was published.
func newSvc(t *testing.T, limiter *fakeLimiter, dedupSvc *fakeDedup, sch *fakeSchema) (*IngestService, *fakeProducer) {
	t.Helper()

	prod := new(fakeProducer)
	svc := NewIngestService(IngestDeps{
		Producer:  prod,
		Limiter:   limiter,
		Dedup:     dedupSvc,
		Schema:    sch,
		Log:       zap.NewNop(),
		LateAfter: 24 * time.Hour,
	})
	return svc, prod
}
// TestIngest_RateLimited checks that a denied limiter decision surfaces as a
// 429 AppError carrying a positive RetryAfter.
func TestIngest_RateLimited(t *testing.T) {
	svc, _ := newSvc(t, &fakeLimiter{allow: false}, &fakeDedup{fresh: true}, &fakeSchema{})
	event := &model.RawEvent{Type: model.EventTypeTrack, MessageID: "m1"}

	err := svc.Ingest(context.Background(), IngestContext{WorkspaceID: "ws"}, event)

	appErr, isAppErr := apperr.As(err)
	require.True(t, isAppErr)
	assert.Equal(t, 429, appErr.Code)
	assert.Greater(t, appErr.RetryAfter, 0)
}
// TestIngest_LateEvent checks that an event sent 48h ago, beyond the 24h
// threshold, is rejected with a 422 AppError.
func TestIngest_LateEvent(t *testing.T) {
	svc, _ := newSvc(t, &fakeLimiter{allow: true}, &fakeDedup{fresh: true}, &fakeSchema{})
	twoDaysAgo := time.Now().Add(-48 * time.Hour)
	event := &model.RawEvent{Type: model.EventTypeTrack, MessageID: "m1", SentAt: &twoDaysAgo}

	err := svc.Ingest(context.Background(), IngestContext{WorkspaceID: "ws"}, event)

	appErr, isAppErr := apperr.As(err)
	require.True(t, isAppErr)
	assert.Equal(t, 422, appErr.Code)
}
// TestIngest_DuplicateMessageSilentlyDropped checks that a message the dedup
// store has already seen returns no error and is not produced.
func TestIngest_DuplicateMessageSilentlyDropped(t *testing.T) {
	svc, prod := newSvc(t, &fakeLimiter{allow: true}, &fakeDedup{fresh: false}, &fakeSchema{})
	event := &model.RawEvent{Type: model.EventTypeTrack, MessageID: "m1"}

	err := svc.Ingest(context.Background(), IngestContext{WorkspaceID: "ws"}, event)

	assert.NoError(t, err)
	assert.Empty(t, prod.produced, "duplicate must not be produced")
}
func TestIngest_SchemaConflict(t *testing.T) {
svc, prod := newSvc(t, &fakeLimiter{allow: true}, &fakeDedup{fresh: true},
&fakeSchema{stored: map[string]string{"price": "string"}})
props, _ := json.Marshal(map[string]any{"price": 9.99})
err := svc.Ingest(context.Background(),
IngestContext{WorkspaceID: "ws"},
&model.RawEvent{
Type: model.EventTypeTrack,
MessageID: "m1",
Properties: props,
})
ae, ok := apperr.As(err)
require.True(t, ok)
assert.Equal(t, 400, ae.Code)
assert.Equal(t, "price", ae.Field)
assert.Equal(t, []string{"schema_conflict"}, prod.dlq)
assert.Empty(t, prod.produced)
}
// TestIngest_HappyPath checks that a valid, fresh event is produced exactly
// once with the workspace, partition key and flattened properties expected.
func TestIngest_HappyPath(t *testing.T) {
	svc, prod := newSvc(t, &fakeLimiter{allow: true}, &fakeDedup{fresh: true}, &fakeSchema{})

	props, _ := json.Marshal(map[string]any{"plan": "pro"})
	reqCtx := IngestContext{WorkspaceID: "ws", SourceID: "src", IP: "1.1.1.1"}
	event := &model.RawEvent{
		Type:        model.EventTypeTrack,
		MessageID:   "m1",
		AnonymousID: "anon-1",
		Event:       "Signed Up",
		Properties:  props,
	}

	err := svc.Ingest(context.Background(), reqCtx, event)
	require.NoError(t, err)
	require.Len(t, prod.produced, 1)

	produced := prod.produced[0]
	assert.Equal(t, "ws", produced.WorkspaceID)
	assert.Equal(t, "anon-1", produced.PartitionKey())
	assert.Equal(t, "pro", produced.Properties["plan"])
}