init ingestion
This commit is contained in:
115
ingestion/ingest/internal/service/auth.go
Normal file
115
ingestion/ingest/internal/service/auth.go
Normal file
@@ -0,0 +1,115 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/redis/rueidis"
|
||||
"go.uber.org/zap"
|
||||
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/repo"
|
||||
)
|
||||
|
||||
// AuthService resolves a plaintext Write Key into the workspace + source it
|
||||
// authorizes for. Lookups are cached in process AND in Redis. Pub/sub
|
||||
// invalidation lets the console revoke a key and have it propagate within
|
||||
// the cache TTL.
|
||||
type AuthService struct {
|
||||
repo repo.WriteKeyRepo
|
||||
redis rueidis.Client
|
||||
log *zap.Logger
|
||||
ttl time.Duration
|
||||
|
||||
mu sync.RWMutex
|
||||
cache map[string]cachedKey
|
||||
}
|
||||
|
||||
type cachedKey struct {
|
||||
key *model.WriteKey
|
||||
expires time.Time
|
||||
}
|
||||
|
||||
// Redis names used by the auth service.
const (
	// pubsubChannel is the channel the service subscribes to for key
	// invalidations; each message payload is treated as the plaintext key
	// whose cache entry should be dropped.
	pubsubChannel = "wk:invalidate"

	// redisKeyWritePrefix is the key prefix for write keys stored in Redis
	// (wk:{plaintext} -> json).
	// NOTE(review): nothing visible in this file references it -- confirm it
	// is still needed.
	redisKeyWritePrefix = "wk:"
)
|
||||
|
||||
func NewAuthService(r repo.WriteKeyRepo, redis rueidis.Client, ttl time.Duration, log *zap.Logger) *AuthService {
|
||||
s := &AuthService{
|
||||
repo: r,
|
||||
redis: redis,
|
||||
log: log,
|
||||
ttl: ttl,
|
||||
cache: make(map[string]cachedKey),
|
||||
}
|
||||
go s.watchInvalidations()
|
||||
return s
|
||||
}
|
||||
|
||||
// Resolve returns the WriteKey for a plaintext token. Cached.
|
||||
func (s *AuthService) Resolve(ctx context.Context, plaintext string) (*model.WriteKey, error) {
|
||||
if plaintext == "" {
|
||||
return nil, apperr.Unauthorized("missing write key")
|
||||
}
|
||||
|
||||
// in-process cache
|
||||
s.mu.RLock()
|
||||
if entry, ok := s.cache[plaintext]; ok && time.Now().Before(entry.expires) {
|
||||
s.mu.RUnlock()
|
||||
if entry.key.Revoked() {
|
||||
return nil, apperr.Unauthorized("write key revoked")
|
||||
}
|
||||
return entry.key, nil
|
||||
}
|
||||
s.mu.RUnlock()
|
||||
|
||||
// fall through to DB (Redis cache is optional and intentionally skipped
|
||||
// here -- the in-process map is plenty fast; Redis is only used for the
|
||||
// pub/sub invalidation channel below)
|
||||
k, err := s.repo.FindByPlaintext(ctx, plaintext)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if k.Revoked() {
|
||||
return nil, apperr.Unauthorized("write key revoked")
|
||||
}
|
||||
|
||||
s.mu.Lock()
|
||||
s.cache[plaintext] = cachedKey{key: k, expires: time.Now().Add(s.ttl)}
|
||||
s.mu.Unlock()
|
||||
return k, nil
|
||||
}
|
||||
|
||||
// Invalidate clears the cache entry for one key. Called by the console via
|
||||
// pub/sub when a key is revoked.
|
||||
func (s *AuthService) Invalidate(plaintext string) {
|
||||
s.mu.Lock()
|
||||
delete(s.cache, plaintext)
|
||||
s.mu.Unlock()
|
||||
}
|
||||
|
||||
func (s *AuthService) watchInvalidations() {
|
||||
if s.redis == nil {
|
||||
return
|
||||
}
|
||||
ctx := context.Background()
|
||||
err := s.redis.Receive(ctx, s.redis.B().Subscribe().Channel(pubsubChannel).Build(),
|
||||
func(msg rueidis.PubSubMessage) {
|
||||
s.Invalidate(msg.Message)
|
||||
s.log.Info("write key invalidated via pubsub", zap.String("prefix", maskKey(msg.Message)))
|
||||
})
|
||||
if err != nil {
|
||||
s.log.Warn("pubsub subscribe ended", zap.Error(err))
|
||||
}
|
||||
}
|
||||
|
||||
// maskKey renders a key for safe logging: the first 8 bytes followed by
// "***". Keys of 8 bytes or fewer are hidden entirely and yield just "***".
func maskKey(k string) string {
	const visible = 8

	prefix := ""
	if len(k) > visible {
		prefix = k[:visible]
	}
	return prefix + "***"
}
|
||||
223
ingestion/ingest/internal/service/ingest.go
Normal file
223
ingestion/ingest/internal/service/ingest.go
Normal file
@@ -0,0 +1,223 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"time"
|
||||
|
||||
"go.uber.org/zap"
|
||||
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/dedup"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/ratelimit"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/repo"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/schema"
|
||||
)
|
||||
|
||||
// Producer is the small surface IngestService needs from the Kafka client.
|
||||
// Defined here so it can be stubbed in tests without pulling in franz-go.
|
||||
type Producer interface {
|
||||
Produce(ctx context.Context, ev *model.IngestedEvent) error
|
||||
ProduceDLQ(ctx context.Context, workspaceID, sourceID, messageID, reason, field string, raw []byte) error
|
||||
}
|
||||
|
||||
// IngestService is the core pipeline: validate → ratelimit → timestamp normalize
|
||||
// → late-check → dedup → flatten → schema-conflict → push Kafka.
|
||||
type IngestService struct {
|
||||
producer Producer
|
||||
limiter ratelimit.Limiter
|
||||
dedup dedup.Dedup
|
||||
schema repo.SchemaRepo
|
||||
log *zap.Logger
|
||||
lateAfter time.Duration
|
||||
}
|
||||
|
||||
// IngestDeps groups dependencies for cleaner construction.
|
||||
type IngestDeps struct {
|
||||
Producer Producer
|
||||
Limiter ratelimit.Limiter
|
||||
Dedup dedup.Dedup
|
||||
Schema repo.SchemaRepo
|
||||
Log *zap.Logger
|
||||
LateAfter time.Duration
|
||||
}
|
||||
|
||||
func NewIngestService(d IngestDeps) *IngestService {
|
||||
return &IngestService{
|
||||
producer: d.Producer,
|
||||
limiter: d.Limiter,
|
||||
dedup: d.Dedup,
|
||||
schema: d.Schema,
|
||||
log: d.Log,
|
||||
lateAfter: d.LateAfter,
|
||||
}
|
||||
}
|
||||
|
||||
// IngestContext is the per-request data that middleware fills in before the
// service is called.
type IngestContext struct {
	// Identity of the caller; WorkspaceID also keys rate limiting and dedup.
	WorkspaceID string
	SourceID    string

	// Client details copied onto the produced event.
	IP        string
	UserAgent string

	// RawBody is the original request body, forwarded as the DLQ payload.
	RawBody []byte
}
|
||||
|
||||
// Ingest runs the full pipeline for a single event.
|
||||
func (s *IngestService) Ingest(ctx context.Context, ictx IngestContext, raw *model.RawEvent) error {
|
||||
now := time.Now().UTC()
|
||||
|
||||
// 3. rate limit per workspace
|
||||
dec, err := s.limiter.Allow(ctx, ictx.WorkspaceID, defaultTierLimit, time.Second)
|
||||
if err != nil {
|
||||
return apperr.Internal(err)
|
||||
}
|
||||
if !dec.Allowed {
|
||||
retry := (dec.RetryAfterMS / 1000) + 1
|
||||
return apperr.TooManyRequests(retry)
|
||||
}
|
||||
|
||||
// 4-5. timestamps + late-event check
|
||||
sentAt := derefTime(raw.SentAt, now)
|
||||
if now.Sub(sentAt) > s.lateAfter {
|
||||
return apperr.UnprocessableEntity("event too old (>24h)")
|
||||
}
|
||||
timestamp := derefTime(raw.Timestamp, sentAt)
|
||||
|
||||
// 6. dedup
|
||||
if raw.MessageID == "" {
|
||||
return apperr.BadRequest("messageId required", "messageId", nil)
|
||||
}
|
||||
fresh, err := s.dedup.CheckAndSet(ctx, ictx.WorkspaceID, raw.MessageID)
|
||||
if err != nil {
|
||||
return apperr.Internal(err)
|
||||
}
|
||||
if !fresh {
|
||||
// silently drop -- duplicate message
|
||||
return nil
|
||||
}
|
||||
|
||||
// 7. flatten properties / traits / context
|
||||
props, err := decodeAndFlatten(raw.Properties)
|
||||
if err != nil {
|
||||
_ = s.toDLQ(ctx, ictx, raw, "properties_invalid_json", "properties")
|
||||
return apperr.BadRequest("properties is not valid JSON object", "properties", err)
|
||||
}
|
||||
traits, err := decodeAndFlatten(raw.Traits)
|
||||
if err != nil {
|
||||
_ = s.toDLQ(ctx, ictx, raw, "traits_invalid_json", "traits")
|
||||
return apperr.BadRequest("traits is not valid JSON object", "traits", err)
|
||||
}
|
||||
contextMap, err := decodeAndFlatten(raw.Context)
|
||||
if err != nil {
|
||||
// context is best-effort: keep going without it
|
||||
contextMap = nil
|
||||
}
|
||||
|
||||
// 8. schema validation -- type conflict detection (best-effort, async upsert)
|
||||
if err := s.checkSchema(ctx, ictx.WorkspaceID, string(raw.Type), props); err != nil {
|
||||
_ = s.toDLQ(ctx, ictx, raw, "schema_conflict", "")
|
||||
return err
|
||||
}
|
||||
|
||||
ev := &model.IngestedEvent{
|
||||
WorkspaceID: ictx.WorkspaceID,
|
||||
SourceID: ictx.SourceID,
|
||||
MessageID: raw.MessageID,
|
||||
Type: raw.Type,
|
||||
AnonymousID: raw.AnonymousID,
|
||||
UserID: raw.UserID,
|
||||
GroupID: raw.GroupID,
|
||||
Event: raw.Event,
|
||||
Name: raw.Name,
|
||||
Category: raw.Category,
|
||||
Properties: props,
|
||||
Traits: traits,
|
||||
Context: contextMap,
|
||||
IP: ictx.IP,
|
||||
UserAgent: ictx.UserAgent,
|
||||
Timestamp: timestamp,
|
||||
SentAt: sentAt,
|
||||
ReceivedAt: now,
|
||||
}
|
||||
|
||||
// 9. push Kafka -- fire-and-forget
|
||||
if err := s.producer.Produce(ctx, ev); err != nil {
|
||||
return apperr.Internal(err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IngestBatch processes a batch envelope; each failure is recorded but the
|
||||
// good events still ship. Returns the first error so the handler can pick a
|
||||
// status; in practice batch endpoints return 200 with per-event status.
|
||||
func (s *IngestService) IngestBatch(ctx context.Context, ictx IngestContext, batch []model.RawEvent) []error {
|
||||
errs := make([]error, len(batch))
|
||||
for i := range batch {
|
||||
errs[i] = s.Ingest(ctx, ictx, &batch[i])
|
||||
}
|
||||
return errs
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// defaultTierLimit is the request budget (rps) passed to the rate limiter for
// every workspace. A per-tier override is expected to come from
// workspace.tier later.
const defaultTierLimit = 100
|
||||
|
||||
// derefTime returns *p converted to UTC, or fallback when p is nil or points
// at the zero time. fallback is returned unchanged.
func derefTime(p *time.Time, fallback time.Time) time.Time {
	if p != nil && !p.IsZero() {
		return p.UTC()
	}
	return fallback
}
|
||||
|
||||
func decodeAndFlatten(raw json.RawMessage) (map[string]any, error) {
|
||||
if len(raw) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
var m map[string]any
|
||||
if err := json.Unmarshal(raw, &m); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if m == nil {
|
||||
return nil, nil
|
||||
}
|
||||
return schema.Flatten(m), nil
|
||||
}
|
||||
|
||||
// checkSchema looks up the recorded type per (workspace, event_type, field)
|
||||
// and rejects with 400 on conflict. New fields are recorded asynchronously --
|
||||
// we do not block the request waiting on the DB write.
|
||||
func (s *IngestService) checkSchema(ctx context.Context, workspaceID, eventType string, props map[string]any) error {
|
||||
for field, v := range props {
|
||||
dt := string(schema.Classify(v))
|
||||
if dt == string(schema.TypeNull) {
|
||||
continue
|
||||
}
|
||||
existing, err := s.schema.GetType(ctx, workspaceID, eventType, field)
|
||||
if err != nil {
|
||||
// soft-fail: don't block ingest on schema DB errors
|
||||
s.log.Warn("schema lookup failed", zap.String("field", field), zap.Error(err))
|
||||
continue
|
||||
}
|
||||
if existing == "" {
|
||||
// fire-and-forget upsert
|
||||
go func(f, t string) {
|
||||
if err := s.schema.UpsertField(context.Background(), workspaceID, eventType, f, t); err != nil {
|
||||
s.log.Warn("schema upsert failed", zap.String("field", f), zap.Error(err))
|
||||
}
|
||||
}(field, dt)
|
||||
continue
|
||||
}
|
||||
if existing != dt {
|
||||
return apperr.BadRequest("schema type conflict", field, nil)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *IngestService) toDLQ(ctx context.Context, ictx IngestContext, raw *model.RawEvent, reason, field string) error {
|
||||
return s.producer.ProduceDLQ(ctx,
|
||||
ictx.WorkspaceID, ictx.SourceID, raw.MessageID, reason, field, ictx.RawBody)
|
||||
}
|
||||
150
ingestion/ingest/internal/service/ingest_test.go
Normal file
150
ingestion/ingest/internal/service/ingest_test.go
Normal file
@@ -0,0 +1,150 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/zap"
|
||||
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/apperr"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/model"
|
||||
"github.com/dbiz/cdp/ingestion/ingest/internal/ratelimit"
|
||||
)
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Stubs -- enough surface to drive the IngestService without spinning Kafka
|
||||
// or Redis. We exercise the pipeline branches: late event, dedup hit, schema
|
||||
// conflict, happy path.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
type fakeLimiter struct{ allow bool }
|
||||
|
||||
func (f *fakeLimiter) Allow(_ context.Context, _ string, _ int, _ time.Duration) (ratelimit.Decision, error) {
|
||||
if f.allow {
|
||||
return ratelimit.Decision{Allowed: true, Remaining: 99}, nil
|
||||
}
|
||||
return ratelimit.Decision{Allowed: false, RetryAfterMS: 500}, nil
|
||||
}
|
||||
|
||||
// fakeDedup is a dedup stub that answers every check with the fixed fresh
// value and never fails.
type fakeDedup struct {
	fresh bool
}

// CheckAndSet ignores its arguments and returns f.fresh.
func (f *fakeDedup) CheckAndSet(_ context.Context, _, _ string) (bool, error) {
	return f.fresh, nil
}
|
||||
|
||||
// fakeSchema is an in-memory schema stub keyed by field name only; workspace
// and event type are ignored.
// NOTE(review): stored is not synchronized, while checkSchema calls
// UpsertField from a goroutine -- confirm tests never read and write it
// concurrently.
type fakeSchema struct {
	stored map[string]string
}

// GetType returns the recorded type for field, or "" when none is recorded.
func (f *fakeSchema) GetType(_ context.Context, _, _, field string) (string, error) {
	// Indexing a nil or missing entry yields "", the "not recorded" answer.
	return f.stored[field], nil
}

// UpsertField records dt as the type of field, creating the map on first use.
func (f *fakeSchema) UpsertField(_ context.Context, _, _, field, dt string) error {
	if f.stored == nil {
		f.stored = make(map[string]string)
	}
	f.stored[field] = dt
	return nil
}
|
||||
|
||||
// fakeProducer captures pushes so tests can assert side effects.
|
||||
type fakeProducer struct {
|
||||
produced []*model.IngestedEvent
|
||||
dlq []string // reason values
|
||||
}
|
||||
|
||||
func (f *fakeProducer) Produce(_ context.Context, ev *model.IngestedEvent) error {
|
||||
f.produced = append(f.produced, ev)
|
||||
return nil
|
||||
}
|
||||
func (f *fakeProducer) ProduceDLQ(_ context.Context, _, _, _, reason, _ string, _ []byte) error {
|
||||
f.dlq = append(f.dlq, reason)
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
func newSvc(t *testing.T, limiter *fakeLimiter, dedupSvc *fakeDedup, sch *fakeSchema) (*IngestService, *fakeProducer) {
|
||||
t.Helper()
|
||||
prod := &fakeProducer{}
|
||||
return &IngestService{
|
||||
producer: prod,
|
||||
limiter: limiter,
|
||||
dedup: dedupSvc,
|
||||
schema: sch,
|
||||
log: zap.NewNop(),
|
||||
lateAfter: 24 * time.Hour,
|
||||
}, prod
|
||||
}
|
||||
|
||||
func TestIngest_RateLimited(t *testing.T) {
|
||||
svc, _ := newSvc(t, &fakeLimiter{allow: false}, &fakeDedup{fresh: true}, &fakeSchema{})
|
||||
err := svc.Ingest(context.Background(), IngestContext{WorkspaceID: "ws"},
|
||||
&model.RawEvent{Type: model.EventTypeTrack, MessageID: "m1"})
|
||||
ae, ok := apperr.As(err)
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, 429, ae.Code)
|
||||
assert.Greater(t, ae.RetryAfter, 0)
|
||||
}
|
||||
|
||||
func TestIngest_LateEvent(t *testing.T) {
|
||||
svc, _ := newSvc(t, &fakeLimiter{allow: true}, &fakeDedup{fresh: true}, &fakeSchema{})
|
||||
old := time.Now().Add(-48 * time.Hour)
|
||||
err := svc.Ingest(context.Background(), IngestContext{WorkspaceID: "ws"},
|
||||
&model.RawEvent{Type: model.EventTypeTrack, MessageID: "m1", SentAt: &old})
|
||||
ae, ok := apperr.As(err)
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, 422, ae.Code)
|
||||
}
|
||||
|
||||
func TestIngest_DuplicateMessageSilentlyDropped(t *testing.T) {
|
||||
svc, prod := newSvc(t, &fakeLimiter{allow: true}, &fakeDedup{fresh: false}, &fakeSchema{})
|
||||
err := svc.Ingest(context.Background(), IngestContext{WorkspaceID: "ws"},
|
||||
&model.RawEvent{Type: model.EventTypeTrack, MessageID: "m1"})
|
||||
assert.NoError(t, err)
|
||||
assert.Empty(t, prod.produced, "duplicate must not be produced")
|
||||
}
|
||||
|
||||
func TestIngest_SchemaConflict(t *testing.T) {
|
||||
svc, prod := newSvc(t, &fakeLimiter{allow: true}, &fakeDedup{fresh: true},
|
||||
&fakeSchema{stored: map[string]string{"price": "string"}})
|
||||
props, _ := json.Marshal(map[string]any{"price": 9.99})
|
||||
err := svc.Ingest(context.Background(),
|
||||
IngestContext{WorkspaceID: "ws"},
|
||||
&model.RawEvent{
|
||||
Type: model.EventTypeTrack,
|
||||
MessageID: "m1",
|
||||
Properties: props,
|
||||
})
|
||||
ae, ok := apperr.As(err)
|
||||
require.True(t, ok)
|
||||
assert.Equal(t, 400, ae.Code)
|
||||
assert.Equal(t, "price", ae.Field)
|
||||
assert.Equal(t, []string{"schema_conflict"}, prod.dlq)
|
||||
assert.Empty(t, prod.produced)
|
||||
}
|
||||
|
||||
func TestIngest_HappyPath(t *testing.T) {
|
||||
svc, prod := newSvc(t, &fakeLimiter{allow: true}, &fakeDedup{fresh: true}, &fakeSchema{})
|
||||
props, _ := json.Marshal(map[string]any{"plan": "pro"})
|
||||
err := svc.Ingest(context.Background(),
|
||||
IngestContext{WorkspaceID: "ws", SourceID: "src", IP: "1.1.1.1"},
|
||||
&model.RawEvent{
|
||||
Type: model.EventTypeTrack,
|
||||
MessageID: "m1",
|
||||
AnonymousID: "anon-1",
|
||||
Event: "Signed Up",
|
||||
Properties: props,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, prod.produced, 1)
|
||||
ev := prod.produced[0]
|
||||
assert.Equal(t, "ws", ev.WorkspaceID)
|
||||
assert.Equal(t, "anon-1", ev.PartitionKey())
|
||||
assert.Equal(t, "pro", ev.Properties["plan"])
|
||||
}
|
||||
Reference in New Issue
Block a user