init ingestion

This commit is contained in:
2026-05-24 22:59:24 +07:00
commit 4e8c11d545
80 changed files with 5639 additions and 0 deletions

View File

@@ -0,0 +1,18 @@
# Build stage: installs production dependencies into /app and adds the sources.
FROM node:20-bookworm-slim AS build
WORKDIR /app
COPY package.json ./
# isolated-vm needs a build toolchain
# The toolchain is installed, used by `npm install`, and purged again inside
# this one RUN instruction; the apt package lists are removed at the end.
# NOTE(review): only /app is copied out of this stage below, so the purge
# does not affect the final image -- confirm whether it is still wanted.
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 make g++ \
&& npm install --omit=dev \
&& apt-get purge -y python3 make g++ && apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
COPY src ./src
# Runtime stage: a fresh base image that receives /app from the build stage.
FROM node:20-bookworm-slim
WORKDIR /app
COPY --from=build /app /app
ENV NODE_ENV=production
# 3401 is the port used by the README smoke test.
EXPOSE 3401
# Run the service as the unprivileged `node` user rather than root.
USER node
CMD ["node", "src/index.js"]

45
ingestion/rotor/README.md Normal file
View File

@@ -0,0 +1,45 @@
# rotor
CDP JS Functions runner. Executes user-supplied JavaScript inside V8 isolates
(via `isolated-vm`), enforcing a memory + wall-clock limit per invocation.
## Endpoints
| Method | Path | Body | Notes |
|--------|------|------|-------|
| `POST` | `/v1/run` | `{ code, event }` | run ad-hoc code on one event |
| `POST` | `/v1/transform` | `{ workspace_id, function, event }` | run a registered function |
| `POST` | `/v1/functions` | `{ workspace_id, slug, code }` | upsert function code (admin) |
| `DELETE` | `/v1/functions/:workspace/:slug` | — | invalidate cache entry |
| `GET` | `/health` | — | liveness |
| `GET` | `/ready` | — | readiness |
## User function contract
The submitted code must define a global function `transform(event)`. The
function can return:
- `event` (possibly mutated) — emit one event
- `null` / `undefined` — drop the event
- `Array<event>` — fan-out into multiple events
`transform` must return its result synchronously; `async` functions and Promise return values are not supported. The runner enforces:
- Memory limit: `ROTOR_ISOLATE_MEMORY_MB` (default 128 MB)
- Wall-clock time limit: `ROTOR_FUNCTION_TIMEOUT_MS` (default 2000 ms)
## Local dev
```bash
npm install
npm run dev
```
Smoke test:
```bash
curl -s -X POST localhost:3401/v1/run -H 'content-type: application/json' -d '{
"code": "function transform(event) { event.properties = { tagged: true }; return event; }",
"event": { "message_id": "m1", "workspace_id": "ws", "type": "track" }
}'
```

View File

@@ -0,0 +1,23 @@
{
"name": "cdp-rotor",
"version": "0.1.0",
"private": true,
"description": "CDP JS Functions runner -- V8 isolates",
"type": "module",
"main": "src/index.js",
"scripts": {
"dev": "node --enable-source-maps src/index.js",
"start": "node src/index.js",
"test": "node --test test"
},
"engines": {
"node": ">=20"
},
"dependencies": {
"fastify": "^4.28.1",
"isolated-vm": "^5.0.1",
"pino": "^9.4.0",
"zod": "^3.23.8"
},
"devDependencies": {}
}

View File

@@ -0,0 +1,126 @@
// HTTP API for rotor.
//
// Endpoints
// POST /v1/run run an ad-hoc function against a single event
// POST /v1/transform run a registered function (by workspace/slug)
// POST /v1/functions upsert function code (admin)
// DELETE /v1/functions/:workspace/:slug invalidate
// GET /health liveness
// GET /ready readiness
//
// All responses are JSON. Error bodies always carry `error`; validation
// failures add `issues`, and function failures add `kind`.
import Fastify from 'fastify';
import { z } from 'zod';
import { FunctionError, IsolateRunner } from '../runtime/isolate.js';
import { Registry } from '../registry/registry.js';
// Maximum accepted length of submitted function source (string length).
const MAX_CODE_LENGTH = 64 * 1024;

// Event types the API accepts.
const EVENT_TYPES = ['track', 'identify', 'page', 'group', 'alias', 'screen'];

// Small factories for the field shapes that repeat across the schemas below.
const optionalString = () => z.string().optional();
const optionalRecord = () => z.record(z.unknown()).optional();
const functionSource = () => z.string().min(1).max(MAX_CODE_LENGTH);

// Event envelope. `workspace_id`, `message_id` and `type` are required; every
// other listed field is optional, and unknown keys are passed through as-is.
const eventSchema = z
  .object({
    workspace_id: z.string(),
    source_id: optionalString(),
    message_id: z.string(),
    type: z.enum(EVENT_TYPES),
    anonymous_id: optionalString(),
    user_id: optionalString(),
    event: optionalString(),
    properties: optionalRecord(),
    traits: optionalRecord(),
    context: optionalRecord(),
    timestamp: optionalString(),
    sent_at: optionalString(),
    received_at: optionalString(),
  })
  .passthrough();

// Body of POST /v1/run: ad-hoc code plus the event to run it on.
const runSchema = z.object({
  code: functionSource(),
  event: eventSchema,
});

// Body of POST /v1/transform: which registered function to run, and the event.
const transformSchema = z.object({
  workspace_id: z.string(),
  function: z.string(),
  event: eventSchema,
});

// Body of POST /v1/functions: function code to store, with an optional version.
const upsertSchema = z.object({
  workspace_id: z.string(),
  slug: z.string(),
  code: functionSource(),
  version: z.number().int().optional(),
});
/**
 * Assemble the rotor HTTP API.
 *
 * @param {{ config: { isolate: object }, logger: object }} deps
 *   `config.isolate` is handed to the IsolateRunner; `logger` is handed to
 *   Fastify as its logger.
 * @returns {{ app: object, registry: Registry, runner: IsolateRunner }}
 *   The Fastify instance (not yet listening) plus the registry and runner
 *   that its routes use.
 */
export function buildServer({ config, logger }) {
  const runner = new IsolateRunner(config.isolate);
  const registry = new Registry();
  const app = Fastify({
    logger,
    bodyLimit: 1024 * 1024, // 1MB cap on incoming code/payloads
    disableRequestLogging: false,
  });

  // Parse `body` with `schema`. On failure, mark the reply as 400 and hand
  // back the error payload under `failure`; otherwise return the parsed data.
  const validate = (schema, body, reply) => {
    const outcome = schema.safeParse(body);
    if (outcome.success) {
      return { data: outcome.data };
    }
    reply.status(400);
    return {
      failure: { error: 'invalid request', issues: outcome.error.flatten() },
    };
  };

  // Run `code` on `event`; `extra` fields are appended to the success payload.
  // Any thrown error is translated into a response by handleFnError.
  const execute = async (reply, code, event, extra = {}) => {
    try {
      const result = await runner.run(code, event);
      return { result, ...extra };
    } catch (err) {
      return handleFnError(reply, err);
    }
  };

  app.get('/health', async () => ({ status: 'ok' }));
  app.get('/ready', async () => ({ status: 'ready' }));

  app.post('/v1/run', async (req, reply) => {
    const { data, failure } = validate(runSchema, req.body, reply);
    if (failure) {
      return failure;
    }
    return execute(reply, data.code, data.event);
  });

  app.post('/v1/transform', async (req, reply) => {
    const { data, failure } = validate(transformSchema, req.body, reply);
    if (failure) {
      return failure;
    }
    const { workspace_id, function: slug, event } = data;
    const entry = await registry.get(workspace_id, slug);
    if (!entry) {
      reply.status(404);
      return { error: 'function not found', workspace_id, slug };
    }
    return execute(reply, entry.code, event, { version: entry.version });
  });

  app.post('/v1/functions', async (req, reply) => {
    const { data, failure } = validate(upsertSchema, req.body, reply);
    if (failure) {
      return failure;
    }
    const { workspace_id, slug, code, version } = data;
    // A missing version is stored as 1.
    registry.set(workspace_id, slug, code, version ?? 1);
    return { ok: true };
  });

  app.delete('/v1/functions/:workspace/:slug', async (req) => {
    const { workspace, slug } = req.params;
    registry.invalidate(workspace, slug);
    return { ok: true };
  });

  return { app, registry, runner };
}
/**
 * Turn an error thrown while running a user function into an HTTP response.
 *
 * A FunctionError is the user's problem: `timeout` maps to 422 and every
 * other kind to 400, and its message and kind are returned to the caller.
 * Anything else is an unexpected failure: the caller only sees a generic
 * 500, so the error is logged here -- otherwise it would vanish without trace.
 *
 * @param {object} reply Fastify reply; its status code is set as a side effect.
 * @param {unknown} err  The value that was thrown.
 * @returns {{ error: string, kind?: string }} JSON body for the response.
 */
function handleFnError(reply, err) {
  if (err instanceof FunctionError) {
    reply.status(err.kind === 'timeout' ? 422 : 400);
    return { error: err.message, kind: err.kind };
  }
  // `reply.log` is Fastify's per-request logger; optional chaining keeps this
  // helper usable with a bare reply stand-in that has no logger attached.
  reply.log?.error({ err }, 'unexpected error while running function');
  reply.status(500);
  return { error: 'internal error' };
}

View File

@@ -0,0 +1,9 @@
// Runtime config loaded from env. Defaults mirror .env.example.

/**
 * Read an integer setting from the environment.
 *
 * An unset or blank variable (e.g. `ROTOR_PORT=` in an env file) yields
 * `fallback`. Anything else must be a plain decimal integer within
 * [min, max]; a bad value throws immediately instead of turning into NaN and
 * failing later in a less obvious place.
 *
 * @param {string} name      Environment variable name.
 * @param {number} fallback  Value used when the variable is unset or blank.
 * @param {{ min?: number, max?: number }} [bounds] Inclusive limits.
 * @returns {number}
 * @throws {Error} When the variable is set but not an integer within bounds.
 */
function readIntEnv(name, fallback, { min = 0, max = Number.MAX_SAFE_INTEGER } = {}) {
  const raw = process.env[name]?.trim();
  if (raw == null || raw === '') {
    return fallback;
  }
  // Digits only: plain parseInt would silently accept input such as "3401abc".
  const value = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN;
  if (!Number.isSafeInteger(value) || value < min || value > max) {
    throw new Error(
      `invalid ${name}=${JSON.stringify(raw)}: expected an integer between ${min} and ${max}`,
    );
  }
  return value;
}

export const config = {
  port: readIntEnv('ROTOR_PORT', 3401, { min: 0, max: 65535 }),
  // A blank log level falls back to the default as well.
  logLevel: process.env.ROTOR_LOG_LEVEL?.trim() || 'info',
  isolate: {
    // isolated-vm documents 8MB as the smallest accepted memory limit.
    memoryLimitMB: readIntEnv('ROTOR_ISOLATE_MEMORY_MB', 128, { min: 8 }),
    timeoutMs: readIntEnv('ROTOR_FUNCTION_TIMEOUT_MS', 2000, { min: 1 }),
  },
};

View File

@@ -0,0 +1,34 @@
// rotor entry point -- starts the Fastify HTTP API.
import pino from 'pino';
import { config } from './config.js';
import { buildServer } from './api/server.js';
// Process-wide pino logger; its level comes from the runtime config.
const { logLevel } = config;
const logger = pino({ level: logLevel });
// Build the HTTP API. Only the Fastify app is used in this file.
const server = buildServer({ config, logger });
const { app } = server;
/**
 * Start listening on every interface at the configured port.
 * If startup fails, the error is logged and the process exits with code 1.
 */
async function start() {
  const { port } = config;
  try {
    await app.listen({ host: '0.0.0.0', port });
    logger.info({ port }, 'rotor listening');
  } catch (err) {
    logger.error({ err }, 'rotor failed to start');
    process.exit(1);
  }
}
/**
 * Close the Fastify app in response to a termination signal.
 * Exits with code 0 once the app has closed; if closing fails, the error is
 * logged and the process exits with code 1.
 *
 * @param {string} signal Name of the signal that triggered the shutdown.
 */
function shutdown(signal) {
  const exitClean = () => process.exit(0);
  const exitFailed = (err) => {
    logger.error({ err }, 'shutdown error');
    process.exit(1);
  };

  logger.info({ signal }, 'shutdown signal received');
  app.close().then(exitClean).catch(exitFailed);
}
// Both termination signals go through the same shutdown path.
for (const signal of ['SIGINT', 'SIGTERM']) {
  process.on(signal, () => shutdown(signal));
}

start();

View File

@@ -0,0 +1,41 @@
// Function registry -- an in-memory cache of (workspace_id, slug) -> code.
//
// The console writes function code into Postgres; rotor loads it lazily on
// first miss and refreshes on pub/sub invalidation.
//
// For this scaffold we keep it simple: a Map you can preload via
// `POST /v1/functions` or set directly in tests. Pass a real Postgres-backed
// `loader` once the console exists.
/**
 * In-memory cache of function code, addressed by (workspace id, slug).
 *
 * On a cache miss the optional `loader` is consulted; whatever code it
 * returns is cached with version 1.
 */
export class Registry {
  /**
   * @param {{ loader?: (workspaceId: string, slug: string) => Promise<string|null> }} opts
   *   `loader` fetches code on a cache miss; the default loader finds nothing.
   */
  constructor(opts = {}) {
    this.loader = opts.loader ?? (async () => null);
    /** @type {Map<string, { code: string, version: number }>} */
    this.cache = new Map();
  }

  /**
   * Build the cache key for a (workspace, slug) pair.
   *
   * The pair is JSON-encoded instead of being joined with ":" because a
   * plain join is ambiguous: ("a", "b:c") and ("a:b", "c") would share a key
   * and one workspace could overwrite or read another's function.
   *
   * @param {string} workspaceId
   * @param {string} slug
   * @returns {string}
   */
  key(workspaceId, slug) {
    return JSON.stringify([String(workspaceId), String(slug)]);
  }

  /**
   * Look up a function, asking the loader on a cache miss.
   *
   * @param {string} workspaceId
   * @param {string} slug
   * @returns {Promise<{ code: string, version: number } | null>}
   *   The cached entry, or null when the loader has no code for this pair.
   */
  async get(workspaceId, slug) {
    const cacheKey = this.key(workspaceId, slug);
    const cached = this.cache.get(cacheKey);
    if (cached !== undefined) {
      return cached;
    }
    const code = await this.loader(workspaceId, slug);
    if (code == null) {
      return null;
    }
    // The loader returns code only, so loaded entries start at version 1.
    const entry = { code, version: 1 };
    this.cache.set(cacheKey, entry);
    return entry;
  }

  /**
   * Store (or replace) the code for a function.
   *
   * @param {string} workspaceId
   * @param {string} slug
   * @param {string} code
   * @param {number} [version=1]
   */
  set(workspaceId, slug, code, version = 1) {
    this.cache.set(this.key(workspaceId, slug), { code, version });
  }

  /**
   * Drop the cached entry, if any, so the next `get` goes back to the loader.
   *
   * @param {string} workspaceId
   * @param {string} slug
   */
  invalidate(workspaceId, slug) {
    this.cache.delete(this.key(workspaceId, slug));
  }
}

View File

@@ -0,0 +1,97 @@
// V8 isolate wrapper for running user-supplied JS functions safely.
//
// Each invocation:
// 1. Build a fresh isolate + context, so no state leaks between invocations.
// 2. Compile the user code inside that isolate (the registry caches only the
//    source text, so compilation happens on every invocation).
// 3. Call `transform(event)` with a deep-copied event payload.
// 4. Receive a return value (deep-copied back) within `timeoutMs`.
//
// Failures (compile error / runtime error / timeout) are surfaced as
// FunctionError so the caller can route the event to DLQ.
import ivm from 'isolated-vm';
/**
 * Error describing a failure of user-supplied function code.
 *
 * `kind` says which phase failed: 'compile' | 'runtime' | 'timeout' | 'oom'.
 * `cause` carries the underlying error, when there is one.
 */
export class FunctionError extends Error {
  /**
   * @param {string} kind     Failure category (see the class description).
   * @param {string} message  Human-readable description.
   * @param {unknown} [cause] The original error.
   */
  constructor(kind, message, cause) {
    super(message);
    Object.assign(this, { name: 'FunctionError', kind, cause });
  }
}
/**
 * Best-effort text of a thrown value, which may not be an Error at all.
 */
function describeThrown(err) {
  return typeof err?.message === 'string' ? err.message : String(err);
}

/**
 * Classify an error raised while executing user code as a FunctionError.
 * The timeout and out-of-memory cases are recognised by message text;
 * everything else counts as a runtime error.
 */
function toFunctionError(err, timeoutMs) {
  const message = describeThrown(err);
  if (message.includes('Script execution timed out')) {
    return new FunctionError('timeout', `function timed out after ${timeoutMs}ms`, err);
  }
  if (message.includes('Isolate was disposed')) {
    return new FunctionError('oom', 'isolate ran out of memory', err);
  }
  return new FunctionError('runtime', message, err);
}

/**
 * Runs user-supplied transform code, one fresh isolate per invocation.
 */
export class IsolateRunner {
  /**
   * @param {{ memoryLimitMB: number, timeoutMs: number }} options
   *   `memoryLimitMB` is passed to the isolate as its memory limit;
   *   `timeoutMs` is the timeout applied to each execution step.
   */
  constructor({ memoryLimitMB, timeoutMs }) {
    this.memoryLimitMB = memoryLimitMB;
    this.timeoutMs = timeoutMs;
  }

  /**
   * Run `code` against `event` and return the transformed value.
   * `code` must define a global function named `transform` -- e.g.:
   *
   *   function transform(event) {
   *     event.properties = { ...event.properties, tagged: true };
   *     return event;
   *   }
   *
   * The function may return:
   *   - the event (possibly mutated)
   *   - null  -- drop the event
   *   - array -- fan-out into multiple events
   *
   * @param {string} code  Source of the user function.
   * @param {object} event Event payload; it is copied into the isolate.
   * @returns {Promise<unknown>} A copy of whatever `transform` returned.
   * @throws {FunctionError} kind 'compile' when the code does not compile;
   *   kind 'timeout', 'oom' or 'runtime' when evaluating the script or
   *   calling `transform` fails.
   */
  async run(code, event) {
    const isolate = new ivm.Isolate({ memoryLimit: this.memoryLimitMB });
    try {
      const context = await isolate.createContext();
      const jail = context.global;
      // Expose the global object to user code under the name `global`.
      await jail.set('global', jail.derefInto());

      let script;
      try {
        script = await isolate.compileScript(buildHarness(code));
      } catch (err) {
        throw new FunctionError('compile', describeThrown(err), err);
      }

      // Prepared outside the guarded section below: a failure to copy the
      // event is a host-side problem, not a user-code failure.
      const input = new ivm.ExternalCopy(event).copyInto();

      try {
        // NOTE: evaluating the script and calling transform each receive the
        // full timeout, so one invocation can take up to twice `timeoutMs`.
        await script.run(context, { timeout: this.timeoutMs });
        const fn = await jail.get('__cdp_transform__', { reference: true });
        // The call itself must be classified too: an endless loop or a throw
        // inside transform() surfaces here, not in script.run().
        return await fn.apply(undefined, [input], {
          timeout: this.timeoutMs,
          result: { copy: true },
        });
      } catch (err) {
        throw toFunctionError(err, this.timeoutMs);
      }
    } finally {
      // An isolate that hit its memory limit is already disposed; disposing
      // it a second time throws and would replace the FunctionError above.
      if (!isolate.isDisposed) {
        isolate.dispose();
      }
    }
  }
}
/**
 * Surround user code with the scaffolding the runner relies on:
 *   - before it, a `console` stand-in whose methods discard their arguments
 *   - after it, a check that `transform` is a function, followed by
 *     publishing it on the global object as `__cdp_transform__`
 *
 * @param {string} code User-supplied source.
 * @returns {string} The script text to compile.
 */
function buildHarness(code) {
  const prelude = [
    '',
    'const console = {',
    'log: () => {}, warn: () => {}, error: () => {}, info: () => {},',
    '};',
  ];
  const epilogue = [
    "if (typeof transform !== 'function') {",
    'throw new Error(\'user code must define a global function named "transform"\');',
    '}',
    'global.__cdp_transform__ = transform;',
    '',
  ];
  return [...prelude, `${code}`, ...epilogue].join('\n');
}

View File

@@ -0,0 +1,51 @@
import { test } from 'node:test';
import assert from 'node:assert/strict';
import { IsolateRunner, FunctionError } from '../src/runtime/isolate.js';
// Shared fixtures: every test uses a 64MB runner and the same minimal event.
const makeRunner = (timeoutMs = 1000) => new IsolateRunner({ memoryLimitMB: 64, timeoutMs });
const sampleEvent = () => ({ message_id: 'm1', type: 'track' });
// Predicate for assert.rejects: a FunctionError of the given kind.
const isFunctionError = (kind) => (err) => err instanceof FunctionError && err.kind === kind;

test('passes event through a noop transform', async () => {
  const source = `
function transform(event) {
return event;
}
`;
  const output = await makeRunner().run(source, sampleEvent());
  assert.equal(output.message_id, 'm1');
});

test('mutates event properties', async () => {
  const source = `
function transform(event) {
event.properties = event.properties || {};
event.properties.tagged = true;
return event;
}
`;
  const output = await makeRunner().run(source, sampleEvent());
  assert.equal(output.properties.tagged, true);
});

test('returns null to drop event', async () => {
  const source = `function transform(event) { return null; }`;
  const output = await makeRunner().run(source, sampleEvent());
  assert.equal(output, null);
});

test('rejects code without transform()', async () => {
  const runner = makeRunner();
  await assert.rejects(
    () => runner.run(`const x = 1;`, sampleEvent()),
    isFunctionError('runtime'),
  );
});

test('times out infinite loops', async () => {
  const runner = makeRunner(100);
  const source = `function transform(event) { while (true) {} return event; }`;
  await assert.rejects(
    () => runner.run(source, sampleEvent()),
    isFunctionError('timeout'),
  );
});