init ingestion
This commit is contained in:
18
ingestion/rotor/Dockerfile
Normal file
18
ingestion/rotor/Dockerfile
Normal file
@@ -0,0 +1,18 @@
|
||||
FROM node:20-bookworm-slim AS build
|
||||
WORKDIR /app
|
||||
COPY package.json ./
|
||||
# isolated-vm needs a build toolchain
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
python3 make g++ \
|
||||
&& npm install --omit=dev \
|
||||
&& apt-get purge -y python3 make g++ && apt-get autoremove -y \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
COPY src ./src
|
||||
|
||||
FROM node:20-bookworm-slim
|
||||
WORKDIR /app
|
||||
COPY --from=build /app /app
|
||||
ENV NODE_ENV=production
|
||||
EXPOSE 3401
|
||||
USER node
|
||||
CMD ["node", "src/index.js"]
|
||||
45
ingestion/rotor/README.md
Normal file
45
ingestion/rotor/README.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# rotor
|
||||
|
||||
CDP JS Functions runner. Executes user-supplied JavaScript inside V8 isolates
|
||||
(via `isolated-vm`), enforcing a memory + wall-clock limit per invocation.
|
||||
|
||||
## Endpoints
|
||||
|
||||
| Method | Path | Body | Notes |
|
||||
|--------|------|------|-------|
|
||||
| `POST` | `/v1/run` | `{ code, event }` | run ad-hoc code on one event |
|
||||
| `POST` | `/v1/transform` | `{ workspace_id, function, event }` | run a registered function |
|
||||
| `POST` | `/v1/functions` | `{ workspace_id, slug, code }` | upsert function code (admin) |
|
||||
| `DELETE` | `/v1/functions/:workspace/:slug` | — | invalidate cache entry |
|
||||
| `GET` | `/health` | — | liveness |
|
||||
| `GET` | `/ready` | — | readiness |
|
||||
|
||||
## User function contract
|
||||
|
||||
The submitted code must define a global function `transform(event)`. The
|
||||
function can return:
|
||||
|
||||
- `event` (possibly mutated) — emit one event
|
||||
- `null` / `undefined` — drop the event
|
||||
- `Array<event>` — fan-out into multiple events
|
||||
|
||||
Sync return only (no `async`). The runner enforces:
|
||||
|
||||
- Memory limit: `ROTOR_ISOLATE_MEMORY_MB` (default 128MB)
|
||||
- CPU/wall limit: `ROTOR_FUNCTION_TIMEOUT_MS` (default 2000ms)
|
||||
|
||||
## Local dev
|
||||
|
||||
```bash
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
|
||||
Smoke test:
|
||||
|
||||
```bash
|
||||
curl -s -X POST localhost:3401/v1/run -H 'content-type: application/json' -d '{
|
||||
"code": "function transform(event) { event.properties = { tagged: true }; return event; }",
|
||||
"event": { "message_id": "m1", "workspace_id": "ws", "type": "track" }
|
||||
}'
|
||||
```
|
||||
23
ingestion/rotor/package.json
Normal file
23
ingestion/rotor/package.json
Normal file
@@ -0,0 +1,23 @@
|
||||
{
|
||||
"name": "cdp-rotor",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"description": "CDP JS Functions runner -- V8 isolates",
|
||||
"type": "module",
|
||||
"main": "src/index.js",
|
||||
"scripts": {
|
||||
"dev": "node --enable-source-maps src/index.js",
|
||||
"start": "node src/index.js",
|
||||
"test": "node --test test"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=20"
|
||||
},
|
||||
"dependencies": {
|
||||
"fastify": "^4.28.1",
|
||||
"isolated-vm": "^5.0.1",
|
||||
"pino": "^9.4.0",
|
||||
"zod": "^3.23.8"
|
||||
},
|
||||
"devDependencies": {}
|
||||
}
|
||||
126
ingestion/rotor/src/api/server.js
Normal file
126
ingestion/rotor/src/api/server.js
Normal file
@@ -0,0 +1,126 @@
|
||||
// HTTP API for rotor.
|
||||
//
|
||||
// Endpoints
|
||||
// POST /v1/run run an ad-hoc function against a single event
|
||||
// POST /v1/transform run a registered function (by workspace/slug)
|
||||
// POST /v1/functions upsert function code (admin)
|
||||
// DELETE /v1/functions/:workspace/:slug invalidate
|
||||
// GET /health liveness
|
||||
// GET /ready readiness
|
||||
//
|
||||
// All responses are JSON. Error bodies always carry `error`; function failures
// add `kind`, and request-validation failures add `issues`.
|
||||
|
||||
import Fastify from 'fastify';
|
||||
import { z } from 'zod';
|
||||
import { FunctionError, IsolateRunner } from '../runtime/isolate.js';
|
||||
import { Registry } from '../registry/registry.js';
|
||||
|
||||
// Maximum accepted length of submitted function source.
const MAX_CODE_LENGTH = 64 * 1024;

// Event kinds accepted by the API.
const EVENT_TYPES = ['track', 'identify', 'page', 'group', 'alias', 'screen'];

// Small builders for the validators that repeat across the schemas below.
const optionalText = () => z.string().optional();
const optionalBag = () => z.record(z.unknown()).optional();
const functionSource = () => z.string().min(1).max(MAX_CODE_LENGTH);

// Shape of a single event. Unknown keys are kept (passthrough) rather than stripped.
const eventSchema = z
  .object({
    workspace_id: z.string(),
    source_id: optionalText(),
    message_id: z.string(),
    type: z.enum(EVENT_TYPES),
    anonymous_id: optionalText(),
    user_id: optionalText(),
    event: optionalText(),
    properties: optionalBag(),
    traits: optionalBag(),
    context: optionalBag(),
    timestamp: optionalText(),
    sent_at: optionalText(),
    received_at: optionalText(),
  })
  .passthrough();

// Body of POST /v1/run: ad-hoc source plus the event to feed it.
const runSchema = z.object({
  code: functionSource(),
  event: eventSchema,
});

// Body of POST /v1/transform: which registered function to run, and on what.
const transformSchema = z.object({
  workspace_id: z.string(),
  function: z.string(),
  event: eventSchema,
});

// Body of POST /v1/functions: source to store under (workspace_id, slug).
const upsertSchema = z.object({
  workspace_id: z.string(),
  slug: z.string(),
  code: functionSource(),
  version: z.number().int().optional(),
});
|
||||
|
||||
/**
 * Assemble the Fastify application together with the runner and registry
 * that back its routes.
 *
 * @param {{ config: { isolate: object }, logger: object }} deps
 *   `config.isolate` is handed to the IsolateRunner; `logger` is handed to Fastify.
 * @returns {{ app: object, registry: Registry, runner: IsolateRunner }}
 *   The (not yet listening) app plus the collaborators it was wired with.
 */
export function buildServer({ config, logger }) {
  const registry = new Registry();
  const runner = new IsolateRunner(config.isolate);

  const app = Fastify({
    logger,
    bodyLimit: 1024 * 1024, // 1MB cap on incoming code/payloads
    disableRequestLogging: false,
  });

  // Parse `body` with `schema`. On failure the reply is marked 400 and the
  // error payload is returned under `failure`; otherwise `data` is the parsed body.
  const validate = (schema, body, reply) => {
    const outcome = schema.safeParse(body);
    if (outcome.success) {
      return { data: outcome.data };
    }
    reply.status(400);
    return { failure: { error: 'invalid request', issues: outcome.error.flatten() } };
  };

  // Run `code` on `event` and build the success payload (`extra` keys follow
  // `result`); any thrown error is translated by handleFnError.
  const execute = async (reply, code, event, extra = {}) => {
    try {
      const result = await runner.run(code, event);
      return { result, ...extra };
    } catch (err) {
      return handleFnError(reply, err);
    }
  };

  app.get('/health', async () => ({ status: 'ok' }));
  app.get('/ready', async () => ({ status: 'ready' }));

  app.post('/v1/run', async (req, reply) => {
    const { data, failure } = validate(runSchema, req.body, reply);
    if (failure) {
      return failure;
    }
    return execute(reply, data.code, data.event);
  });

  app.post('/v1/transform', async (req, reply) => {
    const { data, failure } = validate(transformSchema, req.body, reply);
    if (failure) {
      return failure;
    }

    const { workspace_id, function: slug, event } = data;
    const entry = await registry.get(workspace_id, slug);
    if (!entry) {
      reply.status(404);
      return { error: 'function not found', workspace_id, slug };
    }
    return execute(reply, entry.code, event, { version: entry.version });
  });

  app.post('/v1/functions', async (req, reply) => {
    const { data, failure } = validate(upsertSchema, req.body, reply);
    if (failure) {
      return failure;
    }
    const { workspace_id, slug, code, version } = data;
    registry.set(workspace_id, slug, code, version ?? 1);
    return { ok: true };
  });

  app.delete('/v1/functions/:workspace/:slug', async (req) => {
    const { workspace, slug } = req.params;
    registry.invalidate(workspace, slug);
    return { ok: true };
  });

  return { app, registry, runner };
}
|
||||
|
||||
/**
 * Translate an error raised while running a user function into an HTTP
 * status plus JSON body.
 *
 * - FunctionError with kind 'timeout' -> 422
 * - any other FunctionError           -> 400
 * - anything else                     -> 500 with a generic message
 *
 * @param {object} reply Fastify reply; `status()` is called on it and, when
 *   present, `reply.log.error()` records unexpected failures.
 * @param {unknown} err The value that was thrown.
 * @returns {{ error: string, kind?: string }} Response body to send.
 */
function handleFnError(reply, err) {
  if (err instanceof FunctionError) {
    reply.status(err.kind === 'timeout' ? 422 : 400);
    return { error: err.message, kind: err.kind };
  }

  // The client only ever sees "internal error", so this log line is the sole
  // record of what actually went wrong -- never drop it silently.
  reply.log?.error({ err }, 'unexpected error while running function');
  reply.status(500);
  return { error: 'internal error' };
}
|
||||
9
ingestion/rotor/src/config.js
Normal file
9
ingestion/rotor/src/config.js
Normal file
@@ -0,0 +1,9 @@
|
||||
// Runtime config loaded from env. Defaults mirror .env.example.
|
||||
/**
 * Read an integer setting from the environment.
 *
 * An unset or blank variable yields `fallback`. Anything that is not a plain
 * run of decimal digits, or that is below `min`, is rejected with an error
 * naming the variable -- `parseInt` alone would turn such input into NaN (or
 * silently accept a prefix like "12abc") and the failure would only surface
 * later, far from its cause.
 *
 * @param {string} name Environment variable name.
 * @param {number} fallback Value used when the variable is unset or blank.
 * @param {{ min?: number }} [opts] Smallest accepted value (default 0).
 * @returns {number}
 * @throws {Error} When the variable is set to something invalid.
 */
function readIntEnv(name, fallback, { min = 0 } = {}) {
  const raw = process.env[name];
  if (raw === undefined || raw.trim() === '') {
    return fallback;
  }

  const text = raw.trim();
  const value = /^\d+$/.test(text) ? Number.parseInt(text, 10) : Number.NaN;
  if (!Number.isSafeInteger(value) || value < min) {
    throw new Error(`invalid ${name}=${JSON.stringify(raw)}: expected an integer >= ${min}`);
  }
  return value;
}

export const config = {
  port: readIntEnv('ROTOR_PORT', 3401),
  logLevel: process.env.ROTOR_LOG_LEVEL ?? 'info',
  isolate: {
    // isolated-vm documents 8MB as the smallest supported memory limit.
    memoryLimitMB: readIntEnv('ROTOR_ISOLATE_MEMORY_MB', 128, { min: 8 }),
    timeoutMs: readIntEnv('ROTOR_FUNCTION_TIMEOUT_MS', 2000, { min: 1 }),
  },
};
|
||||
34
ingestion/rotor/src/index.js
Normal file
34
ingestion/rotor/src/index.js
Normal file
@@ -0,0 +1,34 @@
|
||||
// rotor entry point -- starts the Fastify HTTP API.
|
||||
|
||||
import pino from 'pino';
|
||||
import { config } from './config.js';
|
||||
import { buildServer } from './api/server.js';
|
||||
|
||||
const logger = pino({ level: config.logLevel });
|
||||
|
||||
const { app } = buildServer({ config, logger });
|
||||
|
||||
/**
 * Bind the HTTP listener on all interfaces at the configured port.
 * A failure to bind is logged and terminates the process with status 1.
 */
async function start() {
  const { port } = config;
  try {
    await app.listen({ host: '0.0.0.0', port });
    logger.info({ port }, 'rotor listening');
  } catch (err) {
    logger.error({ err }, 'rotor failed to start');
    process.exit(1);
  }
}
|
||||
|
||||
/**
 * Close the app in response to a termination signal, then exit:
 * status 0 once the app has closed, status 1 if closing fails.
 *
 * @param {string} signal Name of the signal that triggered the shutdown.
 */
function shutdown(signal) {
  logger.info({ signal }, 'shutdown signal received');

  const exitClean = () => process.exit(0);
  const exitFailed = (err) => {
    logger.error({ err }, 'shutdown error');
    process.exit(1);
  };

  app.close().then(exitClean).catch(exitFailed);
}
|
||||
|
||||
process.on('SIGINT', () => shutdown('SIGINT'));
|
||||
process.on('SIGTERM', () => shutdown('SIGTERM'));
|
||||
|
||||
start();
|
||||
41
ingestion/rotor/src/registry/registry.js
Normal file
41
ingestion/rotor/src/registry/registry.js
Normal file
@@ -0,0 +1,41 @@
|
||||
// Function registry -- an in-memory cache of (workspace_id, slug) -> code.
|
||||
//
|
||||
// The console writes function code into Postgres; rotor loads it lazily on
|
||||
// first miss and refreshes on pub/sub invalidation.
|
||||
//
|
||||
// For this scaffold we keep it dumb: a Map you can preload via POST /v1/functions
|
||||
// or set directly in tests. Replace `loader` with a real PG loader when the
|
||||
// console exists.
|
||||
|
||||
/**
 * In-memory cache of function source keyed by (workspaceId, slug), with an
 * optional async loader consulted on a cache miss.
 */
export class Registry {
  /**
   * @param {{ loader?: (workspaceId: string, slug: string) => Promise<string|null> }} opts
   *   `loader` resolves the source for a function that is not cached yet, or
   *   null/undefined when it does not exist. Defaults to "nothing found".
   */
  constructor(opts = {}) {
    this.loader = opts.loader ?? (async () => null);
    /** @type {Map<string, { code: string, version: number }>} */
    this.cache = new Map();
  }

  /**
   * Build the cache key for a function.
   *
   * The pair is JSON-encoded rather than joined with a separator: a plain
   * `workspace:slug` join maps ("a:b", "c") and ("a", "b:c") to the same key,
   * letting one workspace read or overwrite another's function.
   *
   * @param {string} workspaceId
   * @param {string} slug
   * @returns {string}
   */
  key(workspaceId, slug) {
    return JSON.stringify([String(workspaceId), String(slug)]);
  }

  /**
   * Look up a function, falling back to the loader on a miss.
   * Source returned by the loader is cached with version 1; a miss that the
   * loader cannot resolve is not cached, so it is retried on the next call.
   *
   * @param {string} workspaceId
   * @param {string} slug
   * @returns {Promise<{ code: string, version: number } | null>}
   */
  async get(workspaceId, slug) {
    const k = this.key(workspaceId, slug);
    const cached = this.cache.get(k);
    if (cached !== undefined) {
      return cached;
    }

    const code = await this.loader(workspaceId, slug);
    if (code == null) {
      return null;
    }

    const entry = { code, version: 1 };
    this.cache.set(k, entry);
    return entry;
  }

  /**
   * Store (or replace) the source for a function.
   *
   * @param {string} workspaceId
   * @param {string} slug
   * @param {string} code
   * @param {number} [version=1]
   */
  set(workspaceId, slug, code, version = 1) {
    this.cache.set(this.key(workspaceId, slug), { code, version });
  }

  /**
   * Drop the cached entry for a function, if there is one.
   *
   * @param {string} workspaceId
   * @param {string} slug
   */
  invalidate(workspaceId, slug) {
    this.cache.delete(this.key(workspaceId, slug));
  }
}
|
||||
97
ingestion/rotor/src/runtime/isolate.js
Normal file
97
ingestion/rotor/src/runtime/isolate.js
Normal file
@@ -0,0 +1,97 @@
|
||||
// V8 isolate wrapper for running user-supplied JS functions safely.
|
||||
//
|
||||
// Each invocation:
|
||||
// 1. Build a fresh isolate + context (cheap to recycle for cold safety).
|
||||
// 2. Compile the user code inside that isolate (the registry caches only the
//    source text, so compilation happens on every invocation).
|
||||
// 3. Call `transform(event)` with a deep-copied event payload.
|
||||
// 4. Receive a return value (deep-copied back) within `timeoutMs`.
|
||||
//
|
||||
// Failures (compile error / runtime error / timeout) are surfaced as
|
||||
// FunctionError so the caller can route the event to DLQ.
|
||||
|
||||
import ivm from 'isolated-vm';
|
||||
|
||||
/**
 * Error describing a failure of user-supplied function code.
 */
export class FunctionError extends Error {
  /**
   * @param {string} kind One of 'compile' | 'runtime' | 'timeout' | 'oom'.
   * @param {string} message Human-readable description.
   * @param {unknown} [cause] The underlying error, exposed as `error.cause`.
   */
  constructor(kind, message, cause) {
    // Use the standard ErrorOptions channel so `cause` behaves like it does on
    // built-in errors instead of being an ad-hoc enumerable property.
    super(message, { cause });
    this.name = 'FunctionError';
    this.kind = kind;
  }
}
|
||||
|
||||
/**
 * Runs user-supplied `transform` functions, one fresh isolate per call.
 */
export class IsolateRunner {
  /**
   * @param {{ memoryLimitMB: number, timeoutMs: number }} limits
   *   `memoryLimitMB` is passed to the isolate as its memory limit;
   *   `timeoutMs` bounds script evaluation and the `transform` call
   *   (each phase gets the full allowance).
   */
  constructor({ memoryLimitMB, timeoutMs }) {
    this.memoryLimitMB = memoryLimitMB;
    this.timeoutMs = timeoutMs;
  }

  /**
   * Run `code` against `event` and return the transformed value.
   * `code` must define a global function named `transform` -- e.g.:
   *
   *   function transform(event) {
   *     event.properties = { ...event.properties, tagged: true };
   *     return event;
   *   }
   *
   * The function may return:
   *   - the event (possibly mutated)
   *   - null      -- drop the event
   *   - array     -- fan-out into multiple events
   *
   * @param {string} code User source.
   * @param {object} event Payload copied into the isolate.
   * @returns {Promise<unknown>} Copy of whatever `transform` returned.
   * @throws {FunctionError} kind 'compile' when the source does not compile;
   *   'timeout', 'oom' or 'runtime' when evaluating the script, calling
   *   `transform`, or copying its result back fails.
   */
  async run(code, event) {
    const isolate = new ivm.Isolate({ memoryLimit: this.memoryLimitMB });
    try {
      const context = await isolate.createContext();
      const jail = context.global;
      await jail.set('global', jail.derefInto());

      let script;
      try {
        script = await isolate.compileScript(buildHarness(code));
      } catch (err) {
        throw new FunctionError('compile', err?.message ?? String(err), err);
      }

      const payload = new ivm.ExternalCopy(event).copyInto();

      try {
        // Evaluating the script only defines `transform`; the user's logic
        // runs in `apply` below, so both steps must share the error mapping.
        await script.run(context, { timeout: this.timeoutMs });
        const fn = await jail.get('__cdp_transform__', { reference: true });
        return await fn.apply(undefined, [payload], {
          timeout: this.timeoutMs,
          result: { copy: true },
        });
      } catch (err) {
        throw this.#toFunctionError(err);
      }
    } finally {
      // The isolate may already be gone (e.g. after hitting the memory limit);
      // disposing it again would throw and mask the real failure.
      if (!isolate.isDisposed) {
        isolate.dispose();
      }
    }
  }

  /**
   * Classify a failure raised while executing user code. User code can throw
   * any value, so `err` is not assumed to be an Error.
   *
   * @param {unknown} err
   * @returns {FunctionError}
   */
  #toFunctionError(err) {
    const message = err?.message ?? String(err);
    if (message.includes('Script execution timed out')) {
      return new FunctionError('timeout', `function timed out after ${this.timeoutMs}ms`, err);
    }
    if (message.includes('Isolate was disposed')) {
      return new FunctionError('oom', 'isolate ran out of memory', err);
    }
    return new FunctionError('runtime', message, err);
  }
}
|
||||
|
||||
/**
 * Build the script source that is evaluated inside the isolate.
 *
 * The user code is sandwiched between:
 *  - a prelude declaring a `console` whose log/warn/error/info discard
 *    their arguments, and
 *  - an epilogue that throws unless `transform` is a function, then
 *    publishes it as `global.__cdp_transform__`.
 *
 * @param {string} code User source.
 * @returns {string} Complete script text.
 */
function buildHarness(code) {
  const prelude = [
    'const console = {',
    'log: () => {}, warn: () => {}, error: () => {}, info: () => {},',
    '};',
  ];
  const epilogue = [
    "if (typeof transform !== 'function') {",
    'throw new Error(\'user code must define a global function named "transform"\');',
    '}',
    'global.__cdp_transform__ = transform;',
  ];
  // Leading and trailing empty entries keep the first and last newline of the script.
  return ['', ...prelude, `${code}`, ...epilogue, ''].join('\n');
}
|
||||
51
ingestion/rotor/test/isolate.test.js
Normal file
51
ingestion/rotor/test/isolate.test.js
Normal file
@@ -0,0 +1,51 @@
|
||||
import { test } from 'node:test';
|
||||
import assert from 'node:assert/strict';
|
||||
import { IsolateRunner, FunctionError } from '../src/runtime/isolate.js';
|
||||
|
||||
// Fixtures shared by the cases below. Each call returns a fresh value so no
// state leaks from one test into the next.
const makeRunner = (timeoutMs = 1000) => new IsolateRunner({ memoryLimitMB: 64, timeoutMs });
const sampleEvent = () => ({ message_id: 'm1', type: 'track' });
const source = (...lines) => lines.join('\n');
const failsWithKind = (kind) => (err) => err instanceof FunctionError && err.kind === kind;

test('passes event through a noop transform', async () => {
  const code = source('', 'function transform(event) {', 'return event;', '}', '');
  const got = await makeRunner().run(code, sampleEvent());
  assert.equal(got.message_id, 'm1');
});

test('mutates event properties', async () => {
  const code = source(
    '',
    'function transform(event) {',
    'event.properties = event.properties || {};',
    'event.properties.tagged = true;',
    'return event;',
    '}',
    '',
  );
  const got = await makeRunner().run(code, sampleEvent());
  assert.equal(got.properties.tagged, true);
});

test('returns null to drop event', async () => {
  const got = await makeRunner().run('function transform(event) { return null; }', sampleEvent());
  assert.equal(got, null);
});

test('rejects code without transform()', async () => {
  // The harness throws while the script is evaluated, which is reported as a runtime failure.
  await assert.rejects(() => makeRunner().run('const x = 1;', sampleEvent()), failsWithKind('runtime'));
});

test('times out infinite loops', async () => {
  const code = 'function transform(event) { while (true) {} return event; }';
  await assert.rejects(() => makeRunner(100).run(code, sampleEvent()), failsWithKind('timeout'));
});
|
||||
Reference in New Issue
Block a user