Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .server-changes/incident-notifications.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
---
area: webapp
type: feature
---

Add an inbound webhook (`POST /webhooks/v1/betterstack-incidents`) that receives
status-page incident updates and proactively notifies customers over Slack
(channels matching a configurable name prefix), email (org admins, via the
alerts email transport), and Discord (an incoming webhook). Delivery runs on the
alerts redis-worker with per-surface jobs and is deduped on the incident update
id. Gated by `INCIDENT_NOTIFY_ENABLED` plus a shared-secret token in the webhook
URL; each surface no-ops unless its own config is present.
16 changes: 16 additions & 0 deletions apps/webapp/app/env.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1606,6 +1606,14 @@ const EnvironmentSchema = z
BETTERSTACK_API_KEY: z.string().optional(),
BETTERSTACK_STATUS_PAGE_ID: z.string().optional(),

// Incident notifications: fan a published status report out over
// Slack/email/Discord. Each surface no-ops unless configured; the unsigned
// webhook is gated by a shared secret in the URL.
INCIDENT_NOTIFY_ENABLED: z.string().default("0"),
BETTERSTACK_INCIDENT_WEBHOOK_SECRET: z.string().optional(),
Comment thread
kathiekiwi marked this conversation as resolved.
INCIDENT_NOTIFY_SLACK_CHANNEL_PREFIX: z.string().optional(),
INCIDENT_NOTIFY_DISCORD_WEBHOOK_URL: z.string().optional(),

RUN_REPLICATION_REDIS_HOST: z
.string()
.optional()
Expand Down Expand Up @@ -2010,6 +2018,14 @@ const EnvironmentSchema = z
.and(GithubAppEnvSchema)
.and(S2EnvSchema)
.superRefine((env, ctx) => {
if (env.INCIDENT_NOTIFY_ENABLED === "1" && !env.BETTERSTACK_INCIDENT_WEBHOOK_SECRET) {
ctx.addIssue({
code: z.ZodIssueCode.custom,
path: ["BETTERSTACK_INCIDENT_WEBHOOK_SECRET"],
message: "BETTERSTACK_INCIDENT_WEBHOOK_SECRET is required when INCIDENT_NOTIFY_ENABLED=1",
});
}

const presets = new Set(env.COMPUTE_TEMPLATE_MACHINE_PRESETS);
for (const required of env.COMPUTE_TEMPLATE_MACHINE_PRESETS_REQUIRED) {
if (!presets.has(required)) {
Expand Down
72 changes: 72 additions & 0 deletions apps/webapp/app/routes/webhooks.v1.betterstack-incidents.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import { type ActionFunctionArgs, json } from "@remix-run/server-runtime";
import { createHash, timingSafeEqual } from "node:crypto";
import { env } from "~/env.server";
import {
IncidentWebhookSchema,
isCustomerNotifiableEvent,
normalizeIncidentUpdate,
} from "~/services/betterstack/incidentWebhook";
import { logger } from "~/services/logger.server";
import { alertsWorker } from "~/v3/alertsWorker.server";

// Inbound BetterStack status-page webhook.
//
// BetterStack can't send custom headers, so the caller is authenticated with a
// `?token=` shared secret (redacted from logs at ingress). The route answers
// 404 while the feature is disabled or has no secret configured. Valid incident
// updates are handed to the alerts worker and acknowledged with a 200; the
// enqueue is keyed on the update id so BetterStack redeliveries are deduped.
export async function action({ request }: ActionFunctionArgs) {
  const isPost = request.method.toUpperCase() === "POST";
  if (!isPost) {
    return json({ error: "Method not allowed" }, { status: 405 });
  }

  // Feature flag off, or no secret to compare against: pretend the route does not exist.
  const expectedToken = env.BETTERSTACK_INCIDENT_WEBHOOK_SECRET;
  const enabled = env.INCIDENT_NOTIFY_ENABLED === "1";
  if (!enabled || !expectedToken) {
    return json({ error: "Not found" }, { status: 404 });
  }

  const { searchParams } = new URL(request.url);
  const presentedToken = searchParams.get("token") ?? "";
  if (!secretsMatch(presentedToken, expectedToken)) {
    return json({ error: "Invalid token" }, { status: 401 });
  }

  // Read the body as text first so malformed JSON becomes a 400 instead of a thrown error.
  const bodyText = await request.text();

  let decoded: unknown;
  try {
    decoded = JSON.parse(bodyText);
  } catch {
    return json({ error: "Invalid JSON" }, { status: 400 });
  }

  const validation = IncidentWebhookSchema.safeParse(decoded);
  if (!validation.success) {
    const { issues } = validation.error;
    logger.warn("BetterStack incident webhook: invalid payload", {
      issues,
    });
    return json({ error: "Invalid payload", issues }, { status: 400 });
  }

  const webhook = validation.data;

  // Maintenance and component-update events are not customer incidents.
  if (!isCustomerNotifiableEvent(webhook)) {
    return json({ ignored: true, reason: "non_incident_event" }, { status: 200 });
  }

  // An incident without any updates has nothing to announce yet.
  const latestUpdate = normalizeIncidentUpdate(webhook);
  if (latestUpdate === null) {
    return json({ ignored: true, reason: "no_updates" }, { status: 200 });
  }

  await alertsWorker.enqueueOnce({
    id: `incident-notify:${latestUpdate.updateId}`,
    job: "v3.fanoutIncidentNotification",
    payload: latestUpdate,
  });

  return json({ received: true }, { status: 200 });
}

// Constant-time secret comparison. Both inputs are reduced to SHA-256 digests
// first, so timingSafeEqual always receives equal-length buffers and the
// comparison does not reveal the length of either secret.
function secretsMatch(a: string, b: string): boolean {
  const sha256 = (value: string) => createHash("sha256").update(value).digest();
  return timingSafeEqual(sha256(a), sha256(b));
}
96 changes: 96 additions & 0 deletions apps/webapp/app/services/betterstack/incidentWebhook.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import { z } from "zod";

// Payload for the BetterStack status-page webhook. The endpoint is unsigned, so
// the route auths via a shared secret in the URL.

// BetterStack sends ids as numbers; accept either and normalize to string.
// The transform runs during parsing, so every id in a parsed payload is a string.
const IdSchema = z.union([z.string(), z.number()]).transform((v) => String(v));

// One entry of `incident.incident_updates`. Only `id` is required; the text and
// timestamp fields may be missing or null.
export const IncidentUpdateSchema = z.object({
id: IdSchema,
status_report_id: IdSchema.optional(),
body: z.string().nullish(),
// Timestamp string; normalizeIncidentUpdate parses it with Date.parse to find the newest update.
created_at: z.string().nullish(),
updated_at: z.string().nullish(),
});

// Top-level webhook body. Only `event_type` is mandatory; `page` and `incident`
// are both optional, and `incident_updates` defaults to an empty array when absent.
export const IncidentWebhookSchema = z.object({
event_type: z.string(),
page: z
.object({
id: IdSchema.optional(),
status_indicator: z.string().nullish(),
status_description: z.string().nullish(),
})
.optional(),
// Optional so non-incident callbacks (maintenance/component) parse and are
// ignored instead of 400ing.
incident: z
.object({
id: IdSchema,
name: z.string().nullish(),
created_at: z.string().nullish(),
updated_at: z.string().nullish(),
shortlink: z.string().nullish(),
incident_updates: z.array(IncidentUpdateSchema).default([]),
})
.optional(),
});
Comment thread
kathiekiwi marked this conversation as resolved.

/** Parsed (post-transform) shape of a webhook body accepted by IncidentWebhookSchema. */
export type IncidentWebhook = z.infer<typeof IncidentWebhookSchema>;

// Runtime (zod) counterpart of the NormalizedIncidentUpdate type; the alerts
// worker uses it as the payload schema for the incident notification jobs.
// Keep the two declarations in sync when adding or changing fields.
export const NormalizedIncidentUpdateSchema = z.object({
incidentId: z.string(),
updateId: z.string(),
name: z.string(),
statusIndicator: z.string(),
body: z.string(),
shortlink: z.string().nullable(),
updatedAt: z.string().nullable(),
});

/** Flattened view of a single incident update, as produced by normalizeIncidentUpdate. */
export type NormalizedIncidentUpdate = {
/** Id of the incident the update belongs to. */
incidentId: string;
/** The specific update id — our idempotency key. */
updateId: string;
/** Incident name; normalizeIncidentUpdate substitutes "Service incident" when it is blank. */
name: string;
/** operational | degraded | downtime | maintenance */
statusIndicator: string;
/** Trimmed update text; empty string when the update has no body. */
body: string;
/** Trimmed incident shortlink, or null when none was sent. */
shortlink: string | null;
/** The update's `created_at`, else the incident's `updated_at`, else null. */
updatedAt: string | null;
};

/**
 * Whether a webhook should notify customers: true only for published
 * "incident" events (not monitor auto-alerts) that actually carry an
 * `incident` object.
 */
export function isCustomerNotifiableEvent(payload: IncidentWebhook): boolean {
  if (payload.event_type !== "incident") {
    return false;
  }
  return Boolean(payload.incident);
}

/**
 * Reduce the webhook to its most recent update, or null if there are none.
 *
 * The newest update is chosen by `created_at` rather than by array position, so
 * the result does not depend on BetterStack's ordering. Updates whose
 * `created_at` is missing or unparseable rank as timestamp 0; on a tie the
 * update that appears first in the payload wins.
 *
 * @param payload - A webhook body already validated by IncidentWebhookSchema.
 * @returns The flattened update, or null when the payload has no incident or
 *   the incident has no updates.
 */
export function normalizeIncidentUpdate(payload: IncidentWebhook): NormalizedIncidentUpdate | null {
  const incident = payload.incident;
  if (!incident) {
    return null;
  }

  // Date.parse yields NaN for malformed input. A NaN in a sort comparator makes
  // the ordering inconsistent (a stale update could win), so collapse it to 0.
  const timestampOf = (createdAt: string | null | undefined): number => {
    const parsed = createdAt ? Date.parse(createdAt) : 0;
    return Number.isNaN(parsed) ? 0 : parsed;
  };

  const [first, ...rest] = incident.incident_updates;
  if (first === undefined) {
    return null;
  }

  // Single pass; strict `>` keeps the earliest-listed update when timestamps tie.
  let mostRecent = first;
  let mostRecentTime = timestampOf(first.created_at);
  for (const candidate of rest) {
    const candidateTime = timestampOf(candidate.created_at);
    if (candidateTime > mostRecentTime) {
      mostRecent = candidate;
      mostRecentTime = candidateTime;
    }
  }

  return {
    incidentId: incident.id,
    updateId: mostRecent.id,
    // `||` is deliberate: blank or whitespace-only strings fall back to the default too.
    name: incident.name?.trim() || "Service incident",
    statusIndicator: payload.page?.status_indicator?.trim() || "downtime",
    body: mostRecent.body?.trim() || "",
    shortlink: incident.shortlink?.trim() || null,
    updatedAt: mostRecent.created_at ?? incident.updated_at ?? null,
  };
}
28 changes: 28 additions & 0 deletions apps/webapp/app/utils/redactUrl.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Credential query params redacted from logs and traces (some webhooks can only
// auth via a URL token). Names are matched exactly (case-sensitive).
export const SENSITIVE_QUERY_PARAMS = ["token", "secret", "access_token", "api_key"];

/**
 * Replace the values of sensitive query params with a `[redacted]` placeholder.
 *
 * Accepts absolute or path+query URLs and never throws; input without a query
 * string, or without any sensitive param, is returned unchanged. When a
 * redaction happens the query is re-serialized through URLSearchParams, so the
 * placeholder appears percent-encoded (`%5Bredacted%5D`) and other params may
 * be re-encoded. A `#fragment` that follows the query is passed through
 * verbatim; it is neither parsed as part of the query nor redacted.
 *
 * @param url - The URL (or path + query) to sanitize.
 * @returns The URL with sensitive query values replaced.
 */
export function redactSensitiveQueryParams(url: string): string {
  const queryStart = url.indexOf("?");
  if (queryStart === -1) {
    return url;
  }

  // The query ends at the first `#` after the `?`. Without this split the
  // fragment is swallowed into the last param's value and mangled or dropped.
  const fragmentStart = url.indexOf("#", queryStart);
  const queryEnd = fragmentStart === -1 ? url.length : fragmentStart;

  try {
    const params = new URLSearchParams(url.slice(queryStart + 1, queryEnd));
    let didRedact = false;
    for (const key of SENSITIVE_QUERY_PARAMS) {
      if (params.has(key)) {
        // `set` also collapses repeated occurrences of the key into one entry.
        params.set(key, "[redacted]");
        didRedact = true;
      }
    }
    if (!didRedact) {
      return url;
    }
    return `${url.slice(0, queryStart)}?${params.toString()}${url.slice(queryEnd)}`;
  } catch {
    return url;
  }
}
69 changes: 69 additions & 0 deletions apps/webapp/app/v3/alertsWorker.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,17 @@ import { z } from "zod";
import { env } from "~/env.server";
import { logger } from "~/services/logger.server";
import { singleton } from "~/utils/singleton";
import { NormalizedIncidentUpdateSchema } from "~/services/betterstack/incidentWebhook";
import { DeliverAlertService } from "./services/alerts/deliverAlert.server";
import { DeliverErrorGroupAlertService } from "./services/alerts/deliverErrorGroupAlert.server";
import { ErrorAlertEvaluator } from "./services/alerts/errorAlertEvaluator.server";
import { deliverIncidentToDiscord } from "./services/alerts/incidentNotifications/deliverDiscord.server";
import {
deliverIncidentEmailPage,
deliverIncidentEmailToRecipient,
} from "./services/alerts/incidentNotifications/deliverEmail.server";
import { deliverIncidentToSlack } from "./services/alerts/incidentNotifications/deliverSlack.server";
import { fanoutIncidentNotification } from "./services/alerts/incidentNotifications/fanout.server";
import { PerformDeploymentAlertsService } from "./services/alerts/performDeploymentAlerts.server";
import { PerformTaskRunAlertsService } from "./services/alerts/performTaskRunAlerts.server";

Expand Down Expand Up @@ -93,6 +101,52 @@ function initializeWorker() {
},
logErrors: true,
},
"v3.fanoutIncidentNotification": {
schema: NormalizedIncidentUpdateSchema,
visibilityTimeoutMs: 30_000,
retry: {
maxAttempts: 3,
},
logErrors: true,
},
"v3.deliverIncidentSlack": {
schema: z.object({ update: NormalizedIncidentUpdateSchema }),
visibilityTimeoutMs: 60_000,
retry: {
maxAttempts: 3,
},
logErrors: true,
},
"v3.deliverIncidentDiscord": {
schema: z.object({ update: NormalizedIncidentUpdateSchema }),
visibilityTimeoutMs: 30_000,
retry: {
maxAttempts: 3,
},
logErrors: true,
},
"v3.deliverIncidentEmail": {
schema: z.object({
update: NormalizedIncidentUpdateSchema,
cursor: z.string().nullable(),
}),
visibilityTimeoutMs: 60_000,
retry: {
maxAttempts: 3,
},
logErrors: true,
},
"v3.deliverIncidentEmailRecipient": {
schema: z.object({
update: NormalizedIncidentUpdateSchema,
recipient: z.object({ userId: z.string(), email: z.string() }),
}),
visibilityTimeoutMs: 30_000,
retry: {
maxAttempts: 3,
},
logErrors: true,
},
},
concurrency: {
workers: env.ALERTS_WORKER_CONCURRENCY_WORKERS,
Expand Down Expand Up @@ -126,6 +180,21 @@ function initializeWorker() {
const service = new DeliverErrorGroupAlertService();
await service.call(payload);
},
"v3.fanoutIncidentNotification": async ({ payload }) => {
await fanoutIncidentNotification(payload);
},
"v3.deliverIncidentSlack": async ({ payload }) => {
await deliverIncidentToSlack(payload.update);
},
"v3.deliverIncidentDiscord": async ({ payload }) => {
await deliverIncidentToDiscord(payload.update);
},
"v3.deliverIncidentEmail": async ({ payload }) => {
await deliverIncidentEmailPage(payload);
},
"v3.deliverIncidentEmailRecipient": async ({ payload }) => {
await deliverIncidentEmailToRecipient(payload);
},
},
});

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
import { env } from "~/env.server";
import { type NormalizedIncidentUpdate } from "~/services/betterstack/incidentWebhook";
import { logger } from "~/services/logger.server";
import { buildDiscordPayload } from "./messages";

/**
 * Send the incident update to the configured Discord incoming webhook.
 *
 * Does nothing (beyond a debug log) when no webhook URL is configured. A
 * non-2xx response is raised as an Error carrying the status and the first 200
 * characters of the response body, so the calling job can retry. The request
 * is aborted after 10 seconds.
 */
export async function deliverIncidentToDiscord(update: NormalizedIncidentUpdate): Promise<void> {
  const discordUrl = env.INCIDENT_NOTIFY_DISCORD_WEBHOOK_URL;
  if (!discordUrl) {
    logger.debug("Incident Discord delivery skipped: no webhook URL configured");
    return;
  }

  const requestInit = {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(buildDiscordPayload(update)),
    signal: AbortSignal.timeout(10_000),
  };
  const response = await fetch(discordUrl, requestInit);

  if (response.ok) {
    logger.info("Incident Discord delivery complete", { updateId: update.updateId });
    return;
  }

  // Reading the error body is best-effort; a failed read still reports the status.
  const detail = await response.text().catch(() => "");
  throw new Error(`Discord webhook returned ${response.status}: ${detail.slice(0, 200)}`);
}
Loading