From 6bfb56b98fb9ccb3f4d884911bbdf46e00a25d03 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Sun, 1 Mar 2026 20:19:13 +0000 Subject: [PATCH 01/12] First commit for Errors feature --- .../app/components/logs/LogsSearchInput.tsx | 12 +- .../app/components/navigation/SideMenu.tsx | 10 + .../v3/ErrorGroupPresenter.server.ts | 191 ++++++++++ .../v3/ErrorsListPresenter.server.ts | 247 +++++++++++++ .../route.tsx | 287 +++++++++++++++ .../route.tsx | 335 ++++++++++++++++++ ...Param.env.$envParam.errors.$fingerprint.ts | 51 +++ ...ects.$projectParam.env.$envParam.errors.ts | 67 ++++ .../services/runsReplicationService.server.ts | 10 +- apps/webapp/app/utils/errorFingerprinting.ts | 91 +++++ apps/webapp/app/utils/pathBuilder.ts | 17 + apps/webapp/test/errorFingerprinting.test.ts | 327 +++++++++++++++++ ..._add_error_fingerprint_to_task_runs_v2.sql | 11 + .../schema/022_create_errors_v1_table.sql | 83 +++++ .../clickhouse/src/client/queryBuilder.ts | 38 ++ internal-packages/clickhouse/src/errors.ts | 206 +++++++++++ internal-packages/clickhouse/src/index.ts | 16 + internal-packages/clickhouse/src/taskRuns.ts | 4 + 18 files changed, 1997 insertions(+), 6 deletions(-) create mode 100644 apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts create mode 100644 apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts create mode 100644 apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx create mode 100644 apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx create mode 100644 apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint.ts create mode 100644 apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts create mode 100644 apps/webapp/app/utils/errorFingerprinting.ts create mode 100644 apps/webapp/test/errorFingerprinting.test.ts create mode 100644 
internal-packages/clickhouse/schema/021_add_error_fingerprint_to_task_runs_v2.sql create mode 100644 internal-packages/clickhouse/schema/022_create_errors_v1_table.sql create mode 100644 internal-packages/clickhouse/src/errors.ts diff --git a/apps/webapp/app/components/logs/LogsSearchInput.tsx b/apps/webapp/app/components/logs/LogsSearchInput.tsx index 58316cead88..44f4d130185 100644 --- a/apps/webapp/app/components/logs/LogsSearchInput.tsx +++ b/apps/webapp/app/components/logs/LogsSearchInput.tsx @@ -3,12 +3,14 @@ import { motion } from "framer-motion"; import { useCallback, useEffect, useRef, useState } from "react"; import { Input } from "~/components/primitives/Input"; import { ShortcutKey } from "~/components/primitives/ShortcutKey"; -import { cn } from "~/utils/cn"; -import { useOptimisticLocation } from "~/hooks/useOptimisticLocation"; import { useSearchParams } from "~/hooks/useSearchParam"; +import { cn } from "~/utils/cn"; + +export type LogsSearchInputProps = { + placeholder?: string; +}; -export function LogsSearchInput() { - const location = useOptimisticLocation(); +export function LogsSearchInput({ placeholder = "Search logs…" }: LogsSearchInputProps) { const inputRef = useRef(null); const { value, replace, del } = useSearchParams(); @@ -61,7 +63,7 @@ export function LogsSearchInput() { type="text" ref={inputRef} variant="secondary-small" - placeholder="Search logs…" + placeholder={placeholder} value={text} onChange={(e) => setText(e.target.value)} fullWidth diff --git a/apps/webapp/app/components/navigation/SideMenu.tsx b/apps/webapp/app/components/navigation/SideMenu.tsx index 8817360aa32..3ed99b2f82f 100644 --- a/apps/webapp/app/components/navigation/SideMenu.tsx +++ b/apps/webapp/app/components/navigation/SideMenu.tsx @@ -73,6 +73,7 @@ import { v3EnvironmentPath, v3EnvironmentVariablesPath, v3LogsPath, + v3ErrorsPath, v3ProjectAlertsPath, v3ProjectPath, v3ProjectSettingsGeneralPath, @@ -474,6 +475,15 @@ export function SideMenu({ 
isCollapsed={isCollapsed} /> )} + >; +export type ErrorInstance = ErrorGroupDetail["instances"][0]; + +// Cursor for error instances pagination +type ErrorInstanceCursor = { + createdAt: string; + runId: string; +}; + +const ErrorInstanceCursorSchema = z.object({ + createdAt: z.string(), + runId: z.string(), +}); + +function encodeCursor(cursor: ErrorInstanceCursor): string { + return Buffer.from(JSON.stringify(cursor)).toString("base64"); +} + +function decodeCursor(cursor: string): ErrorInstanceCursor | null { + try { + const decoded = Buffer.from(cursor, "base64").toString("utf-8"); + const parsed = JSON.parse(decoded); + const validated = ErrorInstanceCursorSchema.safeParse(parsed); + if (!validated.success) { + return null; + } + return validated.data as ErrorInstanceCursor; + } catch { + return null; + } +} + +export class ErrorGroupPresenter extends BasePresenter { + constructor( + private readonly replica: PrismaClientOrTransaction, + private readonly clickhouse: ClickHouse + ) { + super(undefined, replica); + } + + public async call( + organizationId: string, + environmentId: string, + { + userId, + projectId, + fingerprint, + cursor, + pageSize = DEFAULT_PAGE_SIZE, + }: ErrorGroupOptions + ) { + const displayableEnvironment = await findDisplayableEnvironment(environmentId, userId); + + if (!displayableEnvironment) { + throw new ServiceValidationError("No environment found"); + } + + // Use the error instances query builder + const queryBuilder = this.clickhouse.errors.instancesQueryBuilder(); + + // Apply filters + queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); + queryBuilder.where("project_id = {projectId: String}", { projectId }); + queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); + queryBuilder.where("error_fingerprint = {errorFingerprint: String}", { + errorFingerprint: fingerprint, + }); + queryBuilder.where("_is_deleted = 0"); + + // Cursor-based pagination + const 
decodedCursor = cursor ? decodeCursor(cursor) : null; + if (decodedCursor) { + queryBuilder.where( + `(created_at < {cursorCreatedAt: String} OR (created_at = {cursorCreatedAt: String} AND run_id < {cursorRunId: String}))`, + { + cursorCreatedAt: decodedCursor.createdAt, + cursorRunId: decodedCursor.runId, + } + ); + } + + queryBuilder.orderBy("created_at DESC, run_id DESC"); + queryBuilder.limit(pageSize + 1); + + const [queryError, records] = await queryBuilder.execute(); + + if (queryError) { + throw queryError; + } + + const results = records || []; + const hasMore = results.length > pageSize; + const instances = results.slice(0, pageSize); + + // Build next cursor from the last item + let nextCursor: string | undefined; + if (hasMore && instances.length > 0) { + const lastInstance = instances[instances.length - 1]; + nextCursor = encodeCursor({ + createdAt: lastInstance.created_at, + runId: lastInstance.run_id, + }); + } + + // Get error group summary from the first instance + let errorGroup: + | { + errorType: string; + errorMessage: string; + stackTrace?: string; + } + | undefined; + + if (instances.length > 0) { + const firstInstance = instances[0]; + try { + const errorData = JSON.parse(firstInstance.error_text); + errorGroup = { + errorType: errorData.type || errorData.name || "Error", + errorMessage: errorData.message || "Unknown error", + stackTrace: errorData.stack || errorData.stacktrace, + }; + } catch { + // If parsing fails, use fallback + errorGroup = { + errorType: "Error", + errorMessage: firstInstance.error_text.substring(0, 200), + }; + } + } + + // Transform results + const transformedInstances = instances.map((instance) => { + let parsedError: any; + try { + parsedError = JSON.parse(instance.error_text); + } catch { + parsedError = { message: instance.error_text }; + } + + return { + runId: instance.run_id, + friendlyId: instance.friendly_id, + taskIdentifier: instance.task_identifier, + createdAt: new Date(parseInt(instance.created_at) * 
1000), + status: instance.status, + error: parsedError, + traceId: instance.trace_id, + taskVersion: instance.task_version, + }; + }); + + return { + errorGroup, + instances: transformedInstances, + pagination: { + hasMore, + nextCursor, + }, + }; + } +} diff --git a/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts b/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts new file mode 100644 index 00000000000..c12a8d320b4 --- /dev/null +++ b/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts @@ -0,0 +1,247 @@ +import { z } from "zod"; +import { type ClickHouse } from "@internal/clickhouse"; +import { type PrismaClientOrTransaction } from "@trigger.dev/database"; +import { type Direction } from "~/components/ListPagination"; +import { timeFilterFromTo } from "~/components/runs/v3/SharedFilters"; +import { findDisplayableEnvironment } from "~/models/runtimeEnvironment.server"; +import { getAllTaskIdentifiers } from "~/models/task.server"; +import { ServiceValidationError } from "~/v3/services/baseService.server"; +import { BasePresenter } from "~/presenters/v3/basePresenter.server"; + +export type ErrorsListOptions = { + userId?: string; + projectId: string; + // filters + tasks?: string[]; + period?: string; + from?: number; + to?: number; + defaultPeriod?: string; + retentionLimitDays?: number; + // search + search?: string; + // pagination + direction?: Direction; + cursor?: string; + pageSize?: number; +}; + +export const ErrorsListOptionsSchema = z.object({ + userId: z.string().optional(), + projectId: z.string(), + tasks: z.array(z.string()).optional(), + period: z.string().optional(), + from: z.number().int().nonnegative().optional(), + to: z.number().int().nonnegative().optional(), + defaultPeriod: z.string().optional(), + retentionLimitDays: z.number().int().positive().optional(), + search: z.string().max(1000).optional(), + direction: z.enum(["forward", "backward"]).optional(), + cursor: z.string().optional(), + pageSize: 
z.number().int().positive().max(1000).optional(), +}); + +const DEFAULT_PAGE_SIZE = 50; + +export type ErrorsList = Awaited>; +export type ErrorGroup = ErrorsList["errorGroups"][0]; +export type ErrorsListAppliedFilters = ErrorsList["filters"]; + +// Cursor for error groups pagination +type ErrorGroupCursor = { + lastSeen: string; + fingerprint: string; +}; + +const ErrorGroupCursorSchema = z.object({ + lastSeen: z.string(), + fingerprint: z.string(), +}); + +function encodeCursor(cursor: ErrorGroupCursor): string { + return Buffer.from(JSON.stringify(cursor)).toString("base64"); +} + +function decodeCursor(cursor: string): ErrorGroupCursor | null { + try { + const decoded = Buffer.from(cursor, "base64").toString("utf-8"); + const parsed = JSON.parse(decoded); + const validated = ErrorGroupCursorSchema.safeParse(parsed); + if (!validated.success) { + return null; + } + return validated.data as ErrorGroupCursor; + } catch { + return null; + } +} + +function escapeClickHouseString(val: string): string { + return val.replace(/\\/g, "\\\\").replace(/\//g, "\\/").replace(/%/g, "\\%").replace(/_/g, "\\_"); +} + +export class ErrorsListPresenter extends BasePresenter { + constructor( + private readonly replica: PrismaClientOrTransaction, + private readonly clickhouse: ClickHouse + ) { + super(undefined, replica); + } + + public async call( + organizationId: string, + environmentId: string, + { + userId, + projectId, + tasks, + period, + search, + from, + to, + cursor, + pageSize = DEFAULT_PAGE_SIZE, + defaultPeriod, + retentionLimitDays, + }: ErrorsListOptions + ) { + const time = timeFilterFromTo({ + period, + from, + to, + defaultPeriod: defaultPeriod ?? 
"7d", + }); + + let effectiveFrom = time.from; + let effectiveTo = time.to; + + // Apply retention limit if provided + let wasClampedByRetention = false; + if (retentionLimitDays !== undefined && effectiveFrom) { + const retentionCutoffDate = new Date(Date.now() - retentionLimitDays * 24 * 60 * 60 * 1000); + + if (effectiveFrom < retentionCutoffDate) { + effectiveFrom = retentionCutoffDate; + wasClampedByRetention = true; + } + } + + const hasFilters = + (tasks !== undefined && tasks.length > 0) || + (search !== undefined && search !== "") || + !time.isDefault; + + const possibleTasksAsync = getAllTaskIdentifiers(this.replica, environmentId); + + const [possibleTasks, displayableEnvironment] = await Promise.all([ + possibleTasksAsync, + findDisplayableEnvironment(environmentId, userId), + ]); + + if (!displayableEnvironment) { + throw new ServiceValidationError("No environment found"); + } + + // Calculate days parameter for ClickHouse query + const now = new Date(); + const daysAgo = effectiveFrom + ? 
Math.ceil((now.getTime() - effectiveFrom.getTime()) / (1000 * 60 * 60 * 24)) + : 30; + + // Query the pre-aggregated errors_v1 table + const queryBuilder = this.clickhouse.errors.listQueryBuilder(); + + // Apply base WHERE filters + queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); + queryBuilder.where("project_id = {projectId: String}", { projectId }); + queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); + + // Group by error_fingerprint to merge partial aggregations + queryBuilder.groupBy("error_fingerprint"); + + // Apply HAVING filters (filters on aggregated columns) + // Time range filter - use last_seen_date regular column instead of aggregate + queryBuilder.having("max(last_seen_date) >= now() - INTERVAL {days: Int64} DAY", { days: daysAgo }); + + // Task filter + if (tasks && tasks.length > 0) { + queryBuilder.having("anyMerge(sample_task_identifier) IN {tasks: Array(String)}", { tasks }); + } + + // Search filter - searches in error type and message + if (search && search.trim() !== "") { + const searchTerm = escapeClickHouseString(search.trim()).toLowerCase(); + queryBuilder.having( + "(lower(any(error_type)) like {searchPattern: String} OR lower(any(error_message)) like {searchPattern: String})", + { + searchPattern: `%${searchTerm}%`, + } + ); + } + + // Cursor-based pagination + const decodedCursor = cursor ? 
decodeCursor(cursor) : null; + if (decodedCursor) { + queryBuilder.having( + "(last_seen < {cursorLastSeen: String} OR (last_seen = {cursorLastSeen: String} AND error_fingerprint < {cursorFingerprint: String}))", + { + cursorLastSeen: decodedCursor.lastSeen, + cursorFingerprint: decodedCursor.fingerprint, + } + ); + } + + queryBuilder.orderBy("last_seen DESC, error_fingerprint DESC"); + queryBuilder.limit(pageSize + 1); + + const [queryError, records] = await queryBuilder.execute(); + + if (queryError) { + throw queryError; + } + + const results = records || []; + const hasMore = results.length > pageSize; + const errorGroups = results.slice(0, pageSize); + + // Build next cursor from the last item + let nextCursor: string | undefined; + if (hasMore && errorGroups.length > 0) { + const lastError = errorGroups[errorGroups.length - 1]; + nextCursor = encodeCursor({ + lastSeen: lastError.last_seen, + fingerprint: lastError.error_fingerprint, + }); + } + + // Transform results + const transformedErrorGroups = errorGroups.map((error) => ({ + errorType: error.error_type, + errorMessage: error.error_message, + fingerprint: error.error_fingerprint, + firstSeen: new Date(parseInt(error.first_seen) * 1000), + lastSeen: new Date(parseInt(error.last_seen) * 1000), + count: error.occurrence_count, + affectedTasks: error.affected_tasks, + sampleRunId: error.sample_run_id, + sampleFriendlyId: error.sample_friendly_id, + sampleTaskIdentifier: error.sample_task_identifier, + })); + + return { + errorGroups: transformedErrorGroups, + pagination: { + hasMore, + nextCursor, + }, + filters: { + tasks, + search, + period: time, + hasFilters, + possibleTasks, + wasClampedByRetention, + }, + }; + } +} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx new file mode 100644 index 
00000000000..68216267555 --- /dev/null +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx @@ -0,0 +1,287 @@ +import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; +import { type MetaFunction, Link } from "@remix-run/react"; +import { ArrowLeftIcon } from "@heroicons/react/20/solid"; +import { ServiceValidationError } from "~/v3/services/baseService.server"; +import { + TypedAwait, + typeddefer, + type UseDataFunctionReturn, + useTypedLoaderData, +} from "remix-typedjson"; +import { requireUser } from "~/services/session.server"; +import { EnvironmentParamSchema, v3ErrorsPath, v3RunPath } from "~/utils/pathBuilder"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; +import { + ErrorGroupPresenter, + type ErrorInstance, +} from "~/presenters/v3/ErrorGroupPresenter.server"; +import { $replica } from "~/db.server"; +import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; +import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; +import { PageBody, PageContainer } from "~/components/layout/AppLayout"; +import { Suspense } from "react"; +import { Spinner } from "~/components/primitives/Spinner"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import { Callout } from "~/components/primitives/Callout"; +import { Button } from "~/components/primitives/Buttons"; +import { Badge } from "~/components/primitives/Badge"; +import { Header2, Header3 } from "~/components/primitives/Headers"; +import { formatDistanceToNow } from "date-fns"; +import { cn } from "~/utils/cn"; + +export const meta: MetaFunction = ({ data }) => { + return [ + { + title: `Error Details | Trigger.dev`, + }, + ]; +}; + +export const loader = async ({ request, params }: LoaderFunctionArgs) => { + const user = await requireUser(request); + const userId = user.id; + + const { 
projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); + const fingerprint = params.fingerprint; + + if (!fingerprint) { + throw new Response("Fingerprint parameter is required", { status: 400 }); + } + + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) { + throw new Response("Project not found", { status: 404 }); + } + + const environment = await findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) { + throw new Response("Environment not found", { status: 404 }); + } + + const presenter = new ErrorGroupPresenter($replica, logsClickhouseClient); + + const detailPromise = presenter + .call(project.organizationId, environment.id, { + userId, + projectId: project.id, + fingerprint, + }) + .catch((error) => { + if (error instanceof ServiceValidationError) { + return { error: error.message }; + } + throw error; + }); + + return typeddefer({ + data: detailPromise, + organizationSlug, + projectParam, + envParam, + fingerprint, + }); +}; + +export default function Page() { + const { data, organizationSlug, projectParam, envParam } = useTypedLoaderData(); + + const errorsPath = v3ErrorsPath( + { slug: organizationSlug }, + { slug: projectParam }, + { slug: envParam } + ); + + return ( + + +
+ + + + +
+
+ + + +
+ + Loading error details… +
+ + } + > + + + Unable to load error details. Please refresh the page or try again in a moment. + + + } + > + {(result) => { + // Check if result contains an error + if ("error" in result) { + return ( +
+ + {result.error} + +
+ ); + } + return ( + + ); + }} +
+
+
+
+ ); +} + +function ErrorGroupDetail({ + errorGroup, + instances, + organizationSlug, + projectParam, + envParam, +}: { + errorGroup: + | { + errorType: string; + errorMessage: string; + stackTrace?: string; + } + | undefined; + instances: ErrorInstance[]; + organizationSlug: string; + projectParam: string; + envParam: string; +}) { + if (!errorGroup) { + return ( +
+
+ Error not found + + This error group does not exist or has no instances. + +
+
+ ); + } + + return ( +
+ {/* Error Summary */} +
+
+ + {errorGroup.errorType} + + {errorGroup.errorMessage} +
+ + {errorGroup.stackTrace && ( +
+ + Stack Trace + +
+              {errorGroup.stackTrace}
+            
+
+ )} +
+ + {/* Instances List */} +
+ Error Instances ({instances.length.toLocaleString()}) + + {instances.length === 0 ? ( + No error instances found. + ) : ( +
+ {instances.map((instance) => ( + + ))} +
+ )} +
+
+ ); +} + +function ErrorInstanceRow({ + instance, + organizationSlug, + projectParam, + envParam, +}: { + instance: ErrorInstance; + organizationSlug: string; + projectParam: string; + envParam: string; +}) { + const runPath = v3RunPath( + { slug: organizationSlug }, + { slug: projectParam }, + { slug: envParam }, + { friendlyId: instance.friendlyId } + ); + + return ( + +
+
+
+ {instance.friendlyId} + {instance.status} +
+ + Task: {instance.taskIdentifier} + + + {formatDistanceToNow(instance.createdAt, { addSuffix: true })} • Version:{" "} + {instance.taskVersion} + +
+
+ + {/* Show error details if available */} + {instance.error && typeof instance.error === "object" && "message" in instance.error && ( +
+ + {String(instance.error.message)} + +
+ )} + + ); +} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx new file mode 100644 index 00000000000..fab365857d9 --- /dev/null +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx @@ -0,0 +1,335 @@ +import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; +import { type MetaFunction, Form, Link } from "@remix-run/react"; +import { XMarkIcon } from "@heroicons/react/20/solid"; +import { ServiceValidationError } from "~/v3/services/baseService.server"; +import { + TypedAwait, + typeddefer, + type UseDataFunctionReturn, + useTypedLoaderData, +} from "remix-typedjson"; +import { requireUser } from "~/services/session.server"; +import { getCurrentPlan } from "~/services/platform.v3.server"; +import { EnvironmentParamSchema, v3ErrorPath } from "~/utils/pathBuilder"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; +import { ErrorsListPresenter, type ErrorGroup } from "~/presenters/v3/ErrorsListPresenter.server"; +import { $replica } from "~/db.server"; +import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; +import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; +import { PageBody, PageContainer } from "~/components/layout/AppLayout"; +import { Suspense, useMemo } from "react"; +import { useOptimisticLocation } from "~/hooks/useOptimisticLocation"; +import { Spinner } from "~/components/primitives/Spinner"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import { Callout } from "~/components/primitives/Callout"; +import { LogsSearchInput } from "~/components/logs/LogsSearchInput"; +import { LogsTaskFilter } from "~/components/logs/LogsTaskFilter"; +import { TimeFilter } from 
"~/components/runs/v3/SharedFilters"; +import { Button } from "~/components/primitives/Buttons"; +import { Badge } from "~/components/primitives/Badge"; +import { Header1, Header3 } from "~/components/primitives/Headers"; +import { formatDistanceToNow } from "date-fns"; +import { cn } from "~/utils/cn"; + +export const meta: MetaFunction = () => { + return [ + { + title: `Errors | Trigger.dev`, + }, + ]; +}; + +export const loader = async ({ request, params }: LoaderFunctionArgs) => { + const user = await requireUser(request); + const userId = user.id; + + const { projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); + + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) { + throw new Response("Project not found", { status: 404 }); + } + + const environment = await findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) { + throw new Response("Environment not found", { status: 404 }); + } + + // Get filters from query params + const url = new URL(request.url); + const tasks = url.searchParams.getAll("tasks").filter((t) => t.length > 0); + const search = url.searchParams.get("search") ?? undefined; + const period = url.searchParams.get("period") ?? undefined; + const fromStr = url.searchParams.get("from"); + const toStr = url.searchParams.get("to"); + const from = fromStr ? parseInt(fromStr, 10) : undefined; + const to = toStr ? parseInt(toStr, 10) : undefined; + + // Get the user's plan to determine retention limit + const plan = await getCurrentPlan(project.organizationId); + const retentionLimitDays = plan?.v3Subscription?.plan?.limits.logRetentionDays.number ?? 30; + + const presenter = new ErrorsListPresenter($replica, logsClickhouseClient); + + const listPromise = presenter + .call(project.organizationId, environment.id, { + userId, + projectId: project.id, + tasks: tasks.length > 0 ? 
tasks : undefined, + search, + period, + from, + to, + defaultPeriod: "7d", + retentionLimitDays, + }) + .catch((error) => { + if (error instanceof ServiceValidationError) { + return { error: error.message }; + } + throw error; + }); + + return typeddefer({ + data: listPromise, + defaultPeriod: "7d", + retentionLimitDays, + organizationSlug, + projectParam, + envParam, + }); +}; + +export default function Page() { + const { data, defaultPeriod, retentionLimitDays, organizationSlug, projectParam, envParam } = + useTypedLoaderData(); + + return ( + + + + + + + +
+
+
+ + Loading errors… +
+
+
+ } + > + + +
+ + Unable to load errors. Please refresh the page or try again in a moment. + +
+ + } + > + {(result) => { + // Check if result contains an error + if ("error" in result) { + return ( +
+ +
+ + {result.error} + +
+
+ ); + } + return ( +
+ + +
+ ); + }} +
+
+
+
+ ); +} + +function FiltersBar({ + list, + defaultPeriod, + retentionLimitDays, +}: { + list?: Exclude["data"]>, { error: string }>; + defaultPeriod?: string; + retentionLimitDays: number; +}) { + const location = useOptimisticLocation(); + const searchParams = new URLSearchParams(location.search); + const hasFilters = + searchParams.has("tasks") || + searchParams.has("search") || + searchParams.has("period") || + searchParams.has("from") || + searchParams.has("to"); + + return ( +
+
+ {list ? ( + <> + + + + {hasFilters && ( +
+
+
+ ); +} + +function ErrorsList({ + errorGroups, + organizationSlug, + projectParam, + envParam, +}: { + errorGroups: ErrorGroup[]; + organizationSlug: string; + projectParam: string; + envParam: string; +}) { + if (errorGroups.length === 0) { + return ( +
+
+ No errors found + + No errors have been recorded in the selected time period. + +
+
+ ); + } + + return ( +
+
+ {errorGroups.map((errorGroup) => ( + + ))} +
+
+ ); +} + +function ErrorGroupRow({ + errorGroup, + organizationSlug, + projectParam, + envParam, +}: { + errorGroup: ErrorGroup; + organizationSlug: string; + projectParam: string; + envParam: string; +}) { + const errorPath = v3ErrorPath( + { slug: organizationSlug }, + { slug: projectParam }, + { slug: envParam }, + { fingerprint: errorGroup.fingerprint } + ); + + return ( + +
+
+
+ {errorGroup.errorType} + + {errorGroup.affectedTasks} task{errorGroup.affectedTasks !== 1 ? "s" : ""} + +
+ {errorGroup.errorMessage} +
+ + First seen: {formatDistanceToNow(errorGroup.firstSeen, { addSuffix: true })} + + + Last seen: {formatDistanceToNow(errorGroup.lastSeen, { addSuffix: true })} + + Sample: {errorGroup.sampleTaskIdentifier} +
+
+
+ + {errorGroup.count.toLocaleString()} + + + occurrences + +
+
+ + ); +} diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint.ts b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint.ts new file mode 100644 index 00000000000..6230e7ac697 --- /dev/null +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint.ts @@ -0,0 +1,51 @@ +import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; +import { json } from "@remix-run/node"; +import { requireUser } from "~/services/session.server"; +import { EnvironmentParamSchema } from "~/utils/pathBuilder"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; +import { ErrorGroupPresenter, ErrorGroupOptionsSchema } from "~/presenters/v3/ErrorGroupPresenter.server"; +import { $replica } from "~/db.server"; +import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; + +export const loader = async ({ request, params }: LoaderFunctionArgs) => { + const user = await requireUser(request); + const userId = user.id; + + const { projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); + const fingerprint = params.fingerprint; + + if (!fingerprint) { + throw new Response("Fingerprint parameter is required", { status: 400 }); + } + + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) { + throw new Response("Project not found", { status: 404 }); + } + + const environment = await findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) { + throw new Response("Environment not found", { status: 404 }); + } + + // Get pagination from query params + const url = new URL(request.url); + const cursor = url.searchParams.get("cursor") ?? 
undefined; + + const options = ErrorGroupOptionsSchema.parse({ + userId, + projectId: project.id, + fingerprint, + cursor, + }) as any; // Validated by ErrorGroupOptionsSchema at runtime + + const presenter = new ErrorGroupPresenter($replica, logsClickhouseClient); + const result = await presenter.call(project.organizationId, environment.id, options); + + return json({ + errorGroup: result.errorGroup, + instances: result.instances, + pagination: result.pagination, + }); +}; diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts new file mode 100644 index 00000000000..fd36003e8f3 --- /dev/null +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts @@ -0,0 +1,67 @@ +import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; +import { json } from "@remix-run/node"; +import { requireUser } from "~/services/session.server"; +import { EnvironmentParamSchema } from "~/utils/pathBuilder"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; +import { ErrorsListPresenter, ErrorsListOptionsSchema } from "~/presenters/v3/ErrorsListPresenter.server"; +import { $replica } from "~/db.server"; +import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; +import { getCurrentPlan } from "~/services/platform.v3.server"; + +export const loader = async ({ request, params }: LoaderFunctionArgs) => { + const user = await requireUser(request); + const userId = user.id; + + const { projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); + + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) { + throw new Response("Project not found", { status: 404 }); + } + + const environment = await 
findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) { + throw new Response("Environment not found", { status: 404 }); + } + + // Get the user's plan to determine retention limit + const plan = await getCurrentPlan(project.organizationId); + const retentionLimitDays = plan?.v3Subscription?.plan?.limits.logRetentionDays.number ?? 30; + + // Get filters from query params + const url = new URL(request.url); + const tasks = url.searchParams.getAll("tasks").filter((t) => t.length > 0); + const search = url.searchParams.get("search") ?? undefined; + const cursor = url.searchParams.get("cursor") ?? undefined; + const period = url.searchParams.get("period") ?? undefined; + const fromStr = url.searchParams.get("from"); + const toStr = url.searchParams.get("to"); + let from = fromStr ? parseInt(fromStr, 10) : undefined; + let to = toStr ? parseInt(toStr, 10) : undefined; + + if (Number.isNaN(from)) from = undefined; + if (Number.isNaN(to)) to = undefined; + + const options = ErrorsListOptionsSchema.parse({ + userId, + projectId: project.id, + tasks: tasks.length > 0 ? 
tasks : undefined, + search, + cursor, + period, + from, + to, + defaultPeriod: "7d", + retentionLimitDays, + }) as any; // Validated by ErrorsListOptionsSchema at runtime + + const presenter = new ErrorsListPresenter($replica, logsClickhouseClient); + const result = await presenter.call(project.organizationId, environment.id, options); + + return json({ + errorGroups: result.errorGroups, + pagination: result.pagination, + filters: result.filters, + }); +}; diff --git a/apps/webapp/app/services/runsReplicationService.server.ts b/apps/webapp/app/services/runsReplicationService.server.ts index faa0f3d9822..17ae3718e90 100644 --- a/apps/webapp/app/services/runsReplicationService.server.ts +++ b/apps/webapp/app/services/runsReplicationService.server.ts @@ -27,6 +27,7 @@ import { nanoid } from "nanoid"; import EventEmitter from "node:events"; import pLimit from "p-limit"; import { detectBadJsonStrings } from "~/utils/detectBadJsonStrings"; +import { calculateErrorFingerprint } from "~/utils/errorFingerprinting"; interface TransactionEvent { tag: "insert" | "update" | "delete"; @@ -852,6 +853,12 @@ export class RunsReplicationService { _version: bigint ): Promise { const output = await this.#prepareJson(run.output, run.outputType); + const errorData = { data: run.error }; + + // Calculate error fingerprint for failed runs + const errorFingerprint = (['SYSTEM_FAILURE', 'CRASHED', 'INTERRUPTED', 'COMPLETED_WITH_ERRORS'].includes(run.status)) + ? calculateErrorFingerprint(run.error) + : ''; // Return array matching TASK_RUN_COLUMNS order return [ @@ -880,7 +887,8 @@ export class RunsReplicationService { run.costInCents ?? 0, // cost_in_cents run.baseCostInCents ?? 0, // base_cost_in_cents output, // output - { data: run.error }, // error + errorData, // error + errorFingerprint, // error_fingerprint run.runTags ?? [], // tags run.taskVersion ?? "", // task_version run.sdkVersion ?? 
"", // sdk_version diff --git a/apps/webapp/app/utils/errorFingerprinting.ts b/apps/webapp/app/utils/errorFingerprinting.ts new file mode 100644 index 00000000000..d58b723f0e9 --- /dev/null +++ b/apps/webapp/app/utils/errorFingerprinting.ts @@ -0,0 +1,91 @@ +import { createHash } from "node:crypto"; + +/** + * Calculate error fingerprint using Sentry-style normalization. + * Groups similar errors together by normalizing dynamic values. + */ +export function calculateErrorFingerprint(error: unknown): string { + if (!error || typeof error !== "object") return ""; + + const errorObj = error as any; + const errorType = errorObj.type || errorObj.name || "Error"; + const message = errorObj.message || ""; + const stack = errorObj.stack || errorObj.stacktrace || ""; + + // Normalize message to group similar errors + const normalizedMessage = normalizeErrorMessage(message); + + // Extract and normalize first few stack frames + const normalizedStack = normalizeStackTrace(stack); + + // Create fingerprint from type + normalized message + stack + const fingerprintInput = `${errorType}:${normalizedMessage}:${normalizedStack}`; + + // Use SHA-256 hash, take first 16 chars for compact storage + return createHash("sha256").update(fingerprintInput).digest("hex").substring(0, 16); +} + +/** + * Normalize error message by replacing dynamic values with placeholders. + * This allows similar errors to be grouped together. 
+ */ +export function normalizeErrorMessage(message: string): string { + if (!message) return ""; + + return ( + message + // UUIDs (8-4-4-4-12 format) + .replace( + /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, + "" + ) + // Run IDs (run_xxxxx format) + .replace(/run_[a-zA-Z0-9]+/g, "") + // Task run friendly IDs (task_xxxxx or similar) + .replace(/\b[a-z]+_[a-zA-Z0-9]{8,}\b/g, "") + // Standalone numeric IDs (4+ digits) + .replace(/\b\d{4,}\b/g, "") + // ISO 8601 timestamps + .replace(/\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?Z?/g, "") + // Unix timestamps (10 or 13 digits) + .replace(/\b\d{10,13}\b/g, "") + // File paths (Unix style) + .replace(/(?:\/[^\/\s]+){2,}/g, "") + // File paths (Windows style) + .replace(/[A-Z]:\\(?:[^\\]+\\)+[^\\]+/g, "") + // Email addresses + .replace(/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g, "") + // URLs + .replace(/https?:\/\/[^\s]+/g, "") + // Memory addresses (0x...) + .replace(/0x[0-9a-fA-F]{8,}/g, "") + // Quoted strings with dynamic content + .replace(/"[^"]{20,}"/g, '""') + .replace(/'[^']{20,}'/g, "''") + ); +} + +/** + * Normalize stack trace by taking first few frames and removing dynamic parts. 
+ */ +export function normalizeStackTrace(stack: string): string { + if (!stack) return ""; + + // Take first 5 stack frames only + const lines = stack.split("\n").slice(0, 5); + + return lines + .map((line) => { + // Remove line and column numbers (file.ts:123:45 -> file.ts:_:_) + line = line.replace(/:\d+:\d+/g, ":_:_"); + // Remove standalone numbers + line = line.replace(/\b\d+\b/g, "_"); + // Remove file paths but keep filename + line = line.replace(/(?:\/[^\/\s]+)+\/([^\/\s]+)/g, "$1"); + // Normalize whitespace + line = line.trim(); + return line; + }) + .filter((line) => line.length > 0) + .join("|"); +} diff --git a/apps/webapp/app/utils/pathBuilder.ts b/apps/webapp/app/utils/pathBuilder.ts index c39234a7bbb..e8ed3e9b61a 100644 --- a/apps/webapp/app/utils/pathBuilder.ts +++ b/apps/webapp/app/utils/pathBuilder.ts @@ -527,6 +527,23 @@ export function v3LogsPath( return `${v3EnvironmentPath(organization, project, environment)}/logs`; } +export function v3ErrorsPath( + organization: OrgForPath, + project: ProjectForPath, + environment: EnvironmentForPath +) { + return `${v3EnvironmentPath(organization, project, environment)}/errors`; +} + +export function v3ErrorPath( + organization: OrgForPath, + project: ProjectForPath, + environment: EnvironmentForPath, + error: { fingerprint: string } +) { + return `${v3ErrorsPath(organization, project, environment)}/${error.fingerprint}`; +} + export function v3DeploymentsPath( organization: OrgForPath, project: ProjectForPath, diff --git a/apps/webapp/test/errorFingerprinting.test.ts b/apps/webapp/test/errorFingerprinting.test.ts new file mode 100644 index 00000000000..7f72eb6ed15 --- /dev/null +++ b/apps/webapp/test/errorFingerprinting.test.ts @@ -0,0 +1,327 @@ +import { describe, it, expect } from "vitest"; +import { + calculateErrorFingerprint, + normalizeErrorMessage, + normalizeStackTrace, +} from "~/utils/errorFingerprinting"; + +describe("normalizeErrorMessage", () => { + it("should normalize UUIDs", () => { + 
const message = "Error processing user 550e8400-e29b-41d4-a716-446655440000"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Error processing user "); + }); + + it("should normalize run IDs", () => { + const message = "Failed to execute run_abcd1234xyz"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Failed to execute "); + }); + + it("should normalize task friendly IDs", () => { + const message = "Task task_abc12345678 failed"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Task failed"); + }); + + it("should normalize numeric IDs (4+ digits)", () => { + const message = "User 12345 not found"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("User not found"); + }); + + it("should not normalize short numbers", () => { + const message = "Retry attempt 3 of 5"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Retry attempt 3 of 5"); + }); + + it("should normalize ISO 8601 timestamps", () => { + const message = "Event at 2024-03-01T15:30:45Z failed"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Event at failed"); + }); + + it("should normalize ISO timestamps with milliseconds", () => { + const message = "Timeout at 2024-03-01T15:30:45.123Z"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Timeout at "); + }); + + it("should normalize Unix timestamps", () => { + const message = "Created at 1234567890"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Created at "); + }); + + it("should normalize Unix timestamps (milliseconds)", () => { + const message = "Created at 1234567890123"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Created at "); + }); + + it("should normalize Unix file paths", () => { + const message = "Cannot read /home/user/project/file.ts"; + const normalized = 
normalizeErrorMessage(message); + expect(normalized).toBe("Cannot read "); + }); + + it("should normalize Windows file paths", () => { + const message = "Cannot read C:\\Users\\John\\project\\file.ts"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Cannot read "); + }); + + it("should normalize email addresses", () => { + const message = "Email user@example.com already exists"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Email already exists"); + }); + + it("should normalize URLs", () => { + const message = "Failed to fetch https://api.example.com/users/123"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Failed to fetch "); + }); + + it("should normalize HTTP URLs", () => { + const message = "Request to http://localhost:3000/api failed"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Request to failed"); + }); + + it("should normalize memory addresses", () => { + const message = "Segfault at 0x7fff5fbffab0"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Segfault at "); + }); + + it("should normalize long quoted strings", () => { + const message = 'Error: "this is a very long error message with dynamic content that changes"'; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe('Error: ""'); + }); + + it("should handle multiple replacements", () => { + const message = + "User 12345 at user@example.com failed to access run_abc123 at 2024-03-01T15:30:45Z"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("User at failed to access at "); + }); + + it("should return empty string for empty input", () => { + expect(normalizeErrorMessage("")).toBe(""); + }); + + it("should handle messages with no dynamic content", () => { + const message = "Connection timeout"; + const normalized = normalizeErrorMessage(message); + expect(normalized).toBe("Connection 
timeout"); + }); +}); + +describe("normalizeStackTrace", () => { + it("should normalize line and column numbers", () => { + const stack = `Error: Test error + at functionName (file.ts:123:45) + at anotherFunction (other.ts:67:89)`; + const normalized = normalizeStackTrace(stack); + expect(normalized).toContain(":_:_"); + expect(normalized).not.toContain(":123:45"); + }); + + it("should remove standalone numbers", () => { + const stack = `Error: Test + at Object. (/path/to/file.ts:123:45) + at Module._compile (node:internal/modules/cjs/loader:456:78)`; + const normalized = normalizeStackTrace(stack); + expect(normalized).not.toMatch(/\b\d+\b/); + }); + + it("should keep only first 5 frames", () => { + const stack = `Error: Test + at frame1 (file1.ts:1:1) + at frame2 (file2.ts:2:2) + at frame3 (file3.ts:3:3) + at frame4 (file4.ts:4:4) + at frame5 (file5.ts:5:5) + at frame6 (file6.ts:6:6) + at frame7 (file7.ts:7:7)`; + const normalized = normalizeStackTrace(stack); + const frames = normalized.split("|"); + expect(frames.length).toBeLessThanOrEqual(5); + }); + + it("should remove file paths but keep filenames", () => { + const stack = `Error: Test + at functionName (/home/user/project/src/file.ts:123:45)`; + const normalized = normalizeStackTrace(stack); + expect(normalized).toContain("file.ts"); + expect(normalized).not.toContain("/home/user/project/src/"); + }); + + it("should filter out empty lines", () => { + const stack = `Error: Test + + at functionName (file.ts:123:45) + + at anotherFunction (other.ts:67:89)`; + const normalized = normalizeStackTrace(stack); + const frames = normalized.split("|").filter((f) => f.length > 0); + expect(frames.length).toBeLessThanOrEqual(3); + }); + + it("should return empty string for empty stack", () => { + expect(normalizeStackTrace("")).toBe(""); + }); + + it("should join frames with pipe delimiter", () => { + const stack = `Error: Test + at frame1 (file1.ts:1:1) + at frame2 (file2.ts:2:2)`; + const normalized = 
normalizeStackTrace(stack); + expect(normalized).toContain("|"); + }); +}); + +describe("calculateErrorFingerprint", () => { + it("should generate consistent fingerprints for same error", () => { + const error = { + type: "DatabaseError", + message: "Connection timeout", + stack: "at db.connect (db.ts:123:45)", + }; + const fp1 = calculateErrorFingerprint(error); + const fp2 = calculateErrorFingerprint(error); + expect(fp1).toBe(fp2); + expect(fp1.length).toBe(16); + }); + + it("should generate same fingerprint for errors with different IDs", () => { + const error1 = { + type: "NotFoundError", + message: "User 12345 not found", + stack: "at findUser (user.ts:50:10)", + }; + const error2 = { + type: "NotFoundError", + message: "User 67890 not found", + stack: "at findUser (user.ts:50:10)", + }; + const fp1 = calculateErrorFingerprint(error1); + const fp2 = calculateErrorFingerprint(error2); + expect(fp1).toBe(fp2); + }); + + it("should generate same fingerprint for errors with different UUIDs", () => { + const error1 = { + type: "ValidationError", + message: "Invalid token 550e8400-e29b-41d4-a716-446655440000", + }; + const error2 = { + type: "ValidationError", + message: "Invalid token 123e4567-e89b-12d3-a456-426614174000", + }; + expect(calculateErrorFingerprint(error1)).toBe(calculateErrorFingerprint(error2)); + }); + + it("should generate same fingerprint for errors with different run IDs", () => { + const error1 = { + type: "TaskError", + message: "Failed to execute run_abc123", + }; + const error2 = { + type: "TaskError", + message: "Failed to execute run_xyz789", + }; + expect(calculateErrorFingerprint(error1)).toBe(calculateErrorFingerprint(error2)); + }); + + it("should generate different fingerprints for different error types", () => { + const error1 = { + type: "DatabaseError", + message: "Connection failed", + }; + const error2 = { + type: "NetworkError", + message: "Connection failed", + }; + 
expect(calculateErrorFingerprint(error1)).not.toBe(calculateErrorFingerprint(error2)); + }); + + it("should generate different fingerprints for different error messages", () => { + const error1 = { + type: "Error", + message: "Connection timeout", + }; + const error2 = { + type: "Error", + message: "Connection refused", + }; + expect(calculateErrorFingerprint(error1)).not.toBe(calculateErrorFingerprint(error2)); + }); + + it("should handle error with name instead of type", () => { + const error = { + name: "TypeError", + message: "Cannot read property 'foo' of undefined", + }; + const fp = calculateErrorFingerprint(error); + expect(fp).toBeTruthy(); + expect(fp.length).toBe(16); + }); + + it("should handle error with stacktrace instead of stack", () => { + const error = { + type: "Error", + message: "Test error", + stacktrace: "at test (file.ts:1:1)", + }; + const fp = calculateErrorFingerprint(error); + expect(fp).toBeTruthy(); + }); + + it("should return empty string for non-object error", () => { + expect(calculateErrorFingerprint(null)).toBe(""); + expect(calculateErrorFingerprint(undefined)).toBe(""); + expect(calculateErrorFingerprint("error string")).toBe(""); + expect(calculateErrorFingerprint(123)).toBe(""); + }); + + it("should handle errors with no message or stack", () => { + const error = { + type: "Error", + }; + const fp = calculateErrorFingerprint(error); + expect(fp).toBeTruthy(); + expect(fp.length).toBe(16); + }); + + it("should generate fingerprints using stack trace when available", () => { + const error1 = { + type: "Error", + message: "Test", + stack: "at funcA (a.ts:1:1)\nat funcB (b.ts:2:2)", + }; + const error2 = { + type: "Error", + message: "Test", + stack: "at funcX (x.ts:1:1)\nat funcY (y.ts:2:2)", + }; + expect(calculateErrorFingerprint(error1)).not.toBe(calculateErrorFingerprint(error2)); + }); + + it("should normalize line numbers in stack traces for same code location", () => { + const error1 = { + type: "Error", + message: "Test", 
+ stack: "at func (file.ts:123:45)", + }; + const error2 = { + type: "Error", + message: "Test", + stack: "at func (file.ts:456:78)", + }; + expect(calculateErrorFingerprint(error1)).toBe(calculateErrorFingerprint(error2)); + }); +}); diff --git a/internal-packages/clickhouse/schema/021_add_error_fingerprint_to_task_runs_v2.sql b/internal-packages/clickhouse/schema/021_add_error_fingerprint_to_task_runs_v2.sql new file mode 100644 index 00000000000..f702b91d3b9 --- /dev/null +++ b/internal-packages/clickhouse/schema/021_add_error_fingerprint_to_task_runs_v2.sql @@ -0,0 +1,11 @@ +-- +goose Up +ALTER TABLE trigger_dev.task_runs_v2 + ADD COLUMN error_fingerprint String DEFAULT ''; + +-- Bloom filter index for fast error fingerprint lookups +ALTER TABLE trigger_dev.task_runs_v2 + ADD INDEX idx_error_fingerprint error_fingerprint TYPE bloom_filter GRANULARITY 4; + +-- +goose Down +ALTER TABLE trigger_dev.task_runs_v2 DROP INDEX idx_error_fingerprint; +ALTER TABLE trigger_dev.task_runs_v2 DROP COLUMN error_fingerprint; diff --git a/internal-packages/clickhouse/schema/022_create_errors_v1_table.sql b/internal-packages/clickhouse/schema/022_create_errors_v1_table.sql new file mode 100644 index 00000000000..1a63e58c4f6 --- /dev/null +++ b/internal-packages/clickhouse/schema/022_create_errors_v1_table.sql @@ -0,0 +1,83 @@ +-- +goose Up + +-- Aggregated error groups table +CREATE TABLE trigger_dev.errors_v1 +( + organization_id String, + project_id String, + environment_id String, + error_fingerprint String, + + -- Error details (samples from occurrences) + error_type String, + error_message String, + sample_stack_trace String, + + -- TTL tracking column (regular column for TTL - stores max created_at) + last_seen_date DateTime64(3), + + -- Aggregated statistics using AggregateFunction + first_seen AggregateFunction(min, DateTime64(3)), + last_seen AggregateFunction(max, DateTime64(3)), + occurrence_count AggregateFunction(sum, UInt64), + affected_tasks 
AggregateFunction(uniq, String), + affected_task_versions AggregateFunction(uniq, String), + + -- Samples for debugging + sample_run_id AggregateFunction(any, String), + sample_friendly_id AggregateFunction(any, String), + sample_task_identifier AggregateFunction(any, String), + + -- Status distribution + status_distribution AggregateFunction(sumMap, Array(String), Array(UInt64)) +) +ENGINE = AggregatingMergeTree() +PARTITION BY organization_id +ORDER BY (organization_id, project_id, environment_id, error_fingerprint) +TTL last_seen_date + INTERVAL 90 DAY +SETTINGS index_granularity = 8192; + +-- Materialized view to auto-populate from task_runs_v2 +CREATE MATERIALIZED VIEW trigger_dev.mv_errors_v1 +TO trigger_dev.errors_v1 +AS +SELECT + organization_id, + project_id, + environment_id, + error_fingerprint, + + -- Use any() for sample values + any(coalesce(JSONExtractString(error_text, 'type'), JSONExtractString(error_text, 'name'), 'Error')) as error_type, + any(coalesce(substring(JSONExtractString(error_text, 'message'), 1, 500), 'Unknown error')) as error_message, + any(coalesce(substring(JSONExtractString(error_text, 'stack'), 1, 2000), '')) as sample_stack_trace, + + -- Regular column for TTL tracking + max(created_at) as last_seen_date, + + -- Aggregate functions with State combinator + minState(created_at) as first_seen, + maxState(created_at) as last_seen, + sumState(toUInt64(1)) as occurrence_count, + uniqState(task_identifier) as affected_tasks, + uniqState(task_version) as affected_task_versions, + + anyState(run_id) as sample_run_id, + anyState(friendly_id) as sample_friendly_id, + anyState(task_identifier) as sample_task_identifier, + + sumMapState([status], [toUInt64(1)]) as status_distribution +FROM trigger_dev.task_runs_v2 +WHERE + error_fingerprint != '' + AND status IN ('SYSTEM_FAILURE', 'CRASHED', 'INTERRUPTED', 'COMPLETED_WITH_ERRORS') + AND _is_deleted = 0 +GROUP BY + organization_id, + project_id, + environment_id, + error_fingerprint; + +-- 
+goose Down +DROP VIEW IF EXISTS trigger_dev.mv_errors_v1; +DROP TABLE IF EXISTS trigger_dev.errors_v1; diff --git a/internal-packages/clickhouse/src/client/queryBuilder.ts b/internal-packages/clickhouse/src/client/queryBuilder.ts index e802fc11bf3..dc0fb297cc9 100644 --- a/internal-packages/clickhouse/src/client/queryBuilder.ts +++ b/internal-packages/clickhouse/src/client/queryBuilder.ts @@ -13,6 +13,7 @@ export class ClickhouseQueryBuilder { private name: string; private baseQuery: string; private whereClauses: string[] = []; + private havingClauses: string[] = []; private params: QueryParams = {}; private orderByClause: string | null = null; private limitClause: string | null = null; @@ -69,6 +70,21 @@ export class ClickhouseQueryBuilder { return this; } + having(clause: string, params?: QueryParams): this { + this.havingClauses.push(clause); + if (params) { + Object.assign(this.params, params); + } + return this; + } + + havingIf(condition: any, clause: string, params?: QueryParams): this { + if (condition) { + this.having(clause, params); + } + return this; + } + orderBy(clause: string): this { this.orderByClause = clause; return this; @@ -101,6 +117,9 @@ export class ClickhouseQueryBuilder { if (this.groupByClause) { query += ` GROUP BY ${this.groupByClause}`; } + if (this.havingClauses.length > 0) { + query += " HAVING " + this.havingClauses.join(" AND "); + } if (this.orderByClause) { query += ` ORDER BY ${this.orderByClause}`; } @@ -119,6 +138,7 @@ export class ClickhouseQueryFastBuilder> { private settings: ClickHouseSettings | undefined; private prewhereClauses: string[] = []; private whereClauses: string[] = []; + private havingClauses: string[] = []; private params: QueryParams = {}; private orderByClause: string | null = null; private limitClause: string | null = null; @@ -191,6 +211,21 @@ export class ClickhouseQueryFastBuilder> { return this; } + having(clause: string, params?: QueryParams): this { + this.havingClauses.push(clause); + if (params) { 
+ Object.assign(this.params, params); + } + return this; + } + + havingIf(condition: any, clause: string, params?: QueryParams): this { + if (condition) { + this.having(clause, params); + } + return this; + } + orderBy(clause: string): this { this.orderByClause = clause; return this; @@ -225,6 +260,9 @@ export class ClickhouseQueryFastBuilder> { if (this.groupByClause) { query += ` GROUP BY ${this.groupByClause}`; } + if (this.havingClauses.length > 0) { + query += " HAVING " + this.havingClauses.join(" AND "); + } if (this.orderByClause) { query += ` ORDER BY ${this.orderByClause}`; } diff --git a/internal-packages/clickhouse/src/errors.ts b/internal-packages/clickhouse/src/errors.ts new file mode 100644 index 00000000000..3f3ed21142a --- /dev/null +++ b/internal-packages/clickhouse/src/errors.ts @@ -0,0 +1,206 @@ +import { ClickHouseSettings } from "@clickhouse/client"; +import { z } from "zod"; +import { ClickhouseReader } from "./client/types.js"; + +export const ErrorGroupsListQueryResult = z.object({ + error_fingerprint: z.string(), + error_type: z.string(), + error_message: z.string(), + first_seen: z.string(), + last_seen: z.string(), + occurrence_count: z.number(), + affected_tasks: z.number(), + sample_run_id: z.string(), + sample_friendly_id: z.string(), + sample_task_identifier: z.string(), +}); + +export type ErrorGroupsListQueryResult = z.infer; + +/** + * Gets a query builder for listing error groups from the pre-aggregated errors_v1 table. + * Allows flexible filtering and pagination. 
+ */ +export function getErrorGroupsListQueryBuilder( + ch: ClickhouseReader, + settings?: ClickHouseSettings +) { + return ch.queryBuilder({ + name: "getErrorGroupsList", + baseQuery: ` + SELECT + error_fingerprint, + any(error_type) as error_type, + any(error_message) as error_message, + toString(minMerge(first_seen)) as first_seen, + toString(maxMerge(last_seen)) as last_seen, + toUInt64(sumMerge(occurrence_count)) as occurrence_count, + toUInt64(uniqMerge(affected_tasks)) as affected_tasks, + anyMerge(sample_run_id) as sample_run_id, + anyMerge(sample_friendly_id) as sample_friendly_id, + anyMerge(sample_task_identifier) as sample_task_identifier + FROM trigger_dev.errors_v1 + `, + schema: ErrorGroupsListQueryResult, + settings, + }); +} + +export const ErrorGroupQueryResult = z.object({ + error_fingerprint: z.string(), + error_type: z.string(), + error_message: z.string(), + first_seen: z.string(), + last_seen: z.string(), + occurrence_count: z.number(), + affected_tasks: z.number(), + sample_run_id: z.string(), + sample_friendly_id: z.string(), + sample_task_identifier: z.string(), +}); + +export type ErrorGroupQueryResult = z.infer; + +export const ErrorGroupQueryParams = z.object({ + organizationId: z.string(), + projectId: z.string(), + environmentId: z.string(), + days: z.number().int().default(30), + limit: z.number().int().default(50), + offset: z.number().int().default(0), +}); + +export type ErrorGroupQueryParams = z.infer; + +/** + * Gets error groups from the pre-aggregated errors_v1 table. + * Much faster than on-the-fly aggregation. 
+ */ +export function getErrorGroups(ch: ClickhouseReader, settings?: ClickHouseSettings) { + return ch.query({ + name: "getErrorGroups", + query: ` + SELECT + error_fingerprint, + any(error_type) as error_type, + any(error_message) as error_message, + toString(minMerge(first_seen)) as first_seen, + toString(maxMerge(last_seen)) as last_seen, + toUInt64(sumMerge(occurrence_count)) as occurrence_count, + toUInt64(uniqMerge(affected_tasks)) as affected_tasks, + anyMerge(sample_run_id) as sample_run_id, + anyMerge(sample_friendly_id) as sample_friendly_id, + anyMerge(sample_task_identifier) as sample_task_identifier + FROM trigger_dev.errors_v1 + WHERE + organization_id = {organizationId: String} + AND project_id = {projectId: String} + AND environment_id = {environmentId: String} + AND maxMerge(last_seen) >= now() - INTERVAL {days: Int64} DAY + GROUP BY error_fingerprint + ORDER BY last_seen DESC + LIMIT {limit: Int64} + OFFSET {offset: Int64} + `, + schema: ErrorGroupQueryResult, + params: ErrorGroupQueryParams, + settings, + }); +} + +export const ErrorInstanceQueryResult = z.object({ + run_id: z.string(), + friendly_id: z.string(), + task_identifier: z.string(), + created_at: z.string(), + status: z.string(), + error_text: z.string(), + trace_id: z.string(), + task_version: z.string(), +}); + +export type ErrorInstanceQueryResult = z.infer; + +export const ErrorInstanceQueryParams = z.object({ + organizationId: z.string(), + projectId: z.string(), + environmentId: z.string(), + errorFingerprint: z.string(), + limit: z.number().int().default(50), + offset: z.number().int().default(0), +}); + +export type ErrorInstanceQueryParams = z.infer; + +export const ErrorInstancesListQueryResult = z.object({ + run_id: z.string(), + friendly_id: z.string(), + task_identifier: z.string(), + created_at: z.string(), + status: z.string(), + error_text: z.string(), + trace_id: z.string(), + task_version: z.string(), +}); + +export type ErrorInstancesListQueryResult = z.infer; + 
+/** + * Gets a query builder for listing error instances from task_runs_v2. + * Allows flexible filtering and pagination for runs with a specific error fingerprint. + */ +export function getErrorInstancesListQueryBuilder( + ch: ClickhouseReader, + settings?: ClickHouseSettings +) { + return ch.queryBuilder({ + name: "getErrorInstancesList", + baseQuery: ` + SELECT + run_id, + friendly_id, + task_identifier, + toString(created_at) as created_at, + status, + error_text, + trace_id, + task_version + FROM trigger_dev.task_runs_v2 FINAL + `, + schema: ErrorInstancesListQueryResult, + settings, + }); +} + +/** + * Gets individual run instances for a specific error fingerprint. + */ +export function getErrorInstances(ch: ClickhouseReader, settings?: ClickHouseSettings) { + return ch.query({ + name: "getErrorInstances", + query: ` + SELECT + run_id, + friendly_id, + task_identifier, + toString(created_at) as created_at, + status, + error_text, + trace_id, + task_version + FROM trigger_dev.task_runs_v2 FINAL + WHERE + organization_id = {organizationId: String} + AND project_id = {projectId: String} + AND environment_id = {environmentId: String} + AND error_fingerprint = {errorFingerprint: String} + AND _is_deleted = 0 + ORDER BY created_at DESC + LIMIT {limit: Int64} + OFFSET {offset: Int64} + `, + schema: ErrorInstanceQueryResult, + params: ErrorInstanceQueryParams, + settings, + }); +} diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index b66ce8e3ed6..cfb2bd097f9 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -27,6 +27,12 @@ import { getLogsSearchListQueryBuilder, } from "./taskEvents.js"; import { insertMetrics } from "./metrics.js"; +import { + getErrorGroups, + getErrorInstances, + getErrorGroupsListQueryBuilder, + getErrorInstancesListQueryBuilder, +} from "./errors.js"; import { Logger, type LogLevel } from "@trigger.dev/core/logger"; import type { Agent as 
HttpAgent } from "http"; import type { Agent as HttpsAgent } from "https"; @@ -34,6 +40,7 @@ import type { Agent as HttpsAgent } from "https"; export type * from "./taskRuns.js"; export type * from "./taskEvents.js"; export type * from "./metrics.js"; +export type * from "./errors.js"; export type * from "./client/queryBuilder.js"; // Re-export column constants, indices, and type-safe accessors @@ -229,4 +236,13 @@ export class ClickHouse { logsListQueryBuilder: getLogsSearchListQueryBuilder(this.reader), }; } + + get errors() { + return { + getGroups: getErrorGroups(this.reader), + getInstances: getErrorInstances(this.reader), + listQueryBuilder: getErrorGroupsListQueryBuilder(this.reader), + instancesQueryBuilder: getErrorInstancesListQueryBuilder(this.reader), + }; + } } diff --git a/internal-packages/clickhouse/src/taskRuns.ts b/internal-packages/clickhouse/src/taskRuns.ts index 8c1d29ac162..4162691ed7a 100644 --- a/internal-packages/clickhouse/src/taskRuns.ts +++ b/internal-packages/clickhouse/src/taskRuns.ts @@ -29,6 +29,7 @@ export const TaskRunV2 = z.object({ base_cost_in_cents: z.number().default(0), output: z.unknown(), error: z.unknown(), + error_fingerprint: z.string().default(""), tags: z.array(z.string()).default([]), task_version: z.string(), sdk_version: z.string(), @@ -82,6 +83,7 @@ export const TASK_RUN_COLUMNS = [ "base_cost_in_cents", "output", "error", + "error_fingerprint", "tags", "task_version", "sdk_version", @@ -144,6 +146,7 @@ export type TaskRunFieldTypes = { base_cost_in_cents: number; output: { data: unknown }; error: { data: unknown }; + error_fingerprint: string; tags: string[]; task_version: string; sdk_version: string; @@ -277,6 +280,7 @@ export type TaskRunInsertArray = [ base_cost_in_cents: number, output: { data: unknown }, error: { data: unknown }, + error_fingerprint: string, tags: string[], task_version: string, sdk_version: string, From ab6731cf40d07d82bbd8894efe7212d2d00d9a88 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: 
Mon, 2 Mar 2026 11:50:36 +0000 Subject: [PATCH 02/12] Improved layout --- .../route.tsx | 208 +++++++++--------- 1 file changed, 110 insertions(+), 98 deletions(-) diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx index fab365857d9..801fb552043 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx @@ -1,5 +1,5 @@ import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; -import { type MetaFunction, Form, Link } from "@remix-run/react"; +import { type MetaFunction, Form, Link, Outlet } from "@remix-run/react"; import { XMarkIcon } from "@heroicons/react/20/solid"; import { ServiceValidationError } from "~/v3/services/baseService.server"; import { @@ -31,6 +31,18 @@ import { Badge } from "~/components/primitives/Badge"; import { Header1, Header3 } from "~/components/primitives/Headers"; import { formatDistanceToNow } from "date-fns"; import { cn } from "~/utils/cn"; +import { + CopyableTableCell, + Table, + TableBlankRow, + TableBody, + TableCell, + TableCellChevron, + TableCellMenu, + TableHeader, + TableHeaderCell, + TableRow, +} from "~/components/primitives/Table"; export const meta: MetaFunction = () => { return [ @@ -106,75 +118,81 @@ export default function Page() { useTypedLoaderData(); return ( - - - - + <> + + + + - - -
-
-
- - Loading errors… -
-
-
- } - > - - -
- - Unable to load errors. Please refresh the page or try again in a moment. - + + +
+
+
+ + Loading errors… +
} > - {(result) => { - // Check if result contains an error - if ("error" in result) { + + +
+ + Unable to load errors. Please refresh the page or try again in a moment. + +
+
+ } + > + {(result) => { + // Check if result contains an error + if ("error" in result) { + return ( +
+ +
+ + {result.error} + +
+
+ ); + } return ( -
+
-
- - {result.error} - -
+
); - } - return ( -
- - -
- ); - }} - - - - + }} + + + + + + ); } @@ -260,8 +278,19 @@ function ErrorsList({ } return ( -
-
+ + + + ID + Error + Occurrences + Tasks + First seen + Last seen + Go to page + + + {errorGroups.map((errorGroup) => ( ))} - - + +
); } @@ -294,42 +323,25 @@ function ErrorGroupRow({ { fingerprint: errorGroup.fingerprint } ); + const errorMessage = `${errorGroup.errorType}: ${errorGroup.errorMessage}`; + return ( - -
-
-
- {errorGroup.errorType} - - {errorGroup.affectedTasks} task{errorGroup.affectedTasks !== 1 ? "s" : ""} - -
- {errorGroup.errorMessage} -
- - First seen: {formatDistanceToNow(errorGroup.firstSeen, { addSuffix: true })} - - - Last seen: {formatDistanceToNow(errorGroup.lastSeen, { addSuffix: true })} - - Sample: {errorGroup.sampleTaskIdentifier} -
-
-
- - {errorGroup.count.toLocaleString()} - - - occurrences - -
-
- + + + {errorGroup.fingerprint.slice(-8)} + + + {errorMessage} + + {errorGroup.count.toLocaleString()} + {errorGroup.affectedTasks} + + {formatDistanceToNow(errorGroup.firstSeen, { addSuffix: true })} + + + {formatDistanceToNow(errorGroup.lastSeen, { addSuffix: true })} + + + ); } From 4e40640814f77f0686a2750807658f4a1cf41116 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Mon, 2 Mar 2026 15:22:34 +0000 Subject: [PATCH 03/12] Fingerprint benchmarks --- apps/webapp/app/env.server.ts | 1 + .../runsReplicationInstance.server.ts | 1 + .../services/runsReplicationService.server.ts | 8 +- apps/webapp/app/utils/errorFingerprinting.ts | 16 +- .../test/runsReplicationBenchmark.README.md | 283 +++++++++ .../test/runsReplicationBenchmark.producer.ts | 205 +++++++ .../test/runsReplicationBenchmark.test.ts | 567 ++++++++++++++++++ 7 files changed, 1072 insertions(+), 9 deletions(-) create mode 100644 apps/webapp/test/runsReplicationBenchmark.README.md create mode 100644 apps/webapp/test/runsReplicationBenchmark.producer.ts create mode 100644 apps/webapp/test/runsReplicationBenchmark.test.ts diff --git a/apps/webapp/app/env.server.ts b/apps/webapp/app/env.server.ts index fa019f2f75e..6e44a06d597 100644 --- a/apps/webapp/app/env.server.ts +++ b/apps/webapp/app/env.server.ts @@ -1195,6 +1195,7 @@ const EnvironmentSchema = z RUN_REPLICATION_INSERT_MAX_DELAY_MS: z.coerce.number().int().default(2000), RUN_REPLICATION_INSERT_STRATEGY: z.enum(["insert", "insert_async"]).default("insert"), RUN_REPLICATION_DISABLE_PAYLOAD_INSERT: z.string().default("0"), + RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING: z.string().default("0"), // Clickhouse CLICKHOUSE_URL: z.string(), diff --git a/apps/webapp/app/services/runsReplicationInstance.server.ts b/apps/webapp/app/services/runsReplicationInstance.server.ts index 8dc078d338f..0a8ab5e1bde 100644 --- a/apps/webapp/app/services/runsReplicationInstance.server.ts +++ b/apps/webapp/app/services/runsReplicationInstance.server.ts @@ -68,6 +68,7 @@ 
function initializeRunsReplicationInstance() { insertMaxDelayMs: env.RUN_REPLICATION_INSERT_MAX_DELAY_MS, insertStrategy: env.RUN_REPLICATION_INSERT_STRATEGY, disablePayloadInsert: env.RUN_REPLICATION_DISABLE_PAYLOAD_INSERT === "1", + disableErrorFingerprinting: env.RUN_REPLICATION_DISABLE_ERROR_FINGERPRINTING === "1", }); if (env.RUN_REPLICATION_ENABLED === "1") { diff --git a/apps/webapp/app/services/runsReplicationService.server.ts b/apps/webapp/app/services/runsReplicationService.server.ts index 17ae3718e90..1d7714ae478 100644 --- a/apps/webapp/app/services/runsReplicationService.server.ts +++ b/apps/webapp/app/services/runsReplicationService.server.ts @@ -71,6 +71,7 @@ export type RunsReplicationServiceOptions = { insertBaseDelayMs?: number; insertMaxDelayMs?: number; disablePayloadInsert?: boolean; + disableErrorFingerprinting?: boolean; }; type PostgresTaskRun = TaskRun & { masterQueue: string }; @@ -116,6 +117,7 @@ export class RunsReplicationService { private _insertMaxDelayMs: number; private _insertStrategy: "insert" | "insert_async"; private _disablePayloadInsert: boolean; + private _disableErrorFingerprinting: boolean; // Metrics private _replicationLagHistogram: Histogram; @@ -190,6 +192,7 @@ export class RunsReplicationService { this._insertStrategy = options.insertStrategy ?? "insert"; this._disablePayloadInsert = options.disablePayloadInsert ?? false; + this._disableErrorFingerprinting = options.disableErrorFingerprinting ?? false; this._replicationClient = new LogicalReplicationClient({ pgConfig: { @@ -856,7 +859,10 @@ export class RunsReplicationService { const errorData = { data: run.error }; // Calculate error fingerprint for failed runs - const errorFingerprint = (['SYSTEM_FAILURE', 'CRASHED', 'INTERRUPTED', 'COMPLETED_WITH_ERRORS'].includes(run.status)) + const errorFingerprint = ( + !this._disableErrorFingerprinting && + ['SYSTEM_FAILURE', 'CRASHED', 'INTERRUPTED', 'COMPLETED_WITH_ERRORS'].includes(run.status) + ) ? 
calculateErrorFingerprint(run.error) : ''; diff --git a/apps/webapp/app/utils/errorFingerprinting.ts b/apps/webapp/app/utils/errorFingerprinting.ts index d58b723f0e9..2e8c9438335 100644 --- a/apps/webapp/app/utils/errorFingerprinting.ts +++ b/apps/webapp/app/utils/errorFingerprinting.ts @@ -5,12 +5,15 @@ import { createHash } from "node:crypto"; * Groups similar errors together by normalizing dynamic values. */ export function calculateErrorFingerprint(error: unknown): string { - if (!error || typeof error !== "object") return ""; + if (!error || typeof error !== "object" || Array.isArray(error)) return ""; + // This is a bit ugly but… + // 1. We can't use a schema here because it's a hot path and needs to be fast. + // 2. It won't be an instanceof Error because it's from the database. const errorObj = error as any; - const errorType = errorObj.type || errorObj.name || "Error"; - const message = errorObj.message || ""; - const stack = errorObj.stack || errorObj.stacktrace || ""; + const errorType = String(errorObj.type || errorObj.name || "Error"); + const message = String(errorObj.message || ""); + const stack = String(errorObj.stack || errorObj.stacktrace || ""); // Normalize message to group similar errors const normalizedMessage = normalizeErrorMessage(message); @@ -35,10 +38,7 @@ export function normalizeErrorMessage(message: string): string { return ( message // UUIDs (8-4-4-4-12 format) - .replace( - /[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, - "" - ) + .replace(/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}/gi, "") // Run IDs (run_xxxxx format) .replace(/run_[a-zA-Z0-9]+/g, "") // Task run friendly IDs (task_xxxxx or similar) diff --git a/apps/webapp/test/runsReplicationBenchmark.README.md b/apps/webapp/test/runsReplicationBenchmark.README.md new file mode 100644 index 00000000000..68ff312eb0b --- /dev/null +++ b/apps/webapp/test/runsReplicationBenchmark.README.md @@ -0,0 +1,283 @@ +# RunsReplicationService Error
Fingerprinting Benchmark + +This benchmark measures the performance impact of error fingerprinting in the RunsReplicationService. + +## Overview + +The benchmark: +1. Creates a realistic dataset of TaskRuns (7% with errors by default) +2. Runs the producer in a **separate process** to simulate real-world load +3. Measures replication throughput and Event Loop Utilization (ELU) +4. Compares performance with fingerprinting **enabled** vs **disabled** + +## Architecture + +``` +┌─────────────────┐ ┌──────────────────────┐ +│ Producer │ │ Benchmark Test │ +│ (Child Process)│─────────│ (Main Process) │ +│ │ IPC │ │ +│ - Inserts │ │ - RunsReplication │ +│ TaskRuns │ │ Service │ +│ to Postgres │ │ - ELU Monitor │ +│ │ │ - Metrics │ +└─────────────────┘ └──────────────────────┘ + │ │ + │ │ + ▼ ▼ + ┌──────────┐ ┌──────────────┐ + │ Postgres │ │ ClickHouse │ + └──────────┘ └──────────────┘ +``` + +## Files + +- `runsReplicationBenchmark.test.ts` - Main benchmark test +- `runsReplicationBenchmark.producer.ts` - Producer script (runs in child process) +- `runsReplicationBenchmark.README.md` - This file + +## Configuration + +The benchmark can be configured via environment variables or by editing `BENCHMARK_CONFIG` in the test file: + +```typescript +const BENCHMARK_CONFIG = { + // Number of runs to create + NUM_RUNS: parseInt(process.env.BENCHMARK_NUM_RUNS || "5000", 10), + + // Error rate (0.07 = 7%) + ERROR_RATE: 0.07, + + // Producer batch size + PRODUCER_BATCH_SIZE: 100, + + // Replication service settings + FLUSH_BATCH_SIZE: 50, + FLUSH_INTERVAL_MS: 100, + MAX_FLUSH_CONCURRENCY: 4, + + // Timeout + REPLICATION_TIMEOUT_MS: 120_000, // 2 minutes +}; +``` + +## Running the Benchmark + +### Quick Test (Small Dataset) + +```bash +cd apps/webapp +BENCHMARK_NUM_RUNS=1000 pnpm run test ./test/runsReplicationBenchmark.test.ts --run +``` + +### Realistic Benchmark (Larger Dataset) + +```bash +cd apps/webapp +BENCHMARK_NUM_RUNS=10000 pnpm run test 
./test/runsReplicationBenchmark.test.ts --run +``` + +### High Volume Benchmark + +```bash +cd apps/webapp +BENCHMARK_NUM_RUNS=50000 pnpm run test ./test/runsReplicationBenchmark.test.ts --run +``` + +**Note:** The test is marked with `.skip` by default. To run it, remove the `.skip` from the test: + +```typescript +// Change this: +containerTest.skip("should benchmark...", async () => { + +// To this: +containerTest("should benchmark...", async () => { +``` + +## What Gets Measured + +### 1. Producer Metrics +- Total runs created +- Runs with errors (should be ~7%) +- Duration +- Throughput (runs/sec) + +### 2. Replication Metrics +- Total runs replicated to ClickHouse +- Replication duration +- Replication throughput (runs/sec) + +### 3. Event Loop Utilization (ELU) +- Mean utilization (%) +- P50 (median) utilization (%) +- P95 utilization (%) +- P99 utilization (%) +- All samples for detailed analysis + +### 4. OpenTelemetry Metrics +- Batches flushed +- Task runs inserted +- Payloads inserted +- Events processed + +## Output + +The benchmark produces detailed output including: + +``` +================================================================================ +BENCHMARK: baseline-no-fingerprinting +Error Fingerprinting: DISABLED +Runs: 5000, Error Rate: 7.0% +================================================================================ + +[Producer] Starting - will create 5000 runs (7.0% with errors) +[Producer] Progress: 1000/5000 runs (2500 runs/sec) +... +[Producer] Completed: + - Total runs: 5000 + - With errors: 352 (7.0%) + - Duration: 2145ms + - Throughput: 2331 runs/sec + +[Benchmark] Waiting for replication to complete... 
+ +================================================================================ +RESULTS: baseline-no-fingerprinting +================================================================================ + +Producer: + Created: 5000 runs + With errors: 352 (7.0%) + Duration: 2145ms + Throughput: 2331 runs/sec + +Replication: + Replicated: 5000 runs + Duration: 3456ms + Throughput: 1447 runs/sec + +Event Loop Utilization: + Mean: 23.45% + P50: 22.10% + P95: 34.20% + P99: 41.30% + Samples: 346 + +Metrics: + Batches flushed: 102 + Task runs inserted: 5000 + Payloads inserted: 5000 + Events processed: 5000 +================================================================================ + +[... Similar output for "with-fingerprinting" benchmark ...] + +================================================================================ +COMPARISON +Baseline: baseline-no-fingerprinting (fingerprinting OFF) +Comparison: with-fingerprinting (fingerprinting ON) +================================================================================ + +Replication Duration: + 3456ms → 3512ms (+1.62%) + +Throughput: + 1447 → 1424 runs/sec (-1.59%) + +Event Loop Utilization (Mean): + 23.45% → 24.12% (+2.86%) + +Event Loop Utilization (P99): + 41.30% → 43.20% (+4.60%) + +================================================================================ + +BENCHMARK COMPLETE +Fingerprinting impact on replication duration: +1.62% +Fingerprinting impact on throughput: -1.59% +Fingerprinting impact on ELU (mean): +2.86% +Fingerprinting impact on ELU (P99): +4.60% +``` + +## Interpreting Results + +### What to Look For + +1. **Replication Duration Delta** - How much longer replication takes with fingerprinting +2. **Throughput Delta** - Change in runs processed per second +3. 
**ELU Delta** - Change in event loop utilization (higher = more CPU bound) + +### Expected Results + +With a 7% error rate and SHA-256 hashing: +- **Small impact** (<5% overhead): Fingerprinting is well optimized +- **Moderate impact** (5-15% overhead): May want to consider optimizations +- **Large impact** (>15% overhead): Fingerprinting needs optimization + +### Performance Optimization Ideas + +If the benchmark shows significant overhead, consider: + +1. **Faster hashing algorithm** - Replace SHA-256 with xxHash or MurmurHash3 +2. **Worker threads** - Move fingerprinting to worker threads +3. **Caching** - Cache fingerprints for identical errors +4. **Lazy computation** - Only compute fingerprints when needed +5. **Batch processing** - Group similar errors before hashing + +## Dataset Characteristics + +The producer generates realistic error variety: + +- TypeError (undefined property access) +- Error (API fetch failures) +- ValidationError (input validation) +- TimeoutError (operation timeouts) +- DatabaseError (connection failures) +- ReferenceError (undefined variables) + +Each error template includes: +- Realistic stack traces +- Variable IDs and timestamps +- Line/column numbers +- File paths + +This ensures the fingerprinting algorithm is tested with realistic data. + +## Troubleshooting + +### Benchmark Times Out + +Increase the timeout: +```typescript +REPLICATION_TIMEOUT_MS: 300_000, // 5 minutes +``` + +### Producer Fails + +Check Postgres connection and ensure: +- Docker services are running (`pnpm run docker`) +- Database is accessible +- Sufficient disk space + +### Different Results Each Run + +This is normal! Factors affecting variance: +- System load +- Docker container overhead +- Database I/O +- Network latency (even localhost) + +Run multiple times and look at trends. + +## Future Enhancements + +Potential improvements to the benchmark: + +1. **Multiple error rates** - Test 0%, 5%, 10%, 25%, 50% error rates +2. 
**Different hash algorithms** - Compare SHA-256 vs xxHash vs MurmurHash3 +3. **Worker thread comparison** - Test main thread vs worker threads +4. **Concurrent producers** - Multiple producer processes +5. **Memory profiling** - Track memory usage over time +6. **Flame graphs** - Generate CPU flame graphs for analysis +7. **Historical tracking** - Store results over time to track regressions diff --git a/apps/webapp/test/runsReplicationBenchmark.producer.ts b/apps/webapp/test/runsReplicationBenchmark.producer.ts new file mode 100644 index 00000000000..dbed1d81938 --- /dev/null +++ b/apps/webapp/test/runsReplicationBenchmark.producer.ts @@ -0,0 +1,205 @@ +#!/usr/bin/env node +/** + * Producer script that runs in a separate process to insert TaskRuns into PostgreSQL. + * This simulates realistic production load for benchmarking RunsReplicationService. + */ + +import { PrismaClient } from "@trigger.dev/database"; +import { performance } from "node:perf_hooks"; + +interface ProducerConfig { + postgresUrl: string; + organizationId: string; + projectId: string; + environmentId: string; + numRuns: number; + errorRate: number; // 0.07 = 7% + batchSize: number; +} + +// Error templates for realistic variety +const ERROR_TEMPLATES = [ + { + type: "TypeError", + message: "Cannot read property 'foo' of undefined", + stack: `TypeError: Cannot read property 'foo' of undefined + at processData (/app/src/handler.ts:42:15) + at runTask (/app/src/runtime.ts:128:20) + at executeRun (/app/src/executor.ts:89:12) + at async Runner.execute (/app/src/runner.ts:56:5)`, + }, + { + type: "Error", + message: "Failed to fetch data from API endpoint https://api.example.com/data/12345", + stack: `Error: Failed to fetch data from API endpoint https://api.example.com/data/12345 + at fetchData (/app/src/api.ts:78:11) + at getData (/app/src/service.ts:34:18) + at processTask (/app/src/handler.ts:23:15) + at runTask (/app/src/runtime.ts:128:20)`, + }, + { + type: "ValidationError", + message: 
"Invalid input: expected string for field 'email', got number: 1234567890", + stack: `ValidationError: Invalid input: expected string for field 'email', got number: 1234567890 + at validateInput (/app/src/validator.ts:156:9) + at processRequest (/app/src/handler.ts:67:23) + at runTask (/app/src/runtime.ts:128:20)`, + }, + { + type: "TimeoutError", + message: "Operation timed out after 30000ms", + stack: `TimeoutError: Operation timed out after 30000ms + at Timeout._onTimeout (/app/src/timeout.ts:45:15) + at processTask (/app/src/handler.ts:89:12) + at runTask (/app/src/runtime.ts:128:20)`, + }, + { + type: "DatabaseError", + message: "Connection to database 'prod_db' failed: timeout of 5000ms exceeded", + stack: `DatabaseError: Connection to database 'prod_db' failed: timeout of 5000ms exceeded + at connect (/app/node_modules/pg/lib/client.js:234:11) + at query (/app/src/db.ts:89:18) + at getData (/app/src/service.ts:45:22)`, + }, + { + type: "ReferenceError", + message: "userId is not defined", + stack: `ReferenceError: userId is not defined + at validateUser (/app/src/auth.ts:123:9) + at processTask (/app/src/handler.ts:34:15) + at runTask (/app/src/runtime.ts:128:20)`, + }, +]; + +function generateError() { + const template = ERROR_TEMPLATES[Math.floor(Math.random() * ERROR_TEMPLATES.length)]; + + // Add variation to make errors slightly different + const randomId = Math.floor(Math.random() * 100000); + const randomTimestamp = Date.now() + Math.floor(Math.random() * 10000); + + return { + type: template.type, + name: template.type, + message: template.message + .replace(/\d{4,}/g, String(randomId)) + .replace(/\d{13}/g, String(randomTimestamp)), + stack: template.stack + .replace(/:\d+:\d+/g, `:${Math.floor(Math.random() * 500)}:${Math.floor(Math.random() * 50)}`) + .replace(/\d{4,}/g, String(randomId)), + }; +} + +async function runProducer(config: ProducerConfig) { + const prisma = new PrismaClient({ + datasources: { + db: { + url: config.postgresUrl, + }, + 
}, + }); + + try { + console.log(`[Producer] Starting - will create ${config.numRuns} runs (${(config.errorRate * 100).toFixed(1)}% with errors)`); + const startTime = performance.now(); + let created = 0; + let withErrors = 0; + + // Process in batches to avoid overwhelming the database + for (let batch = 0; batch < Math.ceil(config.numRuns / config.batchSize); batch++) { + const batchStart = batch * config.batchSize; + const batchEnd = Math.min(batchStart + config.batchSize, config.numRuns); + const batchSize = batchEnd - batchStart; + + const runs = []; + for (let i = batchStart; i < batchEnd; i++) { + const hasError = Math.random() < config.errorRate; + const status = hasError ? "COMPLETED_WITH_ERRORS" : "COMPLETED_SUCCESSFULLY"; + + const runData: any = { + friendlyId: `run_bench_${Date.now()}_${i}`, + taskIdentifier: `benchmark-task-${i % 10}`, // Vary task identifiers + payload: JSON.stringify({ index: i, timestamp: Date.now() }), + traceId: `trace_${i}`, + spanId: `span_${i}`, + queue: `queue-${i % 5}`, // Vary queues + runtimeEnvironmentId: config.environmentId, + projectId: config.projectId, + organizationId: config.organizationId, + environmentType: "DEVELOPMENT", + engine: "V2", + status, + createdAt: new Date(Date.now() - Math.floor(Math.random() * 1000)), + updatedAt: new Date(), + }; + + if (hasError) { + runData.error = generateError(); + withErrors++; + } + + runs.push(runData); + } + + // Insert batch + await prisma.taskRun.createMany({ + data: runs, + }); + + created += batchSize; + + if (batch % 10 === 0 || batch === Math.ceil(config.numRuns / config.batchSize) - 1) { + const elapsed = performance.now() - startTime; + const rate = (created / elapsed) * 1000; + console.log(`[Producer] Progress: ${created}/${config.numRuns} runs (${rate.toFixed(0)} runs/sec)`); + } + } + + const endTime = performance.now(); + const duration = endTime - startTime; + const throughput = (created / duration) * 1000; + + console.log(`[Producer] Completed:`); + 
console.log(` - Total runs: ${created}`); + console.log(` - With errors: ${withErrors} (${((withErrors / created) * 100).toFixed(1)}%)`); + console.log(` - Duration: ${duration.toFixed(0)}ms`); + console.log(` - Throughput: ${throughput.toFixed(0)} runs/sec`); + + // Send results to parent process + if (process.send) { + process.send({ + type: "complete", + stats: { + created, + withErrors, + duration, + throughput, + }, + }); + } + } catch (error) { + console.error("[Producer] Error:", error); + if (process.send) { + process.send({ + type: "error", + error: error instanceof Error ? error.message : String(error), + }); + } + process.exit(1); + } finally { + await prisma.$disconnect(); + } +} + +// Parse config from command line args +const configArg = process.argv[2]; +if (!configArg) { + console.error("Usage: runsReplicationBenchmark.producer.ts "); + process.exit(1); +} + +const config: ProducerConfig = JSON.parse(configArg); +runProducer(config).catch((error) => { + console.error("Fatal error:", error); + process.exit(1); +}); diff --git a/apps/webapp/test/runsReplicationBenchmark.test.ts b/apps/webapp/test/runsReplicationBenchmark.test.ts new file mode 100644 index 00000000000..5f0fbb8b27f --- /dev/null +++ b/apps/webapp/test/runsReplicationBenchmark.test.ts @@ -0,0 +1,567 @@ +import { ClickHouse } from "@internal/clickhouse"; +import { containerTest } from "@internal/testcontainers"; +import { fork, type ChildProcess } from "node:child_process"; +import { performance, PerformanceObserver } from "node:perf_hooks"; +import { setTimeout } from "node:timers/promises"; +import path from "node:path"; +import { z } from "zod"; +import { RunsReplicationService } from "~/services/runsReplicationService.server"; +import { createInMemoryTracing, createInMemoryMetrics } from "./utils/tracing"; + +// Extend test timeout for benchmarks +vi.setConfig({ testTimeout: 300_000 }); // 5 minutes + +/** + * Benchmark configuration + */ +const BENCHMARK_CONFIG = { + // Number of 
runs to create - adjust this to test different volumes + // Start with smaller numbers (1000) for quick tests, increase to 10000+ for realistic benchmarks + NUM_RUNS: parseInt(process.env.BENCHMARK_NUM_RUNS || "5000", 10), + + // Error rate (7% = realistic production load with some failures) + ERROR_RATE: 0.07, + + // Batch size for producer + PRODUCER_BATCH_SIZE: 100, + + // Replication service settings + FLUSH_BATCH_SIZE: 50, + FLUSH_INTERVAL_MS: 100, + MAX_FLUSH_CONCURRENCY: 4, + + // How long to wait for replication to complete (in ms) + REPLICATION_TIMEOUT_MS: 120_000, // 2 minutes +}; + +interface BenchmarkResult { + name: string; + fingerprintingEnabled: boolean; + producerStats: { + created: number; + withErrors: number; + duration: number; + throughput: number; + }; + replicationStats: { + duration: number; + throughput: number; + replicatedRuns: number; + }; + eluStats: { + mean: number; + p50: number; + p95: number; + p99: number; + samples: number[]; + }; + metricsStats: { + batchesFlushed: number; + taskRunsInserted: number; + payloadsInserted: number; + eventsProcessed: number; + }; +} + +/** + * Measure Event Loop Utilization during benchmark + */ +class ELUMonitor { + private samples: number[] = []; + private interval: NodeJS.Timeout | null = null; + private startELU: { idle: number; active: number } | null = null; + + start(intervalMs: number = 100) { + this.samples = []; + this.startELU = performance.eventLoopUtilization(); + + this.interval = setInterval(() => { + const elu = performance.eventLoopUtilization(); + const utilization = elu.utilization * 100; // Convert to percentage + this.samples.push(utilization); + }, intervalMs); + } + + stop(): { mean: number; p50: number; p95: number; p99: number; samples: number[] } { + if (this.interval) { + clearInterval(this.interval); + this.interval = null; + } + + if (this.samples.length === 0) { + return { mean: 0, p50: 0, p95: 0, p99: 0, samples: [] }; + } + + const sorted = [...this.samples].sort((a, 
b) => a - b); + const mean = sorted.reduce((sum, val) => sum + val, 0) / sorted.length; + const p50 = sorted[Math.floor(sorted.length * 0.5)]; + const p95 = sorted[Math.floor(sorted.length * 0.95)]; + const p99 = sorted[Math.floor(sorted.length * 0.99)]; + + return { mean, p50, p95, p99, samples: sorted }; + } +} + +/** + * Run the producer script in a separate process + */ +async function runProducer(config: { + postgresUrl: string; + organizationId: string; + projectId: string; + environmentId: string; + numRuns: number; + errorRate: number; + batchSize: number; +}): Promise<{ created: number; withErrors: number; duration: number; throughput: number }> { + return new Promise((resolve, reject) => { + const producerPath = path.join(__dirname, "runsReplicationBenchmark.producer.ts"); + + // Use tsx to run the TypeScript file directly + const child = fork(producerPath, [JSON.stringify(config)], { + stdio: ["ignore", "pipe", "pipe", "ipc"], + execArgv: ["-r", "tsx/cjs"], + }); + + let output = ""; + + child.stdout?.on("data", (data) => { + const text = data.toString(); + output += text; + console.log(text.trim()); + }); + + child.stderr?.on("data", (data) => { + console.error(data.toString().trim()); + }); + + child.on("message", (message: any) => { + if (message.type === "complete") { + resolve(message.stats); + } else if (message.type === "error") { + reject(new Error(message.error)); + } + }); + + child.on("error", (error) => { + reject(error); + }); + + child.on("exit", (code) => { + if (code !== 0) { + reject(new Error(`Producer exited with code ${code}`)); + } + }); + }); +} + +/** + * Wait for all runs to be replicated to ClickHouse + */ +async function waitForReplication( + clickhouse: ClickHouse, + organizationId: string, + expectedCount: number, + timeoutMs: number +): Promise<{ duration: number; replicatedRuns: number }> { + const startTime = performance.now(); + const deadline = startTime + timeoutMs; + + const queryRuns = clickhouse.reader.query({ + name: 
"benchmark-count", + query: + "SELECT count(*) as count FROM trigger_dev.task_runs_v2 WHERE organization_id = {org_id:String}", + schema: z.object({ count: z.number() }), + params: z.object({ org_id: z.string() }), + }); + + while (performance.now() < deadline) { + const [error, result] = await queryRuns({ org_id: organizationId }); + + if (error) { + throw new Error(`Failed to query ClickHouse: ${error.message}`); + } + + const count = result?.[0]?.count || 0; + + if (count >= expectedCount) { + const duration = performance.now() - startTime; + return { duration, replicatedRuns: count }; + } + + // Wait a bit before checking again + await setTimeout(500); + } + + throw new Error( + `Replication timeout: expected ${expectedCount} runs, but only found ${await getRunCount( + clickhouse + )} after ${timeoutMs}ms` + ); +} + +async function getRunCount(clickhouse: ClickHouse): Promise { + const queryRuns = clickhouse.reader.query({ + name: "benchmark-count", + query: "SELECT count(*) as count FROM trigger_dev.task_runs_v2", + schema: z.object({ count: z.number() }), + }); + + const [error, result] = await queryRuns({}); + if (error) return 0; + return result?.[0]?.count || 0; +} + +/** + * Extract metrics from OpenTelemetry metrics + */ +function extractMetrics(metrics: any[]): { + batchesFlushed: number; + taskRunsInserted: number; + payloadsInserted: number; + eventsProcessed: number; +} { + function getMetricData(name: string) { + for (const resourceMetrics of metrics) { + for (const scopeMetrics of resourceMetrics.scopeMetrics) { + for (const metric of scopeMetrics.metrics) { + if (metric.descriptor.name === name) { + return metric; + } + } + } + } + return null; + } + + function sumCounterValues(metric: any): number { + if (!metric?.dataPoints) return 0; + return metric.dataPoints.reduce((sum: number, dp: any) => sum + (dp.value || 0), 0); + } + + return { + batchesFlushed: sumCounterValues(getMetricData("runs_replication.batches_flushed")), + taskRunsInserted: 
sumCounterValues(getMetricData("runs_replication.task_runs_inserted")), + payloadsInserted: sumCounterValues(getMetricData("runs_replication.payloads_inserted")), + eventsProcessed: sumCounterValues(getMetricData("runs_replication.events_processed")), + }; +} + +/** + * Run a single benchmark test + */ +async function runBenchmark( + name: string, + fingerprintingEnabled: boolean, + { + clickhouseContainer, + redisOptions, + postgresContainer, + prisma, + }: { + clickhouseContainer: any; + redisOptions: any; + postgresContainer: any; + prisma: any; + } +): Promise { + console.log(`\n${"=".repeat(80)}`); + console.log(`BENCHMARK: ${name}`); + console.log(`Error Fingerprinting: ${fingerprintingEnabled ? "ENABLED" : "DISABLED"}`); + console.log( + `Runs: ${BENCHMARK_CONFIG.NUM_RUNS}, Error Rate: ${(BENCHMARK_CONFIG.ERROR_RATE * 100).toFixed( + 1 + )}%` + ); + console.log(`${"=".repeat(80)}\n`); + + // Setup + const organization = await prisma.organization.create({ + data: { + title: `benchmark-${name}`, + slug: `benchmark-${name}`, + }, + }); + + const project = await prisma.project.create({ + data: { + name: `benchmark-${name}`, + slug: `benchmark-${name}`, + organizationId: organization.id, + externalRef: `benchmark-${name}`, + }, + }); + + const runtimeEnvironment = await prisma.runtimeEnvironment.create({ + data: { + slug: `benchmark-${name}`, + type: "DEVELOPMENT", + projectId: project.id, + organizationId: organization.id, + apiKey: `benchmark-${name}`, + pkApiKey: `benchmark-${name}`, + shortcode: `benchmark-${name}`, + }, + }); + + // Setup ClickHouse + const clickhouse = new ClickHouse({ + url: clickhouseContainer.getConnectionUrl(), + name: `benchmark-${name}`, + compression: { + request: true, + }, + logLevel: "warn", + }); + + // Setup tracing and metrics + const { tracer } = createInMemoryTracing(); + const metricsHelper = createInMemoryMetrics(); + + // Create and start replication service + const runsReplicationService = new RunsReplicationService({ + 
clickhouse, + pgConnectionUrl: postgresContainer.getConnectionUri(), + serviceName: `benchmark-${name}`, + slotName: `benchmark_${name.replace(/-/g, "_")}`, + publicationName: `benchmark_${name.replace(/-/g, "_")}_pub`, + redisOptions, + maxFlushConcurrency: BENCHMARK_CONFIG.MAX_FLUSH_CONCURRENCY, + flushIntervalMs: BENCHMARK_CONFIG.FLUSH_INTERVAL_MS, + flushBatchSize: BENCHMARK_CONFIG.FLUSH_BATCH_SIZE, + leaderLockTimeoutMs: 10000, + leaderLockExtendIntervalMs: 2000, + ackIntervalSeconds: 10, + tracer, + meter: metricsHelper.meter, + logLevel: "warn", + disableErrorFingerprinting: !fingerprintingEnabled, + }); + + await runsReplicationService.start(); + + // Start ELU monitoring + const eluMonitor = new ELUMonitor(); + eluMonitor.start(100); + + // Run producer in separate process + console.log("\n[Benchmark] Starting producer..."); + const producerStats = await runProducer({ + postgresUrl: postgresContainer.getConnectionUri(), + organizationId: organization.id, + projectId: project.id, + environmentId: runtimeEnvironment.id, + numRuns: BENCHMARK_CONFIG.NUM_RUNS, + errorRate: BENCHMARK_CONFIG.ERROR_RATE, + batchSize: BENCHMARK_CONFIG.PRODUCER_BATCH_SIZE, + }); + + console.log("\n[Benchmark] Waiting for replication to complete..."); + const replicationResult = await waitForReplication( + clickhouse, + organization.id, + producerStats.created, + BENCHMARK_CONFIG.REPLICATION_TIMEOUT_MS + ); + + // Stop ELU monitoring + const eluStats = eluMonitor.stop(); + + // Get metrics + const metrics = await metricsHelper.getMetrics(); + const metricsStats = extractMetrics(metrics); + + // Cleanup + await runsReplicationService.stop(); + await metricsHelper.shutdown(); + + const throughput = (replicationResult.replicatedRuns / replicationResult.duration) * 1000; + + const result: BenchmarkResult = { + name, + fingerprintingEnabled, + producerStats, + replicationStats: { + duration: replicationResult.duration, + throughput, + replicatedRuns: replicationResult.replicatedRuns, + }, 
+ eluStats, + metricsStats, + }; + + // Print results + console.log(`\n${"=".repeat(80)}`); + console.log(`RESULTS: ${name}`); + console.log(`${"=".repeat(80)}`); + console.log("\nProducer:"); + console.log(` Created: ${producerStats.created} runs`); + console.log( + ` With errors: ${producerStats.withErrors} (${( + (producerStats.withErrors / producerStats.created) * + 100 + ).toFixed(1)}%)` + ); + console.log(` Duration: ${producerStats.duration.toFixed(0)}ms`); + console.log(` Throughput: ${producerStats.throughput.toFixed(0)} runs/sec`); + console.log("\nReplication:"); + console.log(` Replicated: ${replicationResult.replicatedRuns} runs`); + console.log(` Duration: ${replicationResult.duration.toFixed(0)}ms`); + console.log(` Throughput: ${throughput.toFixed(0)} runs/sec`); + console.log("\nEvent Loop Utilization:"); + console.log(` Mean: ${eluStats.mean.toFixed(2)}%`); + console.log(` P50: ${eluStats.p50.toFixed(2)}%`); + console.log(` P95: ${eluStats.p95.toFixed(2)}%`); + console.log(` P99: ${eluStats.p99.toFixed(2)}%`); + console.log(` Samples: ${eluStats.samples.length}`); + console.log("\nMetrics:"); + console.log(` Batches flushed: ${metricsStats.batchesFlushed}`); + console.log(` Task runs inserted: ${metricsStats.taskRunsInserted}`); + console.log(` Payloads inserted: ${metricsStats.payloadsInserted}`); + console.log(` Events processed: ${metricsStats.eventsProcessed}`); + console.log(`${"=".repeat(80)}\n`); + + return result; +} + +/** + * Compare two benchmark results and print delta + */ +function compareBenchmarks(baseline: BenchmarkResult, comparison: BenchmarkResult) { + console.log(`\n${"=".repeat(80)}`); + console.log("COMPARISON"); + console.log( + `Baseline: ${baseline.name} (fingerprinting ${baseline.fingerprintingEnabled ? "ON" : "OFF"})` + ); + console.log( + `Comparison: ${comparison.name} (fingerprinting ${ + comparison.fingerprintingEnabled ? 
"ON" : "OFF" + })` + ); + console.log(`${"=".repeat(80)}`); + + const replicationDurationDelta = + ((comparison.replicationStats.duration - baseline.replicationStats.duration) / + baseline.replicationStats.duration) * + 100; + const throughputDelta = + ((comparison.replicationStats.throughput - baseline.replicationStats.throughput) / + baseline.replicationStats.throughput) * + 100; + const eluMeanDelta = + ((comparison.eluStats.mean - baseline.eluStats.mean) / baseline.eluStats.mean) * 100; + const eluP99Delta = + ((comparison.eluStats.p99 - baseline.eluStats.p99) / baseline.eluStats.p99) * 100; + + console.log("\nReplication Duration:"); + console.log( + ` ${baseline.replicationStats.duration.toFixed( + 0 + )}ms → ${comparison.replicationStats.duration.toFixed(0)}ms (${ + replicationDurationDelta > 0 ? "+" : "" + }${replicationDurationDelta.toFixed(2)}%)` + ); + + console.log("\nThroughput:"); + console.log( + ` ${baseline.replicationStats.throughput.toFixed( + 0 + )} → ${comparison.replicationStats.throughput.toFixed(0)} runs/sec (${ + throughputDelta > 0 ? "+" : "" + }${throughputDelta.toFixed(2)}%)` + ); + + console.log("\nEvent Loop Utilization (Mean):"); + console.log( + ` ${baseline.eluStats.mean.toFixed(2)}% → ${comparison.eluStats.mean.toFixed(2)}% (${ + eluMeanDelta > 0 ? "+" : "" + }${eluMeanDelta.toFixed(2)}%)` + ); + + console.log("\nEvent Loop Utilization (P99):"); + console.log( + ` ${baseline.eluStats.p99.toFixed(2)}% → ${comparison.eluStats.p99.toFixed(2)}% (${ + eluP99Delta > 0 ? 
"+" : "" + }${eluP99Delta.toFixed(2)}%)` + ); + + console.log(`\n${"=".repeat(80)}\n`); + + // Return deltas for assertions if needed + return { + replicationDurationDelta, + throughputDelta, + eluMeanDelta, + eluP99Delta, + }; +} + +describe("RunsReplicationService Benchmark", () => { + containerTest.skipIf(process.env.BENCHMARKS_ENABLED !== "1")( + "should benchmark error fingerprinting performance impact", + async ({ clickhouseContainer, redisOptions, postgresContainer, prisma }) => { + // Enable replica identity for TaskRun table + await prisma.$executeRawUnsafe(`ALTER TABLE public."TaskRun" REPLICA IDENTITY FULL;`); + + console.log("\n" + "=".repeat(80)); + console.log("RUNS REPLICATION SERVICE - ERROR FINGERPRINTING BENCHMARK"); + console.log("=".repeat(80)); + console.log(`Configuration:`); + console.log(` Total runs: ${BENCHMARK_CONFIG.NUM_RUNS}`); + console.log(` Error rate: ${(BENCHMARK_CONFIG.ERROR_RATE * 100).toFixed(1)}%`); + console.log( + ` Expected errors: ~${Math.floor(BENCHMARK_CONFIG.NUM_RUNS * BENCHMARK_CONFIG.ERROR_RATE)}` + ); + console.log(` Producer batch size: ${BENCHMARK_CONFIG.PRODUCER_BATCH_SIZE}`); + console.log(` Replication batch size: ${BENCHMARK_CONFIG.FLUSH_BATCH_SIZE}`); + console.log(` Max flush concurrency: ${BENCHMARK_CONFIG.MAX_FLUSH_CONCURRENCY}`); + console.log("=".repeat(80) + "\n"); + + // Run benchmark WITHOUT error fingerprinting (baseline) + const baselineResult = await runBenchmark("baseline-no-fingerprinting", false, { + clickhouseContainer, + redisOptions, + postgresContainer, + prisma, + }); + + // Run benchmark WITH error fingerprinting + const fingerprintingResult = await runBenchmark("with-fingerprinting", true, { + clickhouseContainer, + redisOptions, + postgresContainer, + prisma, + }); + + // Compare results + const deltas = compareBenchmarks(baselineResult, fingerprintingResult); + + // Basic assertions - just to ensure benchmarks completed successfully + 
expect(baselineResult.replicationStats.replicatedRuns).toBe(BENCHMARK_CONFIG.NUM_RUNS); + expect(fingerprintingResult.replicationStats.replicatedRuns).toBe(BENCHMARK_CONFIG.NUM_RUNS); + + // Log final summary + console.log("BENCHMARK COMPLETE"); + console.log( + `Fingerprinting impact on replication duration: ${ + deltas.replicationDurationDelta > 0 ? "+" : "" + }${deltas.replicationDurationDelta.toFixed(2)}%` + ); + console.log( + `Fingerprinting impact on throughput: ${ + deltas.throughputDelta > 0 ? "+" : "" + }${deltas.throughputDelta.toFixed(2)}%` + ); + console.log( + `Fingerprinting impact on ELU (mean): ${ + deltas.eluMeanDelta > 0 ? "+" : "" + }${deltas.eluMeanDelta.toFixed(2)}%` + ); + console.log( + `Fingerprinting impact on ELU (P99): ${ + deltas.eluP99Delta > 0 ? "+" : "" + }${deltas.eluP99Delta.toFixed(2)}%` + ); + } + ); +}); From 6268d5110fbd793e10d29f65e7efca6459bdd4c4 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Mon, 2 Mar 2026 17:44:40 +0000 Subject: [PATCH 04/12] Errors are now by task --- .../app/components/navigation/SideMenu.tsx | 7 +- .../v3/ErrorsListPresenter.server.ts | 94 +++++++++-- .../route.tsx | 152 ++++++++++++++++-- .../schema/022_create_errors_v1_table.sql | 37 ++--- internal-packages/clickhouse/src/errors.ts | 80 +++++++-- internal-packages/clickhouse/src/index.ts | 2 + 6 files changed, 305 insertions(+), 67 deletions(-) diff --git a/apps/webapp/app/components/navigation/SideMenu.tsx b/apps/webapp/app/components/navigation/SideMenu.tsx index 3ed99b2f82f..96b082448d5 100644 --- a/apps/webapp/app/components/navigation/SideMenu.tsx +++ b/apps/webapp/app/components/navigation/SideMenu.tsx @@ -24,6 +24,7 @@ import { Squares2X2Icon, TableCellsIcon, UsersIcon, + BugAntIcon, } from "@heroicons/react/20/solid"; import { Link, useFetcher, useNavigation } from "@remix-run/react"; import { LayoutGroup, motion } from "framer-motion"; @@ -477,9 +478,9 @@ export function SideMenu({ )} >; export type ErrorGroup = 
ErrorsList["errorGroups"][0]; export type ErrorsListAppliedFilters = ErrorsList["filters"]; +export type ErrorHourlyOccurrences = Awaited< + ReturnType +>; +export type ErrorHourlyActivity = ErrorHourlyOccurrences[string]; // Cursor for error groups pagination type ErrorGroupCursor = { @@ -76,6 +80,15 @@ function decodeCursor(cursor: string): ErrorGroupCursor | null { } } +function parseClickHouseDateTime(value: string): Date { + const asNum = Number(value); + if (!isNaN(asNum) && asNum > 1e12) { + return new Date(asNum); + } + // ClickHouse returns 'YYYY-MM-DD HH:mm:ss.SSS' in UTC + return new Date(value.replace(" ", "T") + "Z"); +} + function escapeClickHouseString(val: string): string { return val.replace(/\\/g, "\\\\").replace(/\//g, "\\/").replace(/%/g, "\\%").replace(/_/g, "\\_"); } @@ -156,18 +169,19 @@ export class ErrorsListPresenter extends BasePresenter { queryBuilder.where("project_id = {projectId: String}", { projectId }); queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); - // Group by error_fingerprint to merge partial aggregations - queryBuilder.groupBy("error_fingerprint"); - - // Apply HAVING filters (filters on aggregated columns) - // Time range filter - use last_seen_date regular column instead of aggregate - queryBuilder.having("max(last_seen_date) >= now() - INTERVAL {days: Int64} DAY", { days: daysAgo }); - - // Task filter + // Task filter (task_identifier is part of the key, so use WHERE) if (tasks && tasks.length > 0) { - queryBuilder.having("anyMerge(sample_task_identifier) IN {tasks: Array(String)}", { tasks }); + queryBuilder.where("task_identifier IN {tasks: Array(String)}", { tasks }); } + // Group by key columns to merge partial aggregations + queryBuilder.groupBy("error_fingerprint, task_identifier"); + + // Time range filter + queryBuilder.having("max(last_seen_date) >= now() - INTERVAL {days: Int64} DAY", { + days: daysAgo, + }); + // Search filter - searches in error type and message if (search && 
search.trim() !== "") { const searchTerm = escapeClickHouseString(search.trim()).toLowerCase(); @@ -219,13 +233,12 @@ export class ErrorsListPresenter extends BasePresenter { errorType: error.error_type, errorMessage: error.error_message, fingerprint: error.error_fingerprint, - firstSeen: new Date(parseInt(error.first_seen) * 1000), - lastSeen: new Date(parseInt(error.last_seen) * 1000), + taskIdentifier: error.task_identifier, + firstSeen: parseClickHouseDateTime(error.first_seen), + lastSeen: parseClickHouseDateTime(error.last_seen), count: error.occurrence_count, - affectedTasks: error.affected_tasks, sampleRunId: error.sample_run_id, sampleFriendlyId: error.sample_friendly_id, - sampleTaskIdentifier: error.sample_task_identifier, })); return { @@ -244,4 +257,59 @@ export class ErrorsListPresenter extends BasePresenter { }, }; } + + public async getHourlyOccurrences( + organizationId: string, + projectId: string, + environmentId: string, + fingerprints: string[] + ): Promise>> { + if (fingerprints.length === 0) { + return {}; + } + + const hours = 24; + + const [queryError, records] = await this.clickhouse.errors.getHourlyOccurrences({ + organizationId, + projectId, + environmentId, + fingerprints, + hours, + }); + + if (queryError) { + throw queryError; + } + + // Build 24 hourly buckets as epoch seconds (UTC, floored to hour) + const buckets: number[] = []; + const nowMs = Date.now(); + for (let i = hours - 1; i >= 0; i--) { + const hourStart = Math.floor((nowMs - i * 3_600_000) / 3_600_000) * 3_600; + buckets.push(hourStart); + } + + // Index ClickHouse results by fingerprint → epoch → count + const grouped = new Map>(); + for (const row of records ?? 
[]) { + let byHour = grouped.get(row.error_fingerprint); + if (!byHour) { + byHour = new Map(); + grouped.set(row.error_fingerprint, byHour); + } + byHour.set(row.hour_epoch, row.count); + } + + const result: Record> = {}; + for (const fp of fingerprints) { + const byHour = grouped.get(fp); + result[fp] = buckets.map((epoch) => ({ + date: new Date(epoch * 1000), + count: byHour?.get(epoch) ?? 0, + })); + } + + return result; + } } diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx index 801fb552043..2de7a3ba0af 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx @@ -2,23 +2,33 @@ import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; import { type MetaFunction, Form, Link, Outlet } from "@remix-run/react"; import { XMarkIcon } from "@heroicons/react/20/solid"; import { ServiceValidationError } from "~/v3/services/baseService.server"; -import { - TypedAwait, - typeddefer, - type UseDataFunctionReturn, - useTypedLoaderData, -} from "remix-typedjson"; +import { TypedAwait, typeddefer, useTypedLoaderData } from "remix-typedjson"; import { requireUser } from "~/services/session.server"; import { getCurrentPlan } from "~/services/platform.v3.server"; import { EnvironmentParamSchema, v3ErrorPath } from "~/utils/pathBuilder"; import { findProjectBySlug } from "~/models/project.server"; import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; -import { ErrorsListPresenter, type ErrorGroup } from "~/presenters/v3/ErrorsListPresenter.server"; +import { + ErrorsListPresenter, + type ErrorsList, + type ErrorHourlyOccurrences, + type ErrorHourlyActivity, + type ErrorGroup, +} from 
"~/presenters/v3/ErrorsListPresenter.server"; import { $replica } from "~/db.server"; import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; import { PageBody, PageContainer } from "~/components/layout/AppLayout"; -import { Suspense, useMemo } from "react"; +import { Suspense } from "react"; +import { + Bar, + BarChart, + ReferenceLine, + ResponsiveContainer, + Tooltip, + type TooltipProps, + YAxis, +} from "recharts"; import { useOptimisticLocation } from "~/hooks/useOptimisticLocation"; import { Spinner } from "~/components/primitives/Spinner"; import { Paragraph } from "~/components/primitives/Paragraph"; @@ -29,8 +39,11 @@ import { TimeFilter } from "~/components/runs/v3/SharedFilters"; import { Button } from "~/components/primitives/Buttons"; import { Badge } from "~/components/primitives/Badge"; import { Header1, Header3 } from "~/components/primitives/Headers"; +import { formatDateTime } from "~/components/primitives/DateTime"; +import TooltipPortal from "~/components/primitives/TooltipPortal"; import { formatDistanceToNow } from "date-fns"; import { cn } from "~/utils/cn"; +import { formatNumberCompact } from "~/utils/numberFormatter"; import { CopyableTableCell, Table, @@ -103,8 +116,21 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw error; }); + const hourlyOccurrencesPromise = listPromise.then((result) => { + if ("error" in result) return {} as ErrorHourlyOccurrences; + const fingerprints = result.errorGroups.map((g) => g.fingerprint); + if (fingerprints.length === 0) return {} as ErrorHourlyOccurrences; + return presenter.getHourlyOccurrences( + project.organizationId, + project.id, + environment.id, + fingerprints + ); + }); + return typeddefer({ data: listPromise, + hourlyOccurrences: hourlyOccurrencesPromise, defaultPeriod: "7d", retentionLimitDays, organizationSlug, @@ -114,8 +140,15 @@ export const loader = async ({ 
request, params }: LoaderFunctionArgs) => { }; export default function Page() { - const { data, defaultPeriod, retentionLimitDays, organizationSlug, projectParam, envParam } = - useTypedLoaderData(); + const { + data, + hourlyOccurrences, + defaultPeriod, + retentionLimitDays, + organizationSlug, + projectParam, + envParam, + } = useTypedLoaderData(); return ( <> @@ -180,6 +213,7 @@ export default function Page() { /> ["data"]>, { error: string }>; + list?: ErrorsList; defaultPeriod?: string; retentionLimitDays: number; }) { @@ -255,11 +289,13 @@ function FiltersBar({ function ErrorsList({ errorGroups, + hourlyOccurrences, organizationSlug, projectParam, envParam, }: { errorGroups: ErrorGroup[]; + hourlyOccurrences: Promise; organizationSlug: string; projectParam: string; envParam: string; @@ -284,7 +320,8 @@ function ErrorsList({ ID Error Occurrences - Tasks + Past 24h + Task First seen Last seen Go to page @@ -295,6 +332,7 @@ function ErrorsList({ ; organizationSlug: string; projectParam: string; envParam: string; @@ -323,18 +363,32 @@ function ErrorGroupRow({ { fingerprint: errorGroup.fingerprint } ); - const errorMessage = `${errorGroup.errorType}: ${errorGroup.errorMessage}`; + const errorMessage = `${errorGroup.errorMessage}`; return ( {errorGroup.fingerprint.slice(-8)} + {errorGroup.taskIdentifier} {errorMessage} {errorGroup.count.toLocaleString()} - {errorGroup.affectedTasks} + + }> + }> + {(data) => { + const activity = data[errorGroup.fingerprint]; + return activity ? ( + + ) : ( + + ); + }} + + + {formatDistanceToNow(errorGroup.firstSeen, { addSuffix: true })} @@ -345,3 +399,73 @@ function ErrorGroupRow({ ); } + +function ErrorActivityGraph({ activity }: { activity: ErrorHourlyActivity }) { + const maxCount = Math.max(...activity.map((d) => d.count)); + + return ( +
+
+ + + + } + allowEscapeViewBox={{ x: true, y: true }} + wrapperStyle={{ zIndex: 1000 }} + animationDuration={0} + /> + + {maxCount > 0 && ( + + )} + + +
+ + {formatNumberCompact(maxCount)} + +
+ ); +} + +const ErrorActivityTooltip = ({ active, payload }: TooltipProps) => { + if (active && payload && payload.length > 0) { + const entry = payload[0].payload as { date: Date; count: number }; + const date = entry.date instanceof Date ? entry.date : new Date(entry.date); + const formattedDate = formatDateTime(date, "UTC", [], false, true); + + return ( + +
+ {formattedDate} +
+ {entry.count}{" "} + + {entry.count === 1 ? "occurrence" : "occurrences"} + +
+
+
+ ); + } + + return null; +}; + +function ErrorActivityBlankState() { + return ( +
+ {[...Array(24)].map((_, i) => ( +
+ ))} +
+ ); +} diff --git a/internal-packages/clickhouse/schema/022_create_errors_v1_table.sql b/internal-packages/clickhouse/schema/022_create_errors_v1_table.sql index 1a63e58c4f6..ecc6e9fdff6 100644 --- a/internal-packages/clickhouse/schema/022_create_errors_v1_table.sql +++ b/internal-packages/clickhouse/schema/022_create_errors_v1_table.sql @@ -1,11 +1,12 @@ -- +goose Up --- Aggregated error groups table +-- Aggregated error groups table (per task + fingerprint) CREATE TABLE trigger_dev.errors_v1 ( organization_id String, project_id String, environment_id String, + task_identifier String, error_fingerprint String, -- Error details (samples from occurrences) @@ -13,27 +14,24 @@ CREATE TABLE trigger_dev.errors_v1 error_message String, sample_stack_trace String, - -- TTL tracking column (regular column for TTL - stores max created_at) - last_seen_date DateTime64(3), + -- SimpleAggregateFunction stores raw values and applies the function during merge, + -- avoiding binary state encoding issues with AggregateFunction. 
+ last_seen_date SimpleAggregateFunction(max, DateTime), - -- Aggregated statistics using AggregateFunction - first_seen AggregateFunction(min, DateTime64(3)), - last_seen AggregateFunction(max, DateTime64(3)), + first_seen SimpleAggregateFunction(min, DateTime64(3)), + last_seen SimpleAggregateFunction(max, DateTime64(3)), occurrence_count AggregateFunction(sum, UInt64), - affected_tasks AggregateFunction(uniq, String), affected_task_versions AggregateFunction(uniq, String), -- Samples for debugging sample_run_id AggregateFunction(any, String), sample_friendly_id AggregateFunction(any, String), - sample_task_identifier AggregateFunction(any, String), -- Status distribution status_distribution AggregateFunction(sumMap, Array(String), Array(UInt64)) ) ENGINE = AggregatingMergeTree() -PARTITION BY organization_id -ORDER BY (organization_id, project_id, environment_id, error_fingerprint) +ORDER BY (organization_id, project_id, environment_id, task_identifier, error_fingerprint) TTL last_seen_date + INTERVAL 90 DAY SETTINGS index_granularity = 8192; @@ -45,26 +43,22 @@ SELECT organization_id, project_id, environment_id, + task_identifier, error_fingerprint, - -- Use any() for sample values - any(coalesce(JSONExtractString(error_text, 'type'), JSONExtractString(error_text, 'name'), 'Error')) as error_type, - any(coalesce(substring(JSONExtractString(error_text, 'message'), 1, 500), 'Unknown error')) as error_message, - any(coalesce(substring(JSONExtractString(error_text, 'stack'), 1, 2000), '')) as sample_stack_trace, + any(coalesce(nullIf(toString(error.data.type), ''), nullIf(toString(error.data.name), ''), 'Error')) as error_type, + any(coalesce(nullIf(substring(toString(error.data.message), 1, 500), ''), 'Unknown error')) as error_message, + any(coalesce(substring(toString(error.data.stack), 1, 2000), '')) as sample_stack_trace, - -- Regular column for TTL tracking - max(created_at) as last_seen_date, + toDateTime(max(created_at)) as last_seen_date, - -- Aggregate 
functions with State combinator - minState(created_at) as first_seen, - maxState(created_at) as last_seen, + min(created_at) as first_seen, + max(created_at) as last_seen, sumState(toUInt64(1)) as occurrence_count, - uniqState(task_identifier) as affected_tasks, uniqState(task_version) as affected_task_versions, anyState(run_id) as sample_run_id, anyState(friendly_id) as sample_friendly_id, - anyState(task_identifier) as sample_task_identifier, sumMapState([status], [toUInt64(1)]) as status_distribution FROM trigger_dev.task_runs_v2 @@ -76,6 +70,7 @@ GROUP BY organization_id, project_id, environment_id, + task_identifier, error_fingerprint; -- +goose Down diff --git a/internal-packages/clickhouse/src/errors.ts b/internal-packages/clickhouse/src/errors.ts index 3f3ed21142a..040561ebd5d 100644 --- a/internal-packages/clickhouse/src/errors.ts +++ b/internal-packages/clickhouse/src/errors.ts @@ -4,15 +4,14 @@ import { ClickhouseReader } from "./client/types.js"; export const ErrorGroupsListQueryResult = z.object({ error_fingerprint: z.string(), + task_identifier: z.string(), error_type: z.string(), error_message: z.string(), first_seen: z.string(), last_seen: z.string(), occurrence_count: z.number(), - affected_tasks: z.number(), sample_run_id: z.string(), sample_friendly_id: z.string(), - sample_task_identifier: z.string(), }); export type ErrorGroupsListQueryResult = z.infer; @@ -30,15 +29,14 @@ export function getErrorGroupsListQueryBuilder( baseQuery: ` SELECT error_fingerprint, + task_identifier, any(error_type) as error_type, any(error_message) as error_message, - toString(minMerge(first_seen)) as first_seen, - toString(maxMerge(last_seen)) as last_seen, + toString(toUnixTimestamp64Milli(min(first_seen))) as first_seen, + toString(toUnixTimestamp64Milli(max(last_seen))) as last_seen, toUInt64(sumMerge(occurrence_count)) as occurrence_count, - toUInt64(uniqMerge(affected_tasks)) as affected_tasks, anyMerge(sample_run_id) as sample_run_id, - 
anyMerge(sample_friendly_id) as sample_friendly_id, - anyMerge(sample_task_identifier) as sample_task_identifier + anyMerge(sample_friendly_id) as sample_friendly_id FROM trigger_dev.errors_v1 `, schema: ErrorGroupsListQueryResult, @@ -48,15 +46,14 @@ export function getErrorGroupsListQueryBuilder( export const ErrorGroupQueryResult = z.object({ error_fingerprint: z.string(), + task_identifier: z.string(), error_type: z.string(), error_message: z.string(), first_seen: z.string(), last_seen: z.string(), occurrence_count: z.number(), - affected_tasks: z.number(), sample_run_id: z.string(), sample_friendly_id: z.string(), - sample_task_identifier: z.string(), }); export type ErrorGroupQueryResult = z.infer; @@ -82,22 +79,21 @@ export function getErrorGroups(ch: ClickhouseReader, settings?: ClickHouseSettin query: ` SELECT error_fingerprint, + task_identifier, any(error_type) as error_type, any(error_message) as error_message, - toString(minMerge(first_seen)) as first_seen, - toString(maxMerge(last_seen)) as last_seen, + toString(toUnixTimestamp64Milli(min(first_seen))) as first_seen, + toString(toUnixTimestamp64Milli(max(last_seen))) as last_seen, toUInt64(sumMerge(occurrence_count)) as occurrence_count, - toUInt64(uniqMerge(affected_tasks)) as affected_tasks, anyMerge(sample_run_id) as sample_run_id, - anyMerge(sample_friendly_id) as sample_friendly_id, - anyMerge(sample_task_identifier) as sample_task_identifier + anyMerge(sample_friendly_id) as sample_friendly_id FROM trigger_dev.errors_v1 WHERE organization_id = {organizationId: String} AND project_id = {projectId: String} AND environment_id = {environmentId: String} - AND maxMerge(last_seen) >= now() - INTERVAL {days: Int64} DAY - GROUP BY error_fingerprint + GROUP BY error_fingerprint, task_identifier + HAVING max(last_seen) >= now() - INTERVAL {days: Int64} DAY ORDER BY last_seen DESC LIMIT {limit: Int64} OFFSET {offset: Int64} @@ -172,6 +168,58 @@ export function getErrorInstancesListQueryBuilder( }); } 
+export const ErrorHourlyOccurrencesQueryResult = z.object({ + error_fingerprint: z.string(), + hour_epoch: z.number(), + count: z.number(), +}); + +export type ErrorHourlyOccurrencesQueryResult = z.infer; + +export const ErrorHourlyOccurrencesQueryParams = z.object({ + organizationId: z.string(), + projectId: z.string(), + environmentId: z.string(), + fingerprints: z.array(z.string()), + hours: z.number().int().default(24), +}); + +export type ErrorHourlyOccurrencesQueryParams = z.infer; + +/** + * Gets hourly occurrence counts for specific error fingerprints over the past N hours. + * Queries task_runs_v2 directly, grouped by fingerprint and hour. + */ +export function getErrorHourlyOccurrences(ch: ClickhouseReader, settings?: ClickHouseSettings) { + return ch.query({ + name: "getErrorHourlyOccurrences", + query: ` + SELECT + error_fingerprint, + toUnixTimestamp(toStartOfHour(created_at)) as hour_epoch, + count() as count + FROM trigger_dev.task_runs_v2 FINAL + WHERE + organization_id = {organizationId: String} + AND project_id = {projectId: String} + AND environment_id = {environmentId: String} + AND created_at >= now() - INTERVAL {hours: Int64} HOUR + AND error_fingerprint IN {fingerprints: Array(String)} + AND status IN ('SYSTEM_FAILURE', 'CRASHED', 'INTERRUPTED', 'COMPLETED_WITH_ERRORS') + AND _is_deleted = 0 + GROUP BY + error_fingerprint, + hour_epoch + ORDER BY + error_fingerprint ASC, + hour_epoch ASC + `, + schema: ErrorHourlyOccurrencesQueryResult, + params: ErrorHourlyOccurrencesQueryParams, + settings, + }); +} + /** * Gets individual run instances for a specific error fingerprint. 
*/ diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index cfb2bd097f9..58ee7dca17a 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -32,6 +32,7 @@ import { getErrorInstances, getErrorGroupsListQueryBuilder, getErrorInstancesListQueryBuilder, + getErrorHourlyOccurrences, } from "./errors.js"; import { Logger, type LogLevel } from "@trigger.dev/core/logger"; import type { Agent as HttpAgent } from "http"; @@ -241,6 +242,7 @@ export class ClickHouse { return { getGroups: getErrorGroups(this.reader), getInstances: getErrorInstances(this.reader), + getHourlyOccurrences: getErrorHourlyOccurrences(this.reader), listQueryBuilder: getErrorGroupsListQueryBuilder(this.reader), instancesQueryBuilder: getErrorInstancesListQueryBuilder(this.reader), }; From abfceaa71d93c4dfcb5ce47ad5182765d3ad70c8 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Mon, 2 Mar 2026 17:48:46 +0000 Subject: [PATCH 05/12] Not being used --- ...Param.env.$envParam.errors.$fingerprint.ts | 51 ------------------- 1 file changed, 51 deletions(-) delete mode 100644 apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint.ts diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint.ts b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint.ts deleted file mode 100644 index 6230e7ac697..00000000000 --- a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint.ts +++ /dev/null @@ -1,51 +0,0 @@ -import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; -import { json } from "@remix-run/node"; -import { requireUser } from "~/services/session.server"; -import { EnvironmentParamSchema } from "~/utils/pathBuilder"; -import { findProjectBySlug } from 
"~/models/project.server"; -import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; -import { ErrorGroupPresenter, ErrorGroupOptionsSchema } from "~/presenters/v3/ErrorGroupPresenter.server"; -import { $replica } from "~/db.server"; -import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; - -export const loader = async ({ request, params }: LoaderFunctionArgs) => { - const user = await requireUser(request); - const userId = user.id; - - const { projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); - const fingerprint = params.fingerprint; - - if (!fingerprint) { - throw new Response("Fingerprint parameter is required", { status: 400 }); - } - - const project = await findProjectBySlug(organizationSlug, projectParam, userId); - if (!project) { - throw new Response("Project not found", { status: 404 }); - } - - const environment = await findEnvironmentBySlug(project.id, envParam, userId); - if (!environment) { - throw new Response("Environment not found", { status: 404 }); - } - - // Get pagination from query params - const url = new URL(request.url); - const cursor = url.searchParams.get("cursor") ?? 
undefined; - - const options = ErrorGroupOptionsSchema.parse({ - userId, - projectId: project.id, - fingerprint, - cursor, - }) as any; // Validated by ErrorGroupOptionsSchema at runtime - - const presenter = new ErrorGroupPresenter($replica, logsClickhouseClient); - const result = await presenter.call(project.organizationId, environment.id, options); - - return json({ - errorGroup: result.errorGroup, - instances: result.instances, - pagination: result.pagination, - }); -}; From c5b966d769699444c246537deb8fd53d45ec33d2 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Mon, 2 Mar 2026 22:27:42 +0000 Subject: [PATCH 06/12] Added chart to the error page --- .../app/components/primitives/DateTime.tsx | 34 ++ .../v3/ErrorGroupPresenter.server.ts | 224 ++++++--- .../v3/ErrorsListPresenter.server.ts | 14 +- .../route.tsx | 308 ++++++++---- .../route.tsx | 454 +++++++++++++++++ .../route.tsx | 469 +----------------- packages/core/src/v3/isomorphic/friendlyId.ts | 1 + 7 files changed, 868 insertions(+), 636 deletions(-) create mode 100644 apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx diff --git a/apps/webapp/app/components/primitives/DateTime.tsx b/apps/webapp/app/components/primitives/DateTime.tsx index 5241a976ee7..7fe687a29af 100644 --- a/apps/webapp/app/components/primitives/DateTime.tsx +++ b/apps/webapp/app/components/primitives/DateTime.tsx @@ -1,5 +1,6 @@ import { GlobeAltIcon, GlobeAmericasIcon } from "@heroicons/react/20/solid"; import { useRouteLoaderData } from "@remix-run/react"; +import { formatDistanceToNow } from "date-fns"; import { Laptop } from "lucide-react"; import { memo, type ReactNode, useMemo, useSyncExternalStore } from "react"; import { CopyButton } from "./CopyButton"; @@ -357,6 +358,39 @@ function formatDateTimeAccurate( return `${datePart} ${timePart}`; } +type RelativeDateTimeProps = { + date: Date | string; + timeZone?: string; +}; + +export const RelativeDateTime = ({ date, 
timeZone }: RelativeDateTimeProps) => { + const locales = useLocales(); + const userTimeZone = useUserTimeZone(); + + const realDate = useMemo(() => (typeof date === "string" ? new Date(date) : date), [date]); + + const relativeText = useMemo(() => { + const text = formatDistanceToNow(realDate, { addSuffix: true }); + return text.charAt(0).toUpperCase() + text.slice(1); + }, [realDate]); + + return ( + {relativeText}} + content={ + + } + side="right" + asChild={true} + /> + ); +}; + export const DateTimeShort = ({ date, hour12 = true }: DateTimeProps) => { const locales = useLocales(); const userTimeZone = useUserTimeZone(); diff --git a/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts b/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts index 281bb030c7d..d3e7412048c 100644 --- a/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts @@ -59,6 +59,27 @@ function decodeCursor(cursor: string): ErrorInstanceCursor | null { } } +function parseClickHouseDateTime(value: string): Date { + const asNum = Number(value); + if (!isNaN(asNum) && asNum > 1e12) { + return new Date(asNum); + } + return new Date(value.replace(" ", "T") + "Z"); +} + +export type ErrorGroupSummary = { + fingerprint: string; + errorType: string; + errorMessage: string; + stackTrace?: string; + taskIdentifier: string; + count: number; + firstSeen: Date; + lastSeen: Date; +}; + +export type ErrorGroupHourlyActivity = Array<{ date: Date; count: number }>; + export class ErrorGroupPresenter extends BasePresenter { constructor( private readonly replica: PrismaClientOrTransaction, @@ -70,13 +91,7 @@ export class ErrorGroupPresenter extends BasePresenter { public async call( organizationId: string, environmentId: string, - { - userId, - projectId, - fingerprint, - cursor, - pageSize = DEFAULT_PAGE_SIZE, - }: ErrorGroupOptions + { userId, projectId, fingerprint, cursor, pageSize = DEFAULT_PAGE_SIZE }: ErrorGroupOptions 
) { const displayableEnvironment = await findDisplayableEnvironment(environmentId, userId); @@ -84,10 +99,148 @@ export class ErrorGroupPresenter extends BasePresenter { throw new ServiceValidationError("No environment found"); } - // Use the error instances query builder + // Run summary (aggregated) and instances queries in parallel + const [summary, instancesResult] = await Promise.all([ + this.getSummary(organizationId, projectId, environmentId, fingerprint), + this.getInstances(organizationId, projectId, environmentId, fingerprint, cursor, pageSize), + ]); + + // Get stack trace from the most recent instance + let stackTrace: string | undefined; + if (instancesResult.instances.length > 0) { + const firstInstance = instancesResult.instances[0]; + try { + const errorData = JSON.parse(firstInstance.error_text) as Record; + stackTrace = (errorData.stack || errorData.stacktrace) as string | undefined; + } catch { + // no stack trace available + } + } + + // Build error group combining aggregated summary with instance stack trace + let errorGroup: ErrorGroupSummary | undefined; + if (summary) { + errorGroup = { + ...summary, + stackTrace, + }; + } + + // Transform instances + const transformedInstances = instancesResult.instances.map((instance) => { + let parsedError: any; + try { + parsedError = JSON.parse(instance.error_text); + } catch { + parsedError = { message: instance.error_text }; + } + + return { + runId: instance.run_id, + friendlyId: instance.friendly_id, + taskIdentifier: instance.task_identifier, + createdAt: new Date(parseInt(instance.created_at) * 1000), + status: instance.status, + error: parsedError, + traceId: instance.trace_id, + taskVersion: instance.task_version, + }; + }); + + return { + errorGroup, + instances: transformedInstances, + runFriendlyIds: transformedInstances.map((i) => i.friendlyId), + pagination: instancesResult.pagination, + }; + } + + public async getHourlyOccurrences( + organizationId: string, + projectId: string, + 
environmentId: string, + fingerprint: string + ): Promise { + const hours = 168; // 7 days + + const [queryError, records] = await this.clickhouse.errors.getHourlyOccurrences({ + organizationId, + projectId, + environmentId, + fingerprints: [fingerprint], + hours, + }); + + if (queryError) { + throw queryError; + } + + const buckets: number[] = []; + const nowMs = Date.now(); + for (let i = hours - 1; i >= 0; i--) { + const hourStart = Math.floor((nowMs - i * 3_600_000) / 3_600_000) * 3_600; + buckets.push(hourStart); + } + + const byHour = new Map(); + for (const row of records ?? []) { + byHour.set(row.hour_epoch, row.count); + } + + return buckets.map((epoch) => ({ + date: new Date(epoch * 1000), + count: byHour.get(epoch) ?? 0, + })); + } + + private async getSummary( + organizationId: string, + projectId: string, + environmentId: string, + fingerprint: string + ): Promise | undefined> { + const queryBuilder = this.clickhouse.errors.listQueryBuilder(); + + queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); + queryBuilder.where("project_id = {projectId: String}", { projectId }); + queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); + queryBuilder.where("error_fingerprint = {fingerprint: String}", { fingerprint }); + + queryBuilder.groupBy("error_fingerprint, task_identifier"); + queryBuilder.limit(1); + + const [queryError, records] = await queryBuilder.execute(); + + if (queryError) { + throw queryError; + } + + if (!records || records.length === 0) { + return undefined; + } + + const record = records[0]; + return { + fingerprint: record.error_fingerprint, + errorType: record.error_type, + errorMessage: record.error_message, + taskIdentifier: record.task_identifier, + count: record.occurrence_count, + firstSeen: parseClickHouseDateTime(record.first_seen), + lastSeen: parseClickHouseDateTime(record.last_seen), + }; + } + + private async getInstances( + organizationId: string, + projectId: string, 
+ environmentId: string, + fingerprint: string, + cursor: string | undefined, + pageSize: number + ) { const queryBuilder = this.clickhouse.errors.instancesQueryBuilder(); - // Apply filters queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); queryBuilder.where("project_id = {projectId: String}", { projectId }); queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); @@ -96,7 +249,6 @@ export class ErrorGroupPresenter extends BasePresenter { }); queryBuilder.where("_is_deleted = 0"); - // Cursor-based pagination const decodedCursor = cursor ? decodeCursor(cursor) : null; if (decodedCursor) { queryBuilder.where( @@ -121,7 +273,6 @@ export class ErrorGroupPresenter extends BasePresenter { const hasMore = results.length > pageSize; const instances = results.slice(0, pageSize); - // Build next cursor from the last item let nextCursor: string | undefined; if (hasMore && instances.length > 0) { const lastInstance = instances[instances.length - 1]; @@ -131,57 +282,8 @@ export class ErrorGroupPresenter extends BasePresenter { }); } - // Get error group summary from the first instance - let errorGroup: - | { - errorType: string; - errorMessage: string; - stackTrace?: string; - } - | undefined; - - if (instances.length > 0) { - const firstInstance = instances[0]; - try { - const errorData = JSON.parse(firstInstance.error_text); - errorGroup = { - errorType: errorData.type || errorData.name || "Error", - errorMessage: errorData.message || "Unknown error", - stackTrace: errorData.stack || errorData.stacktrace, - }; - } catch { - // If parsing fails, use fallback - errorGroup = { - errorType: "Error", - errorMessage: firstInstance.error_text.substring(0, 200), - }; - } - } - - // Transform results - const transformedInstances = instances.map((instance) => { - let parsedError: any; - try { - parsedError = JSON.parse(instance.error_text); - } catch { - parsedError = { message: instance.error_text }; - } - - return { - 
runId: instance.run_id, - friendlyId: instance.friendly_id, - taskIdentifier: instance.task_identifier, - createdAt: new Date(parseInt(instance.created_at) * 1000), - status: instance.status, - error: parsedError, - traceId: instance.trace_id, - taskVersion: instance.task_version, - }; - }); - return { - errorGroup, - instances: transformedInstances, + instances, pagination: { hasMore, nextCursor, diff --git a/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts b/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts index f900ec359d6..459753822ce 100644 --- a/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts @@ -53,12 +53,12 @@ export type ErrorHourlyActivity = ErrorHourlyOccurrences[string]; // Cursor for error groups pagination type ErrorGroupCursor = { - lastSeen: string; + occurrenceCount: number; fingerprint: string; }; const ErrorGroupCursorSchema = z.object({ - lastSeen: z.string(), + occurrenceCount: z.number(), fingerprint: z.string(), }); @@ -193,19 +193,19 @@ export class ErrorsListPresenter extends BasePresenter { ); } - // Cursor-based pagination + // Cursor-based pagination (sorted by occurrence_count DESC) const decodedCursor = cursor ? 
decodeCursor(cursor) : null; if (decodedCursor) { queryBuilder.having( - "(last_seen < {cursorLastSeen: String} OR (last_seen = {cursorLastSeen: String} AND error_fingerprint < {cursorFingerprint: String}))", + "(occurrence_count < {cursorOccurrenceCount: UInt64} OR (occurrence_count = {cursorOccurrenceCount: UInt64} AND error_fingerprint < {cursorFingerprint: String}))", { - cursorLastSeen: decodedCursor.lastSeen, + cursorOccurrenceCount: decodedCursor.occurrenceCount, cursorFingerprint: decodedCursor.fingerprint, } ); } - queryBuilder.orderBy("last_seen DESC, error_fingerprint DESC"); + queryBuilder.orderBy("occurrence_count DESC, error_fingerprint DESC"); queryBuilder.limit(pageSize + 1); const [queryError, records] = await queryBuilder.execute(); @@ -223,7 +223,7 @@ export class ErrorsListPresenter extends BasePresenter { if (hasMore && errorGroups.length > 0) { const lastError = errorGroups[errorGroups.length - 1]; nextCursor = encodeCursor({ - lastSeen: lastError.last_seen, + occurrenceCount: lastError.occurrence_count, fingerprint: lastError.error_fingerprint, }); } diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx index 68216267555..04637855bed 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx @@ -1,34 +1,45 @@ import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; -import { type MetaFunction, Link } from "@remix-run/react"; -import { ArrowLeftIcon } from "@heroicons/react/20/solid"; +import { type MetaFunction } from "@remix-run/react"; import { ServiceValidationError } from "~/v3/services/baseService.server"; -import { - TypedAwait, - 
typeddefer, - type UseDataFunctionReturn, - useTypedLoaderData, -} from "remix-typedjson"; +import { TypedAwait, typeddefer, useTypedLoaderData } from "remix-typedjson"; import { requireUser } from "~/services/session.server"; -import { EnvironmentParamSchema, v3ErrorsPath, v3RunPath } from "~/utils/pathBuilder"; +import { EnvironmentParamSchema, v3ErrorsPath } from "~/utils/pathBuilder"; import { findProjectBySlug } from "~/models/project.server"; import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { ErrorGroupPresenter, - type ErrorInstance, + type ErrorGroupHourlyActivity, + type ErrorGroupSummary, } from "~/presenters/v3/ErrorGroupPresenter.server"; +import { + NextRunListPresenter, + type NextRunList, +} from "~/presenters/v3/NextRunListPresenter.server"; import { $replica } from "~/db.server"; -import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; +import { logsClickhouseClient, clickhouseClient } from "~/services/clickhouseInstance.server"; import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; import { PageBody, PageContainer } from "~/components/layout/AppLayout"; import { Suspense } from "react"; import { Spinner } from "~/components/primitives/Spinner"; import { Paragraph } from "~/components/primitives/Paragraph"; import { Callout } from "~/components/primitives/Callout"; -import { Button } from "~/components/primitives/Buttons"; -import { Badge } from "~/components/primitives/Badge"; import { Header2, Header3 } from "~/components/primitives/Headers"; import { formatDistanceToNow } from "date-fns"; -import { cn } from "~/utils/cn"; +import { formatNumberCompact } from "~/utils/numberFormatter"; +import * as Property from "~/components/primitives/PropertyTable"; +import { TaskRunsTable } from "~/components/runs/v3/TaskRunsTable"; +import { DateTime, formatDateTime } from "~/components/primitives/DateTime"; +import { ErrorId } from "@trigger.dev/core/v3/isomorphic"; +import { + Bar, + 
BarChart, + ReferenceLine, + ResponsiveContainer, + Tooltip, + YAxis, + type TooltipProps, +} from "recharts"; +import TooltipPortal from "~/components/primitives/TooltipPortal"; export const meta: MetaFunction = ({ data }) => { return [ @@ -67,6 +78,24 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { projectId: project.id, fingerprint, }) + .then(async (result) => { + if (result.runFriendlyIds.length === 0) { + return { ...result, runList: undefined }; + } + + const runListPresenter = new NextRunListPresenter($replica, clickhouseClient); + const runList = await runListPresenter.call(project.organizationId, environment.id, { + userId, + projectId: project.id, + runId: result.runFriendlyIds, + pageSize: 25, + }); + + return { + ...result, + runList, + }; + }) .catch((error) => { if (error instanceof ServiceValidationError) { return { error: error.message }; @@ -74,8 +103,13 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw error; }); + const hourlyActivityPromise = presenter + .getHourlyOccurrences(project.organizationId, project.id, environment.id, fingerprint) + .catch(() => [] as ErrorGroupHourlyActivity); + return typeddefer({ data: detailPromise, + hourlyActivity: hourlyActivityPromise, organizationSlug, projectParam, envParam, @@ -84,7 +118,8 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }; export default function Page() { - const { data, organizationSlug, projectParam, envParam } = useTypedLoaderData(); + const { data, hourlyActivity, organizationSlug, projectParam, envParam, fingerprint } = + useTypedLoaderData(); const errorsPath = v3ErrorsPath( { slug: organizationSlug }, @@ -95,14 +130,13 @@ export default function Page() { return ( -
- - - - -
+ {ErrorId.toFriendlyId(fingerprint)}} + />
@@ -127,7 +161,6 @@ export default function Page() { } > {(result) => { - // Check if result contains an error if ("error" in result) { return (
@@ -140,7 +173,8 @@ export default function Page() { return ( ; organizationSlug: string; projectParam: string; envParam: string; @@ -187,18 +217,49 @@ function ErrorGroupDetail({ } return ( -
+
{/* Error Summary */} -
-
- - {errorGroup.errorType} - - {errorGroup.errorMessage} +
+ {errorGroup.errorMessage} + +
+ + + ID + + {ErrorId.toFriendlyId(errorGroup.fingerprint)} + + + + Task + + {errorGroup.taskIdentifier} + + + + + + + Occurrences + {formatNumberCompact(errorGroup.count)} + + + First seen + + + + + + Last seen + + {formatDistanceToNow(errorGroup.lastSeen, { addSuffix: true })} + + +
{errorGroup.stackTrace && ( -
+
Stack Trace @@ -209,79 +270,120 @@ function ErrorGroupDetail({ )}
- {/* Instances List */} -
- Error Instances ({instances.length.toLocaleString()}) + {/* Activity over past 7 days by hour */} +
+ Activity (past 7 days) + }> + } + > + {(activity) => + activity.length > 0 ? ( + + ) : ( + + ) + } + + +
- {instances.length === 0 ? ( - No error instances found. + {/* Runs Table */} +
+ Recent runs + {runList ? ( + ) : ( -
- {instances.map((instance) => ( - - ))} -
+ + No runs found for this error. + )}
); } -function ErrorInstanceRow({ - instance, - organizationSlug, - projectParam, - envParam, -}: { - instance: ErrorInstance; - organizationSlug: string; - projectParam: string; - envParam: string; -}) { - const runPath = v3RunPath( - { slug: organizationSlug }, - { slug: projectParam }, - { slug: envParam }, - { friendlyId: instance.friendlyId } - ); +function ActivityChart({ activity }: { activity: ErrorGroupHourlyActivity }) { + const maxCount = Math.max(...activity.map((d) => d.count)); return ( - -
-
-
- {instance.friendlyId} - {instance.status} -
- - Task: {instance.taskIdentifier} - - - {formatDistanceToNow(instance.createdAt, { addSuffix: true })} • Version:{" "} - {instance.taskVersion} - -
+
+
+ + + + } + allowEscapeViewBox={{ x: true, y: true }} + wrapperStyle={{ zIndex: 1000 }} + animationDuration={0} + /> + + + {maxCount > 0 && ( + + )} + +
+ + {formatNumberCompact(maxCount)} + +
+ ); +} - {/* Show error details if available */} - {instance.error && typeof instance.error === "object" && "message" in instance.error && ( -
- - {String(instance.error.message)} - +const ActivityChartTooltip = ({ active, payload }: TooltipProps) => { + if (active && payload && payload.length > 0) { + const entry = payload[0].payload as { date: Date; count: number }; + const date = entry.date instanceof Date ? entry.date : new Date(entry.date); + const formattedDate = formatDateTime(date, "UTC", [], false, true); + + return ( + +
+ {formattedDate} +
+ {entry.count}{" "} + + {entry.count === 1 ? "occurrence" : "occurrences"} + +
- )} - +
+ ); + } + + return null; +}; + +function ActivityChartBlankState() { + return ( +
+ {[...Array(42)].map((_, i) => ( +
+ ))} +
); } diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx new file mode 100644 index 00000000000..560141f45ce --- /dev/null +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx @@ -0,0 +1,454 @@ +import { XMarkIcon } from "@heroicons/react/20/solid"; +import { Form, type MetaFunction } from "@remix-run/react"; +import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; +import { ErrorId } from "@trigger.dev/core/v3/isomorphic"; +import { Suspense } from "react"; +import { + Bar, + BarChart, + ReferenceLine, + ResponsiveContainer, + Tooltip, + YAxis, + type TooltipProps, +} from "recharts"; +import { TypedAwait, typeddefer, useTypedLoaderData } from "remix-typedjson"; +import { PageBody } from "~/components/layout/AppLayout"; +import { LogsSearchInput } from "~/components/logs/LogsSearchInput"; +import { LogsTaskFilter } from "~/components/logs/LogsTaskFilter"; +import { Button } from "~/components/primitives/Buttons"; +import { Callout } from "~/components/primitives/Callout"; +import { formatDateTime, RelativeDateTime } from "~/components/primitives/DateTime"; +import { Header3 } from "~/components/primitives/Headers"; +import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; +import { Paragraph } from "~/components/primitives/Paragraph"; +import { Spinner } from "~/components/primitives/Spinner"; +import { + CopyableTableCell, + Table, + TableBody, + TableCell, + TableCellChevron, + TableHeader, + TableHeaderCell, + TableRow, +} from "~/components/primitives/Table"; +import TooltipPortal from "~/components/primitives/TooltipPortal"; +import { TimeFilter } from "~/components/runs/v3/SharedFilters"; +import { $replica } from "~/db.server"; +import { useOptimisticLocation } from 
"~/hooks/useOptimisticLocation"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; +import { + ErrorsListPresenter, + type ErrorGroup, + type ErrorHourlyActivity, + type ErrorHourlyOccurrences, + type ErrorsList, +} from "~/presenters/v3/ErrorsListPresenter.server"; +import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; +import { getCurrentPlan } from "~/services/platform.v3.server"; +import { requireUser } from "~/services/session.server"; +import { formatNumberCompact } from "~/utils/numberFormatter"; +import { EnvironmentParamSchema, v3ErrorPath } from "~/utils/pathBuilder"; +import { ServiceValidationError } from "~/v3/services/baseService.server"; + +export const meta: MetaFunction = () => { + return [ + { + title: `Errors | Trigger.dev`, + }, + ]; +}; + +export const loader = async ({ request, params }: LoaderFunctionArgs) => { + const user = await requireUser(request); + const userId = user.id; + + const { projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); + + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) { + throw new Response("Project not found", { status: 404 }); + } + + const environment = await findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) { + throw new Response("Environment not found", { status: 404 }); + } + + // Get filters from query params + const url = new URL(request.url); + const tasks = url.searchParams.getAll("tasks").filter((t) => t.length > 0); + const search = url.searchParams.get("search") ?? undefined; + const period = url.searchParams.get("period") ?? undefined; + const fromStr = url.searchParams.get("from"); + const toStr = url.searchParams.get("to"); + const from = fromStr ? parseInt(fromStr, 10) : undefined; + const to = toStr ? 
parseInt(toStr, 10) : undefined; + + // Get the user's plan to determine retention limit + const plan = await getCurrentPlan(project.organizationId); + const retentionLimitDays = plan?.v3Subscription?.plan?.limits.logRetentionDays.number ?? 30; + + const presenter = new ErrorsListPresenter($replica, logsClickhouseClient); + + const listPromise = presenter + .call(project.organizationId, environment.id, { + userId, + projectId: project.id, + tasks: tasks.length > 0 ? tasks : undefined, + search, + period, + from, + to, + defaultPeriod: "7d", + retentionLimitDays, + }) + .catch((error) => { + if (error instanceof ServiceValidationError) { + return { error: error.message }; + } + throw error; + }); + + const hourlyOccurrencesPromise = listPromise.then((result) => { + if ("error" in result) return {} as ErrorHourlyOccurrences; + const fingerprints = result.errorGroups.map((g) => g.fingerprint); + if (fingerprints.length === 0) return {} as ErrorHourlyOccurrences; + return presenter.getHourlyOccurrences( + project.organizationId, + project.id, + environment.id, + fingerprints + ); + }); + + return typeddefer({ + data: listPromise, + hourlyOccurrences: hourlyOccurrencesPromise, + defaultPeriod: "7d", + retentionLimitDays, + organizationSlug, + projectParam, + envParam, + }); +}; + +export default function Page() { + const { + data, + hourlyOccurrences, + defaultPeriod, + retentionLimitDays, + organizationSlug, + projectParam, + envParam, + } = useTypedLoaderData(); + + return ( + <> + + + + + + +
+
+
+ + Loading errors… +
+
+
+ } + > + + +
+ + Unable to load errors. Please refresh the page or try again in a moment. + +
+
+ } + > + {(result) => { + // Check if result contains an error + if ("error" in result) { + return ( +
+ +
+ + {result.error} + +
+
+ ); + } + return ( +
+ + +
+ ); + }} + + + + + ); +} + +function FiltersBar({ + list, + defaultPeriod, + retentionLimitDays, +}: { + list?: ErrorsList; + defaultPeriod?: string; + retentionLimitDays: number; +}) { + const location = useOptimisticLocation(); + const searchParams = new URLSearchParams(location.search); + const hasFilters = + searchParams.has("tasks") || + searchParams.has("search") || + searchParams.has("period") || + searchParams.has("from") || + searchParams.has("to"); + + return ( +
+
+ {list ? ( + <> + + + + {hasFilters && ( +
+
+
+ ); +} + +function ErrorsList({ + errorGroups, + hourlyOccurrences, + organizationSlug, + projectParam, + envParam, +}: { + errorGroups: ErrorGroup[]; + hourlyOccurrences: Promise; + organizationSlug: string; + projectParam: string; + envParam: string; +}) { + if (errorGroups.length === 0) { + return ( +
+
+ No errors found + + No errors have been recorded in the selected time period. + +
+
+ ); + } + + return ( + + + + ID + Task + Error + Occurrences + Past 24h + First seen + Last seen + + + + {errorGroups.map((errorGroup) => ( + + ))} + +
+ ); +} + +function ErrorGroupRow({ + errorGroup, + hourlyOccurrences, + organizationSlug, + projectParam, + envParam, +}: { + errorGroup: ErrorGroup; + hourlyOccurrences: Promise; + organizationSlug: string; + projectParam: string; + envParam: string; +}) { + const errorPath = v3ErrorPath( + { slug: organizationSlug }, + { slug: projectParam }, + { slug: envParam }, + { fingerprint: errorGroup.fingerprint } + ); + + const errorMessage = `${errorGroup.errorMessage}`; + + return ( + + + {errorGroup.fingerprint.slice(-8)} + + {errorGroup.taskIdentifier} + + {errorMessage} + + {errorGroup.count.toLocaleString()} + + }> + }> + {(data) => { + const activity = data[errorGroup.fingerprint]; + return activity ? ( + + ) : ( + + ); + }} + + + + + + + + + + + ); +} + +function ErrorActivityGraph({ activity }: { activity: ErrorHourlyActivity }) { + const maxCount = Math.max(...activity.map((d) => d.count)); + + return ( +
+
+ + + + } + allowEscapeViewBox={{ x: true, y: true }} + wrapperStyle={{ zIndex: 1000 }} + animationDuration={0} + /> + + + {maxCount > 0 && ( + + )} + + +
+ + {formatNumberCompact(maxCount)} + +
+ ); +} + +const ErrorActivityTooltip = ({ active, payload }: TooltipProps) => { + if (active && payload && payload.length > 0) { + const entry = payload[0].payload as { date: Date; count: number }; + const date = entry.date instanceof Date ? entry.date : new Date(entry.date); + const formattedDate = formatDateTime(date, "UTC", [], false, true); + + return ( + +
+ {formattedDate} +
+ {entry.count}{" "} + + {entry.count === 1 ? "occurrence" : "occurrences"} + +
+
+
+ ); + } + + return null; +}; + +function ErrorActivityBlankState() { + return ( +
+ {[...Array(24)].map((_, i) => ( +
+ ))} +
+ ); +} diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx index 2de7a3ba0af..f6723ddebaa 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors/route.tsx @@ -1,471 +1,10 @@ -import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; -import { type MetaFunction, Form, Link, Outlet } from "@remix-run/react"; -import { XMarkIcon } from "@heroicons/react/20/solid"; -import { ServiceValidationError } from "~/v3/services/baseService.server"; -import { TypedAwait, typeddefer, useTypedLoaderData } from "remix-typedjson"; -import { requireUser } from "~/services/session.server"; -import { getCurrentPlan } from "~/services/platform.v3.server"; -import { EnvironmentParamSchema, v3ErrorPath } from "~/utils/pathBuilder"; -import { findProjectBySlug } from "~/models/project.server"; -import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; -import { - ErrorsListPresenter, - type ErrorsList, - type ErrorHourlyOccurrences, - type ErrorHourlyActivity, - type ErrorGroup, -} from "~/presenters/v3/ErrorsListPresenter.server"; -import { $replica } from "~/db.server"; -import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; -import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; -import { PageBody, PageContainer } from "~/components/layout/AppLayout"; -import { Suspense } from "react"; -import { - Bar, - BarChart, - ReferenceLine, - ResponsiveContainer, - Tooltip, - type TooltipProps, - YAxis, -} from "recharts"; -import { useOptimisticLocation } from "~/hooks/useOptimisticLocation"; -import { Spinner } from "~/components/primitives/Spinner"; -import { Paragraph } from 
"~/components/primitives/Paragraph"; -import { Callout } from "~/components/primitives/Callout"; -import { LogsSearchInput } from "~/components/logs/LogsSearchInput"; -import { LogsTaskFilter } from "~/components/logs/LogsTaskFilter"; -import { TimeFilter } from "~/components/runs/v3/SharedFilters"; -import { Button } from "~/components/primitives/Buttons"; -import { Badge } from "~/components/primitives/Badge"; -import { Header1, Header3 } from "~/components/primitives/Headers"; -import { formatDateTime } from "~/components/primitives/DateTime"; -import TooltipPortal from "~/components/primitives/TooltipPortal"; -import { formatDistanceToNow } from "date-fns"; -import { cn } from "~/utils/cn"; -import { formatNumberCompact } from "~/utils/numberFormatter"; -import { - CopyableTableCell, - Table, - TableBlankRow, - TableBody, - TableCell, - TableCellChevron, - TableCellMenu, - TableHeader, - TableHeaderCell, - TableRow, -} from "~/components/primitives/Table"; - -export const meta: MetaFunction = () => { - return [ - { - title: `Errors | Trigger.dev`, - }, - ]; -}; - -export const loader = async ({ request, params }: LoaderFunctionArgs) => { - const user = await requireUser(request); - const userId = user.id; - - const { projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); - - const project = await findProjectBySlug(organizationSlug, projectParam, userId); - if (!project) { - throw new Response("Project not found", { status: 404 }); - } - - const environment = await findEnvironmentBySlug(project.id, envParam, userId); - if (!environment) { - throw new Response("Environment not found", { status: 404 }); - } - - // Get filters from query params - const url = new URL(request.url); - const tasks = url.searchParams.getAll("tasks").filter((t) => t.length > 0); - const search = url.searchParams.get("search") ?? undefined; - const period = url.searchParams.get("period") ?? 
undefined; - const fromStr = url.searchParams.get("from"); - const toStr = url.searchParams.get("to"); - const from = fromStr ? parseInt(fromStr, 10) : undefined; - const to = toStr ? parseInt(toStr, 10) : undefined; - - // Get the user's plan to determine retention limit - const plan = await getCurrentPlan(project.organizationId); - const retentionLimitDays = plan?.v3Subscription?.plan?.limits.logRetentionDays.number ?? 30; - - const presenter = new ErrorsListPresenter($replica, logsClickhouseClient); - - const listPromise = presenter - .call(project.organizationId, environment.id, { - userId, - projectId: project.id, - tasks: tasks.length > 0 ? tasks : undefined, - search, - period, - from, - to, - defaultPeriod: "7d", - retentionLimitDays, - }) - .catch((error) => { - if (error instanceof ServiceValidationError) { - return { error: error.message }; - } - throw error; - }); - - const hourlyOccurrencesPromise = listPromise.then((result) => { - if ("error" in result) return {} as ErrorHourlyOccurrences; - const fingerprints = result.errorGroups.map((g) => g.fingerprint); - if (fingerprints.length === 0) return {} as ErrorHourlyOccurrences; - return presenter.getHourlyOccurrences( - project.organizationId, - project.id, - environment.id, - fingerprints - ); - }); - - return typeddefer({ - data: listPromise, - hourlyOccurrences: hourlyOccurrencesPromise, - defaultPeriod: "7d", - retentionLimitDays, - organizationSlug, - projectParam, - envParam, - }); -}; +import { Outlet } from "@remix-run/react"; +import { PageContainer } from "~/components/layout/AppLayout"; export default function Page() { - const { - data, - hourlyOccurrences, - defaultPeriod, - retentionLimitDays, - organizationSlug, - projectParam, - envParam, - } = useTypedLoaderData(); - return ( - <> - - - - - - - -
-
-
- - Loading errors… -
-
-
- } - > - - -
- - Unable to load errors. Please refresh the page or try again in a moment. - -
-
- } - > - {(result) => { - // Check if result contains an error - if ("error" in result) { - return ( -
- -
- - {result.error} - -
-
- ); - } - return ( -
- - -
- ); - }} - - - - + - - ); -} - -function FiltersBar({ - list, - defaultPeriod, - retentionLimitDays, -}: { - list?: ErrorsList; - defaultPeriod?: string; - retentionLimitDays: number; -}) { - const location = useOptimisticLocation(); - const searchParams = new URLSearchParams(location.search); - const hasFilters = - searchParams.has("tasks") || - searchParams.has("search") || - searchParams.has("period") || - searchParams.has("from") || - searchParams.has("to"); - - return ( -
-
- {list ? ( - <> - - - - {hasFilters && ( -
-
-
- ); -} - -function ErrorsList({ - errorGroups, - hourlyOccurrences, - organizationSlug, - projectParam, - envParam, -}: { - errorGroups: ErrorGroup[]; - hourlyOccurrences: Promise; - organizationSlug: string; - projectParam: string; - envParam: string; -}) { - if (errorGroups.length === 0) { - return ( -
-
- No errors found - - No errors have been recorded in the selected time period. - -
-
- ); - } - - return ( - - - - ID - Error - Occurrences - Past 24h - Task - First seen - Last seen - Go to page - - - - {errorGroups.map((errorGroup) => ( - - ))} - -
- ); -} - -function ErrorGroupRow({ - errorGroup, - hourlyOccurrences, - organizationSlug, - projectParam, - envParam, -}: { - errorGroup: ErrorGroup; - hourlyOccurrences: Promise; - organizationSlug: string; - projectParam: string; - envParam: string; -}) { - const errorPath = v3ErrorPath( - { slug: organizationSlug }, - { slug: projectParam }, - { slug: envParam }, - { fingerprint: errorGroup.fingerprint } - ); - - const errorMessage = `${errorGroup.errorMessage}`; - - return ( - - - {errorGroup.fingerprint.slice(-8)} - - {errorGroup.taskIdentifier} - - {errorMessage} - - {errorGroup.count.toLocaleString()} - - }> - }> - {(data) => { - const activity = data[errorGroup.fingerprint]; - return activity ? ( - - ) : ( - - ); - }} - - - - - {formatDistanceToNow(errorGroup.firstSeen, { addSuffix: true })} - - - {formatDistanceToNow(errorGroup.lastSeen, { addSuffix: true })} - - - - ); -} - -function ErrorActivityGraph({ activity }: { activity: ErrorHourlyActivity }) { - const maxCount = Math.max(...activity.map((d) => d.count)); - - return ( -
-
- - - - } - allowEscapeViewBox={{ x: true, y: true }} - wrapperStyle={{ zIndex: 1000 }} - animationDuration={0} - /> - - {maxCount > 0 && ( - - )} - - -
- - {formatNumberCompact(maxCount)} - -
- ); -} - -const ErrorActivityTooltip = ({ active, payload }: TooltipProps) => { - if (active && payload && payload.length > 0) { - const entry = payload[0].payload as { date: Date; count: number }; - const date = entry.date instanceof Date ? entry.date : new Date(entry.date); - const formattedDate = formatDateTime(date, "UTC", [], false, true); - - return ( - -
- {formattedDate} -
- {entry.count}{" "} - - {entry.count === 1 ? "occurrence" : "occurrences"} - -
-
-
- ); - } - - return null; -}; - -function ErrorActivityBlankState() { - return ( -
- {[...Array(24)].map((_, i) => ( -
- ))} -
+ ); } diff --git a/packages/core/src/v3/isomorphic/friendlyId.ts b/packages/core/src/v3/isomorphic/friendlyId.ts index 90fa31bd573..a230f8c7450 100644 --- a/packages/core/src/v3/isomorphic/friendlyId.ts +++ b/packages/core/src/v3/isomorphic/friendlyId.ts @@ -96,6 +96,7 @@ export const WaitpointId = new IdUtil("waitpoint"); export const BatchId = new IdUtil("batch"); export const BulkActionId = new IdUtil("bulk"); export const AttemptId = new IdUtil("attempt"); +export const ErrorId = new IdUtil("error"); export class IdGenerator { private alphabet: string; From cae22ee3695f077ac8b350d2a1cef84698e52a3d Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 3 Mar 2026 11:54:38 +0000 Subject: [PATCH 07/12] Improved error page layout --- .../app/components/navigation/SideMenu.tsx | 3 +- .../route.tsx | 166 ++++++++++-------- 2 files changed, 92 insertions(+), 77 deletions(-) diff --git a/apps/webapp/app/components/navigation/SideMenu.tsx b/apps/webapp/app/components/navigation/SideMenu.tsx index 96b082448d5..2c486576bee 100644 --- a/apps/webapp/app/components/navigation/SideMenu.tsx +++ b/apps/webapp/app/components/navigation/SideMenu.tsx @@ -114,6 +114,7 @@ import { SideMenuHeader } from "./SideMenuHeader"; import { SideMenuItem } from "./SideMenuItem"; import { SideMenuSection } from "./SideMenuSection"; import { type SideMenuSectionId } from "./sideMenuTypes"; +import { IconBugFilled } from "@tabler/icons-react"; /** Get the collapsed state for a specific side menu section from user preferences */ function getSectionCollapsed( @@ -478,7 +479,7 @@ export function SideMenu({ )} = ({ data }) => { return [ @@ -217,12 +208,12 @@ function ErrorGroupDetail({ } return ( -
+
{/* Error Summary */}
{errorGroup.errorMessage} -
+
ID @@ -243,6 +234,9 @@ function ErrorGroupDetail({ Occurrences {formatNumberCompact(errorGroup.count)} + + + First seen @@ -271,13 +265,10 @@ function ErrorGroupDetail({
{/* Activity over past 7 days by hour */} -
- Activity (past 7 days) +
+ Activity (past 7 days) }> - } - > + }> {(activity) => activity.length > 0 ? ( @@ -291,7 +282,7 @@ function ErrorGroupDetail({ {/* Runs Table */}
- Recent runs + Recent runs {runList ? ( d.count)); +const activityChartConfig: ChartConfig = { + count: { + label: "Occurrences", + color: "#EC003F", + }, +}; - return ( -
-
- - - - } - allowEscapeViewBox={{ x: true, y: true }} - wrapperStyle={{ zIndex: 1000 }} - animationDuration={0} - /> - - - {maxCount > 0 && ( - - )} - - -
- - {formatNumberCompact(maxCount)} - -
+function ActivityChart({ activity }: { activity: ErrorGroupHourlyActivity }) { + const data = useMemo( + () => + activity.map((d) => ({ + ...d, + __timestamp: d.date instanceof Date ? d.date.getTime() : new Date(d.date).getTime(), + })), + [activity] ); -} -const ActivityChartTooltip = ({ active, payload }: TooltipProps) => { - if (active && payload && payload.length > 0) { - const entry = payload[0].payload as { date: Date; count: number }; - const date = entry.date instanceof Date ? entry.date : new Date(entry.date); - const formattedDate = formatDateTime(date, "UTC", [], false, true); + const midnightTicks = useMemo(() => { + const ticks: number[] = []; + for (const d of data) { + const date = new Date(d.__timestamp); + if (date.getHours() === 0 && date.getMinutes() === 0) { + ticks.push(d.__timestamp); + } + } + return ticks; + }, [data]); - return ( - -
- {formattedDate} -
- {entry.count}{" "} - - {entry.count === 1 ? "occurrence" : "occurrences"} - -
-
-
- ); - } + const xAxisFormatter = useMemo(() => { + return (value: number) => { + const date = new Date(value); + return date.toLocaleDateString("en-US", { + month: "short", + day: "numeric", + hour: "2-digit", + minute: "2-digit", + hour12: false, + }); + }; + }, []); - return null; -}; + const tooltipLabelFormatter = useMemo(() => { + return (_label: string, payload: Array<{ payload?: Record }>) => { + const timestamp = payload[0]?.payload?.__timestamp as number | undefined; + if (timestamp) { + const date = new Date(timestamp); + return date.toLocaleString("en-US", { + month: "short", + day: "numeric", + year: "numeric", + hour: "2-digit", + minute: "2-digit", + hour12: false, + }); + } + return _label; + }; + }, []); + + return ( + + + + ); +} function ActivityChartBlankState() { return ( -
+
{[...Array(42)].map((_, i) => (
))} From f8c82fa10db6f123bcd0fb108f254479bc294b47 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 3 Mar 2026 12:46:49 +0000 Subject: [PATCH 08/12] Set the default time period to 1d --- .../route.tsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx index 560141f45ce..0c5135e12b4 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx @@ -102,7 +102,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { period, from, to, - defaultPeriod: "7d", + defaultPeriod: "1d", retentionLimitDays, }) .catch((error) => { @@ -127,7 +127,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { return typeddefer({ data: listPromise, hourlyOccurrences: hourlyOccurrencesPromise, - defaultPeriod: "7d", + defaultPeriod: "1d", retentionLimitDays, organizationSlug, projectParam, From 7c390a8503c16969aa4e42ffb14223a45f10c647 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 3 Mar 2026 12:47:02 +0000 Subject: [PATCH 09/12] Removed unused resource route --- ...ects.$projectParam.env.$envParam.errors.ts | 67 ------------------- 1 file changed, 67 deletions(-) delete mode 100644 apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts deleted file mode 100644 index fd36003e8f3..00000000000 --- 
a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts +++ /dev/null @@ -1,67 +0,0 @@ -import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; -import { json } from "@remix-run/node"; -import { requireUser } from "~/services/session.server"; -import { EnvironmentParamSchema } from "~/utils/pathBuilder"; -import { findProjectBySlug } from "~/models/project.server"; -import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; -import { ErrorsListPresenter, ErrorsListOptionsSchema } from "~/presenters/v3/ErrorsListPresenter.server"; -import { $replica } from "~/db.server"; -import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; -import { getCurrentPlan } from "~/services/platform.v3.server"; - -export const loader = async ({ request, params }: LoaderFunctionArgs) => { - const user = await requireUser(request); - const userId = user.id; - - const { projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); - - const project = await findProjectBySlug(organizationSlug, projectParam, userId); - if (!project) { - throw new Response("Project not found", { status: 404 }); - } - - const environment = await findEnvironmentBySlug(project.id, envParam, userId); - if (!environment) { - throw new Response("Environment not found", { status: 404 }); - } - - // Get the user's plan to determine retention limit - const plan = await getCurrentPlan(project.organizationId); - const retentionLimitDays = plan?.v3Subscription?.plan?.limits.logRetentionDays.number ?? 30; - - // Get filters from query params - const url = new URL(request.url); - const tasks = url.searchParams.getAll("tasks").filter((t) => t.length > 0); - const search = url.searchParams.get("search") ?? undefined; - const cursor = url.searchParams.get("cursor") ?? undefined; - const period = url.searchParams.get("period") ?? 
undefined; - const fromStr = url.searchParams.get("from"); - const toStr = url.searchParams.get("to"); - let from = fromStr ? parseInt(fromStr, 10) : undefined; - let to = toStr ? parseInt(toStr, 10) : undefined; - - if (Number.isNaN(from)) from = undefined; - if (Number.isNaN(to)) to = undefined; - - const options = ErrorsListOptionsSchema.parse({ - userId, - projectId: project.id, - tasks: tasks.length > 0 ? tasks : undefined, - search, - cursor, - period, - from, - to, - defaultPeriod: "7d", - retentionLimitDays, - }) as any; // Validated by ErrorsListOptionsSchema at runtime - - const presenter = new ErrorsListPresenter($replica, logsClickhouseClient); - const result = await presenter.call(project.organizationId, environment.id, options); - - return json({ - errorGroups: result.errorGroups, - pagination: result.pagination, - filters: result.filters, - }); -}; From 14e42de5b78409450a773a3acc3b8be362d006da Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 3 Mar 2026 14:40:02 +0000 Subject: [PATCH 10/12] Switch to using error_occurrences_v1 so we can do proper time filtering --- .../v3/ErrorGroupPresenter.server.ts | 78 ++++--- .../v3/ErrorsListPresenter.server.ts | 191 ++++++++++++------ .../route.tsx | 41 ++-- .../route.tsx | 49 ++--- ...ects.$projectParam.env.$envParam.errors.ts | 67 ++++++ .../023_create_error_occurrences_v1.sql | 88 ++++++++ internal-packages/clickhouse/src/errors.ts | 124 ++++++++++++ internal-packages/clickhouse/src/index.ts | 11 + 8 files changed, 522 insertions(+), 127 deletions(-) create mode 100644 apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts create mode 100644 internal-packages/clickhouse/schema/023_create_error_occurrences_v1.sql diff --git a/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts b/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts index d3e7412048c..207cba8f126 100644 --- a/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts +++ 
b/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts @@ -1,5 +1,11 @@ import { z } from "zod"; -import { type ClickHouse } from "@internal/clickhouse"; +import { + type ClickHouse, + type TimeGranularity, + detectTimeGranularity, + granularityToInterval, + granularityToStepMs, +} from "@internal/clickhouse"; import { type PrismaClientOrTransaction } from "@trigger.dev/database"; import { type Direction } from "~/components/ListPagination"; import { findDisplayableEnvironment } from "~/models/runtimeEnvironment.server"; @@ -78,7 +84,8 @@ export type ErrorGroupSummary = { lastSeen: Date; }; -export type ErrorGroupHourlyActivity = Array<{ date: Date; count: number }>; +export type ErrorGroupOccurrences = Awaited>; +export type ErrorGroupActivity = ErrorGroupOccurrences["data"]; export class ErrorGroupPresenter extends BasePresenter { constructor( @@ -155,42 +162,67 @@ export class ErrorGroupPresenter extends BasePresenter { }; } - public async getHourlyOccurrences( + /** + * Returns bucketed occurrence counts for a single fingerprint over a time range. + * Granularity is determined automatically from the range span. 
+ */ + public async getOccurrences( organizationId: string, projectId: string, environmentId: string, - fingerprint: string - ): Promise { - const hours = 168; // 7 days - - const [queryError, records] = await this.clickhouse.errors.getHourlyOccurrences({ - organizationId, - projectId, - environmentId, - fingerprints: [fingerprint], - hours, + fingerprint: string, + from: Date, + to: Date + ): Promise<{ + granularity: TimeGranularity; + data: Array<{ date: Date; count: number }>; + }> { + const granularity = detectTimeGranularity(from, to); + const intervalExpr = granularityToInterval(granularity); + const stepMs = granularityToStepMs(granularity); + + const queryBuilder = this.clickhouse.errors.createOccurrencesQueryBuilder(intervalExpr); + + queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); + queryBuilder.where("project_id = {projectId: String}", { projectId }); + queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); + queryBuilder.where("error_fingerprint = {fingerprint: String}", { fingerprint }); + queryBuilder.where("minute >= toStartOfMinute(fromUnixTimestamp64Milli({fromTimeMs: Int64}))", { + fromTimeMs: from.getTime(), }); + queryBuilder.where("minute <= toStartOfMinute(fromUnixTimestamp64Milli({toTimeMs: Int64}))", { + toTimeMs: to.getTime(), + }); + + queryBuilder.groupBy("error_fingerprint, bucket_epoch"); + queryBuilder.orderBy("bucket_epoch ASC"); + + const [queryError, records] = await queryBuilder.execute(); if (queryError) { throw queryError; } + // Build time buckets covering the full range const buckets: number[] = []; - const nowMs = Date.now(); - for (let i = hours - 1; i >= 0; i--) { - const hourStart = Math.floor((nowMs - i * 3_600_000) / 3_600_000) * 3_600; - buckets.push(hourStart); + const startEpoch = Math.floor(from.getTime() / stepMs) * (stepMs / 1000); + const endEpoch = Math.ceil(to.getTime() / 1000); + for (let epoch = startEpoch; epoch <= endEpoch; epoch += stepMs / 
1000) { + buckets.push(epoch); } - const byHour = new Map(); + const byBucket = new Map(); for (const row of records ?? []) { - byHour.set(row.hour_epoch, row.count); + byBucket.set(row.bucket_epoch, (byBucket.get(row.bucket_epoch) ?? 0) + row.count); } - return buckets.map((epoch) => ({ - date: new Date(epoch * 1000), - count: byHour.get(epoch) ?? 0, - })); + return { + granularity, + data: buckets.map((epoch) => ({ + date: new Date(epoch * 1000), + count: byBucket.get(epoch) ?? 0, + })), + }; } private async getSummary( diff --git a/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts b/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts index 459753822ce..05cf6c9b619 100644 --- a/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts @@ -1,5 +1,11 @@ import { z } from "zod"; -import { type ClickHouse } from "@internal/clickhouse"; +import { + type ClickHouse, + type TimeGranularity, + detectTimeGranularity, + granularityToInterval, + granularityToStepMs, +} from "@internal/clickhouse"; import { type PrismaClientOrTransaction } from "@trigger.dev/database"; import { type Direction } from "~/components/ListPagination"; import { timeFilterFromTo } from "~/components/runs/v3/SharedFilters"; @@ -46,12 +52,9 @@ const DEFAULT_PAGE_SIZE = 50; export type ErrorsList = Awaited>; export type ErrorGroup = ErrorsList["errorGroups"][0]; export type ErrorsListAppliedFilters = ErrorsList["filters"]; -export type ErrorHourlyOccurrences = Awaited< - ReturnType ->; -export type ErrorHourlyActivity = ErrorHourlyOccurrences[string]; +export type ErrorOccurrences = Awaited>; +export type ErrorOccurrenceActivity = ErrorOccurrences["data"][string]; -// Cursor for error groups pagination type ErrorGroupCursor = { occurrenceCount: number; fingerprint: string; @@ -85,7 +88,6 @@ function parseClickHouseDateTime(value: string): Date { if (!isNaN(asNum) && asNum > 1e12) { return new Date(asNum); } - // 
ClickHouse returns 'YYYY-MM-DD HH:mm:ss.SSS' in UTC return new Date(value.replace(" ", "T") + "Z"); } @@ -122,13 +124,12 @@ export class ErrorsListPresenter extends BasePresenter { period, from, to, - defaultPeriod: defaultPeriod ?? "7d", + defaultPeriod: defaultPeriod ?? "1d", }); let effectiveFrom = time.from; let effectiveTo = time.to; - // Apply retention limit if provided let wasClampedByRetention = false; if (retentionLimitDays !== undefined && effectiveFrom) { const retentionCutoffDate = new Date(Date.now() - retentionLimitDays * 24 * 60 * 60 * 1000); @@ -155,38 +156,32 @@ export class ErrorsListPresenter extends BasePresenter { throw new ServiceValidationError("No environment found"); } - // Calculate days parameter for ClickHouse query - const now = new Date(); - const daysAgo = effectiveFrom - ? Math.ceil((now.getTime() - effectiveFrom.getTime()) / (1000 * 60 * 60 * 24)) - : 30; + // Query the per-minute error_occurrences_v1 table for time-scoped counts + const queryBuilder = this.clickhouse.errors.occurrencesListQueryBuilder(); - // Query the pre-aggregated errors_v1 table - const queryBuilder = this.clickhouse.errors.listQueryBuilder(); - - // Apply base WHERE filters queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); queryBuilder.where("project_id = {projectId: String}", { projectId }); queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); - // Task filter (task_identifier is part of the key, so use WHERE) + // Precise time range filtering via WHERE on the minute column + queryBuilder.where("minute >= toStartOfMinute(fromUnixTimestamp64Milli({fromTimeMs: Int64}))", { + fromTimeMs: effectiveFrom.getTime(), + }); + queryBuilder.where("minute <= toStartOfMinute(fromUnixTimestamp64Milli({toTimeMs: Int64}))", { + toTimeMs: effectiveTo.getTime(), + }); + if (tasks && tasks.length > 0) { queryBuilder.where("task_identifier IN {tasks: Array(String)}", { tasks }); } - // Group by key columns to 
merge partial aggregations queryBuilder.groupBy("error_fingerprint, task_identifier"); - // Time range filter - queryBuilder.having("max(last_seen_date) >= now() - INTERVAL {days: Int64} DAY", { - days: daysAgo, - }); - - // Search filter - searches in error type and message + // Text search via HAVING (operates on aggregated values) if (search && search.trim() !== "") { const searchTerm = escapeClickHouseString(search.trim()).toLowerCase(); queryBuilder.having( - "(lower(any(error_type)) like {searchPattern: String} OR lower(any(error_message)) like {searchPattern: String})", + "(lower(error_type) like {searchPattern: String} OR lower(error_message) like {searchPattern: String})", { searchPattern: `%${searchTerm}%`, } @@ -218,7 +213,6 @@ export class ErrorsListPresenter extends BasePresenter { const hasMore = results.length > pageSize; const errorGroups = results.slice(0, pageSize); - // Build next cursor from the last item let nextCursor: string | undefined; if (hasMore && errorGroups.length > 0) { const lastError = errorGroups[errorGroups.length - 1]; @@ -228,18 +222,27 @@ export class ErrorsListPresenter extends BasePresenter { }); } - // Transform results - const transformedErrorGroups = errorGroups.map((error) => ({ - errorType: error.error_type, - errorMessage: error.error_message, - fingerprint: error.error_fingerprint, - taskIdentifier: error.task_identifier, - firstSeen: parseClickHouseDateTime(error.first_seen), - lastSeen: parseClickHouseDateTime(error.last_seen), - count: error.occurrence_count, - sampleRunId: error.sample_run_id, - sampleFriendlyId: error.sample_friendly_id, - })); + // Fetch global first_seen / last_seen from the errors_v1 summary table + const fingerprints = errorGroups.map((e) => e.error_fingerprint); + const globalSummaryMap = await this.getGlobalSummary( + organizationId, + projectId, + environmentId, + fingerprints + ); + + const transformedErrorGroups = errorGroups.map((error) => { + const global = 
globalSummaryMap.get(error.error_fingerprint); + return { + errorType: error.error_type, + errorMessage: error.error_message, + fingerprint: error.error_fingerprint, + taskIdentifier: error.task_identifier, + firstSeen: global?.firstSeen ?? new Date(), + lastSeen: global?.lastSeen ?? new Date(), + count: error.occurrence_count, + }; + }); return { errorGroups: transformedErrorGroups, @@ -251,6 +254,8 @@ export class ErrorsListPresenter extends BasePresenter { tasks, search, period: time, + from: effectiveFrom, + to: effectiveTo, hasFilters, possibleTasks, wasClampedByRetention, @@ -258,58 +263,112 @@ export class ErrorsListPresenter extends BasePresenter { }; } - public async getHourlyOccurrences( + /** + * Returns bucketed occurrence counts for the given fingerprints over a time range. + * Granularity is determined automatically from the range span. + */ + public async getOccurrences( organizationId: string, projectId: string, environmentId: string, - fingerprints: string[] - ): Promise>> { + fingerprints: string[], + from: Date, + to: Date + ): Promise<{ + granularity: TimeGranularity; + data: Record>; + }> { if (fingerprints.length === 0) { - return {}; + return { granularity: "hours", data: {} }; } - const hours = 24; + const granularity = detectTimeGranularity(from, to); + const intervalExpr = granularityToInterval(granularity); + const stepMs = granularityToStepMs(granularity); - const [queryError, records] = await this.clickhouse.errors.getHourlyOccurrences({ - organizationId, - projectId, - environmentId, - fingerprints, - hours, + const queryBuilder = this.clickhouse.errors.createOccurrencesQueryBuilder(intervalExpr); + + queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); + queryBuilder.where("project_id = {projectId: String}", { projectId }); + queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); + queryBuilder.where("error_fingerprint IN {fingerprints: Array(String)}", { fingerprints }); + 
queryBuilder.where("minute >= toStartOfMinute(fromUnixTimestamp64Milli({fromTimeMs: Int64}))", { + fromTimeMs: from.getTime(), }); + queryBuilder.where("minute <= toStartOfMinute(fromUnixTimestamp64Milli({toTimeMs: Int64}))", { + toTimeMs: to.getTime(), + }); + + queryBuilder.groupBy("error_fingerprint, bucket_epoch"); + queryBuilder.orderBy("error_fingerprint ASC, bucket_epoch ASC"); + + const [queryError, records] = await queryBuilder.execute(); if (queryError) { throw queryError; } - // Build 24 hourly buckets as epoch seconds (UTC, floored to hour) + // Build time buckets covering the full range const buckets: number[] = []; - const nowMs = Date.now(); - for (let i = hours - 1; i >= 0; i--) { - const hourStart = Math.floor((nowMs - i * 3_600_000) / 3_600_000) * 3_600; - buckets.push(hourStart); + const startEpoch = Math.floor(from.getTime() / stepMs) * (stepMs / 1000); + const endEpoch = Math.ceil(to.getTime() / 1000); + for (let epoch = startEpoch; epoch <= endEpoch; epoch += stepMs / 1000) { + buckets.push(epoch); } - // Index ClickHouse results by fingerprint → epoch → count + // Index results by fingerprint -> epoch -> count const grouped = new Map>(); for (const row of records ?? []) { - let byHour = grouped.get(row.error_fingerprint); - if (!byHour) { - byHour = new Map(); - grouped.set(row.error_fingerprint, byHour); + let byBucket = grouped.get(row.error_fingerprint); + if (!byBucket) { + byBucket = new Map(); + grouped.set(row.error_fingerprint, byBucket); } - byHour.set(row.hour_epoch, row.count); + byBucket.set(row.bucket_epoch, (byBucket.get(row.bucket_epoch) ?? 0) + row.count); } - const result: Record> = {}; + const data: Record> = {}; for (const fp of fingerprints) { - const byHour = grouped.get(fp); - result[fp] = buckets.map((epoch) => ({ + const byBucket = grouped.get(fp); + data[fp] = buckets.map((epoch) => ({ date: new Date(epoch * 1000), - count: byHour?.get(epoch) ?? 0, + count: byBucket?.get(epoch) ?? 
0, })); } + return { granularity, data }; + } + + /** + * Fetches global first_seen / last_seen for a set of fingerprints from errors_v1. + */ + private async getGlobalSummary( + organizationId: string, + projectId: string, + environmentId: string, + fingerprints: string[] + ): Promise> { + const result = new Map(); + if (fingerprints.length === 0) return result; + + const queryBuilder = this.clickhouse.errors.listQueryBuilder(); + queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); + queryBuilder.where("project_id = {projectId: String}", { projectId }); + queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); + queryBuilder.where("error_fingerprint IN {fingerprints: Array(String)}", { fingerprints }); + queryBuilder.groupBy("error_fingerprint, task_identifier"); + + const [queryError, records] = await queryBuilder.execute(); + + if (queryError || !records) return result; + + for (const record of records) { + result.set(record.error_fingerprint, { + firstSeen: parseClickHouseDateTime(record.first_seen), + lastSeen: parseClickHouseDateTime(record.last_seen), + }); + } + return result; } } diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx index 94f3af88cdb..7a710c2e68c 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx @@ -8,7 +8,8 @@ import { findProjectBySlug } from "~/models/project.server"; import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { ErrorGroupPresenter, - type ErrorGroupHourlyActivity, + type ErrorGroupActivity, + type ErrorGroupOccurrences, type 
ErrorGroupSummary, } from "~/presenters/v3/ErrorGroupPresenter.server"; import { @@ -94,13 +95,23 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw error; }); - const hourlyActivityPromise = presenter - .getHourlyOccurrences(project.organizationId, project.id, environment.id, fingerprint) - .catch(() => [] as ErrorGroupHourlyActivity); + const now = new Date(); + const sevenDaysAgo = new Date(now.getTime() - 7 * 24 * 60 * 60 * 1000); + + const activityPromise = presenter + .getOccurrences( + project.organizationId, + project.id, + environment.id, + fingerprint, + sevenDaysAgo, + now + ) + .catch(() => ({ granularity: "hours" as const, data: [] as ErrorGroupActivity })); return typeddefer({ data: detailPromise, - hourlyActivity: hourlyActivityPromise, + activity: activityPromise, organizationSlug, projectParam, envParam, @@ -109,7 +120,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }; export default function Page() { - const { data, hourlyActivity, organizationSlug, projectParam, envParam, fingerprint } = + const { data, activity, organizationSlug, projectParam, envParam, fingerprint } = useTypedLoaderData(); const errorsPath = v3ErrorsPath( @@ -165,7 +176,7 @@ export default function Page() { ; + activity: Promise; organizationSlug: string; projectParam: string; envParam: string; @@ -264,14 +275,14 @@ function ErrorGroupDetail({ )}
- {/* Activity over past 7 days by hour */} + {/* Activity over past 7 days */}
Activity (past 7 days) }> - }> - {(activity) => - activity.length > 0 ? ( - + }> + {(result) => + result.data.length > 0 ? ( + ) : ( ) @@ -316,7 +327,7 @@ const activityChartConfig: ChartConfig = { }, }; -function ActivityChart({ activity }: { activity: ErrorGroupHourlyActivity }) { +function ActivityChart({ activity }: { activity: ErrorGroupActivity }) { const data = useMemo( () => activity.map((d) => ({ diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx index 0c5135e12b4..011348a56b9 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx @@ -42,8 +42,8 @@ import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; import { ErrorsListPresenter, type ErrorGroup, - type ErrorHourlyActivity, - type ErrorHourlyOccurrences, + type ErrorOccurrenceActivity, + type ErrorOccurrences, type ErrorsList, } from "~/presenters/v3/ErrorsListPresenter.server"; import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; @@ -77,7 +77,6 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw new Response("Environment not found", { status: 404 }); } - // Get filters from query params const url = new URL(request.url); const tasks = url.searchParams.getAll("tasks").filter((t) => t.length > 0); const search = url.searchParams.get("search") ?? undefined; @@ -87,7 +86,6 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { const from = fromStr ? parseInt(fromStr, 10) : undefined; const to = toStr ? 
parseInt(toStr, 10) : undefined; - // Get the user's plan to determine retention limit const plan = await getCurrentPlan(project.organizationId); const retentionLimitDays = plan?.v3Subscription?.plan?.limits.logRetentionDays.number ?? 30; @@ -112,21 +110,23 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw error; }); - const hourlyOccurrencesPromise = listPromise.then((result) => { - if ("error" in result) return {} as ErrorHourlyOccurrences; + const occurrencesPromise = listPromise.then((result) => { + if ("error" in result) return { granularity: "hours" as const, data: {} }; const fingerprints = result.errorGroups.map((g) => g.fingerprint); - if (fingerprints.length === 0) return {} as ErrorHourlyOccurrences; - return presenter.getHourlyOccurrences( + if (fingerprints.length === 0) return { granularity: "hours" as const, data: {} }; + return presenter.getOccurrences( project.organizationId, project.id, environment.id, - fingerprints + fingerprints, + result.filters.from, + result.filters.to ); }); return typeddefer({ data: listPromise, - hourlyOccurrences: hourlyOccurrencesPromise, + occurrences: occurrencesPromise, defaultPeriod: "1d", retentionLimitDays, organizationSlug, @@ -138,7 +138,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { export default function Page() { const { data, - hourlyOccurrences, + occurrences, defaultPeriod, retentionLimitDays, organizationSlug, @@ -180,7 +180,6 @@ export default function Page() { } > {(result) => { - // Check if result contains an error if ("error" in result) { return (
@@ -205,7 +204,7 @@ export default function Page() { /> ; + occurrences: Promise; organizationSlug: string; projectParam: string; envParam: string; @@ -311,7 +310,7 @@ function ErrorsList({ Task Error Occurrences - Past 24h + Activity First seen Last seen @@ -321,7 +320,7 @@ function ErrorsList({ ; + occurrences: Promise; organizationSlug: string; projectParam: string; envParam: string; @@ -366,9 +365,9 @@ function ErrorGroupRow({ {errorGroup.count.toLocaleString()} }> - }> - {(data) => { - const activity = data[errorGroup.fingerprint]; + }> + {(result) => { + const activity = result.data[errorGroup.fingerprint]; return activity ? ( ) : ( @@ -388,7 +387,11 @@ function ErrorGroupRow({ ); } -function ErrorActivityGraph({ activity }: { activity: ErrorHourlyActivity }) { +function ErrorActivityGraph({ + activity, +}: { + activity: ErrorOccurrenceActivity; +}) { const maxCount = Math.max(...activity.map((d) => d.count)); return ( diff --git a/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts new file mode 100644 index 00000000000..fd36003e8f3 --- /dev/null +++ b/apps/webapp/app/routes/resources.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.ts @@ -0,0 +1,67 @@ +import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; +import { json } from "@remix-run/node"; +import { requireUser } from "~/services/session.server"; +import { EnvironmentParamSchema } from "~/utils/pathBuilder"; +import { findProjectBySlug } from "~/models/project.server"; +import { findEnvironmentBySlug } from "~/models/runtimeEnvironment.server"; +import { ErrorsListPresenter, ErrorsListOptionsSchema } from "~/presenters/v3/ErrorsListPresenter.server"; +import { $replica } from "~/db.server"; +import { logsClickhouseClient } from "~/services/clickhouseInstance.server"; +import { getCurrentPlan } from 
"~/services/platform.v3.server"; + +export const loader = async ({ request, params }: LoaderFunctionArgs) => { + const user = await requireUser(request); + const userId = user.id; + + const { projectParam, organizationSlug, envParam } = EnvironmentParamSchema.parse(params); + + const project = await findProjectBySlug(organizationSlug, projectParam, userId); + if (!project) { + throw new Response("Project not found", { status: 404 }); + } + + const environment = await findEnvironmentBySlug(project.id, envParam, userId); + if (!environment) { + throw new Response("Environment not found", { status: 404 }); + } + + // Get the user's plan to determine retention limit + const plan = await getCurrentPlan(project.organizationId); + const retentionLimitDays = plan?.v3Subscription?.plan?.limits.logRetentionDays.number ?? 30; + + // Get filters from query params + const url = new URL(request.url); + const tasks = url.searchParams.getAll("tasks").filter((t) => t.length > 0); + const search = url.searchParams.get("search") ?? undefined; + const cursor = url.searchParams.get("cursor") ?? undefined; + const period = url.searchParams.get("period") ?? undefined; + const fromStr = url.searchParams.get("from"); + const toStr = url.searchParams.get("to"); + let from = fromStr ? parseInt(fromStr, 10) : undefined; + let to = toStr ? parseInt(toStr, 10) : undefined; + + if (Number.isNaN(from)) from = undefined; + if (Number.isNaN(to)) to = undefined; + + const options = ErrorsListOptionsSchema.parse({ + userId, + projectId: project.id, + tasks: tasks.length > 0 ? 
tasks : undefined, + search, + cursor, + period, + from, + to, + defaultPeriod: "7d", + retentionLimitDays, + }) as any; // Validated by ErrorsListOptionsSchema at runtime + + const presenter = new ErrorsListPresenter($replica, logsClickhouseClient); + const result = await presenter.call(project.organizationId, environment.id, options); + + return json({ + errorGroups: result.errorGroups, + pagination: result.pagination, + filters: result.filters, + }); +}; diff --git a/internal-packages/clickhouse/schema/023_create_error_occurrences_v1.sql b/internal-packages/clickhouse/schema/023_create_error_occurrences_v1.sql new file mode 100644 index 00000000000..34b268ea5d9 --- /dev/null +++ b/internal-packages/clickhouse/schema/023_create_error_occurrences_v1.sql @@ -0,0 +1,88 @@ +-- +goose Up +-- Per-minute error occurrence counts, keyed by fingerprint + task + version. +-- Powers precise time-range filtering and dynamic-granularity occurrence charts. +CREATE TABLE + trigger_dev.error_occurrences_v1 ( + organization_id String, + project_id String, + environment_id String, + task_identifier String, + error_fingerprint String, + task_version String, + minute DateTime, + error_type String, + error_message String, + stack_trace String, + count UInt64, + INDEX idx_error_type_search lower(error_type) TYPE ngrambf_v1 (3, 32768, 2, 0) GRANULARITY 1, + INDEX idx_error_message_search lower(error_message) TYPE ngrambf_v1 (3, 32768, 2, 0) GRANULARITY 1 + ) ENGINE = SummingMergeTree (count) +PARTITION BY + toDate (minute) +ORDER BY + ( + organization_id, + project_id, + environment_id, + task_identifier, + error_fingerprint, + task_version, + minute + ) TTL minute + INTERVAL 90 DAY SETTINGS index_granularity = 8192; + +CREATE MATERIALIZED VIEW trigger_dev.mv_error_occurrences_v1 TO trigger_dev.error_occurrences_v1 AS +SELECT + organization_id, + project_id, + environment_id, + task_identifier, + error_fingerprint, + task_version, + toStartOfMinute (created_at) as minute, + any ( + 
coalesce( + nullIf(toString (error.data.type), ''), + nullIf(toString (error.data.name), ''), + 'Error' + ) + ) as error_type, + any ( + coalesce( + nullIf( + substring(toString (error.data.message), 1, 500), + '' + ), + 'Unknown error' + ) + ) as error_message, + any ( + coalesce( + substring(toString (error.data.stack), 1, 2000), + '' + ) + ) as stack_trace, + count() as count +FROM + trigger_dev.task_runs_v2 +WHERE + error_fingerprint != '' + AND status IN ( + 'SYSTEM_FAILURE', + 'CRASHED', + 'INTERRUPTED', + 'COMPLETED_WITH_ERRORS' + ) + AND _is_deleted = 0 +GROUP BY + organization_id, + project_id, + environment_id, + task_identifier, + error_fingerprint, + task_version, + minute; + +-- +goose Down +DROP VIEW IF EXISTS trigger_dev.mv_error_occurrences_v1; + +DROP TABLE IF EXISTS trigger_dev.error_occurrences_v1; \ No newline at end of file diff --git a/internal-packages/clickhouse/src/errors.ts b/internal-packages/clickhouse/src/errors.ts index 040561ebd5d..d15d07392aa 100644 --- a/internal-packages/clickhouse/src/errors.ts +++ b/internal-packages/clickhouse/src/errors.ts @@ -1,6 +1,7 @@ import { ClickHouseSettings } from "@clickhouse/client"; import { z } from "zod"; import { ClickhouseReader } from "./client/types.js"; +import { ClickhouseQueryBuilder } from "./client/queryBuilder.js"; export const ErrorGroupsListQueryResult = z.object({ error_fingerprint: z.string(), @@ -252,3 +253,126 @@ export function getErrorInstances(ch: ClickhouseReader, settings?: ClickHouseSet settings, }); } + +// --------------------------------------------------------------------------- +// error_occurrences_v1 – per-minute bucketed error counts +// --------------------------------------------------------------------------- + +export const ErrorOccurrencesListQueryResult = z.object({ + error_fingerprint: z.string(), + task_identifier: z.string(), + error_type: z.string(), + error_message: z.string(), + occurrence_count: z.number(), +}); + +export type 
ErrorOccurrencesListQueryResult = z.infer; + +/** + * Query builder for listing error groups from the per-minute error_occurrences_v1 table. + * Time filtering is done via WHERE on the `minute` column, giving precise time-scoped counts. + */ +export function getErrorOccurrencesListQueryBuilder( + ch: ClickhouseReader, + settings?: ClickHouseSettings +) { + return ch.queryBuilder({ + name: "getErrorOccurrencesList", + baseQuery: ` + SELECT + error_fingerprint, + task_identifier, + any(error_type) as error_type, + any(error_message) as error_message, + sum(count) as occurrence_count + FROM trigger_dev.error_occurrences_v1 + `, + schema: ErrorOccurrencesListQueryResult, + settings, + }); +} + +export const ErrorOccurrencesBucketQueryResult = z.object({ + error_fingerprint: z.string(), + bucket_epoch: z.number(), + count: z.number(), +}); + +export type ErrorOccurrencesBucketQueryResult = z.infer; + +/** + * Creates a query builder for bucketed error occurrence counts. + * The `intervalExpr` is a ClickHouse INTERVAL literal (e.g. "INTERVAL 1 HOUR"). + * Returns a builder directly since the base query varies with each granularity. 
+ */ +export function createErrorOccurrencesQueryBuilder( + ch: ClickhouseReader, + intervalExpr: string, + settings?: ClickHouseSettings +): ClickhouseQueryBuilder { + return new ClickhouseQueryBuilder( + "getErrorOccurrencesBucketed", + ` + SELECT + error_fingerprint, + toUnixTimestamp(toStartOfInterval(minute, ${intervalExpr})) as bucket_epoch, + sum(count) as count + FROM trigger_dev.error_occurrences_v1 + `, + ch, + ErrorOccurrencesBucketQueryResult, + settings + ); +} + +// --------------------------------------------------------------------------- +// Time granularity helpers +// --------------------------------------------------------------------------- + +export type TimeGranularity = "minutes" | "hours" | "days" | "weeks" | "months"; + +const MINUTE_MS = 60 * 1000; +const HOUR_MS = 60 * MINUTE_MS; +const DAY_MS = 24 * HOUR_MS; +const WEEK_MS = 7 * DAY_MS; +const MONTH_MS = 30 * DAY_MS; + +/** + * Determines the appropriate time granularity for bucketing based on the + * span of a time range, following the same thresholds as the Query chart UI. + */ +export function detectTimeGranularity(from: Date, to: Date): TimeGranularity { + const rangeMs = to.getTime() - from.getTime(); + + if (rangeMs <= 2 * HOUR_MS) return "minutes"; + if (rangeMs <= 2 * DAY_MS) return "hours"; + if (rangeMs <= 2 * WEEK_MS) return "days"; + if (rangeMs <= 3 * MONTH_MS) return "weeks"; + return "months"; +} + +const GRANULARITY_TO_INTERVAL: Record = { + minutes: "INTERVAL 1 MINUTE", + hours: "INTERVAL 1 HOUR", + days: "INTERVAL 1 DAY", + weeks: "INTERVAL 1 WEEK", + months: "INTERVAL 1 MONTH", +}; + +const GRANULARITY_TO_STEP_MS: Record = { + minutes: MINUTE_MS, + hours: HOUR_MS, + days: DAY_MS, + weeks: WEEK_MS, + months: MONTH_MS, +}; + +/** Returns a ClickHouse INTERVAL expression for use with toStartOfInterval(). 
*/ +export function granularityToInterval(granularity: TimeGranularity): string { + return GRANULARITY_TO_INTERVAL[granularity]; +} + +/** Returns the step size in milliseconds for a granularity (approximate for months). */ +export function granularityToStepMs(granularity: TimeGranularity): number { + return GRANULARITY_TO_STEP_MS[granularity]; +} diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index 58ee7dca17a..4d0ded6e7cf 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -33,6 +33,14 @@ import { getErrorGroupsListQueryBuilder, getErrorInstancesListQueryBuilder, getErrorHourlyOccurrences, + getErrorOccurrencesListQueryBuilder, + createErrorOccurrencesQueryBuilder, +} from "./errors.js"; +export { + detectTimeGranularity, + granularityToInterval, + granularityToStepMs, + type TimeGranularity, } from "./errors.js"; import { Logger, type LogLevel } from "@trigger.dev/core/logger"; import type { Agent as HttpAgent } from "http"; @@ -245,6 +253,9 @@ export class ClickHouse { getHourlyOccurrences: getErrorHourlyOccurrences(this.reader), listQueryBuilder: getErrorGroupsListQueryBuilder(this.reader), instancesQueryBuilder: getErrorInstancesListQueryBuilder(this.reader), + occurrencesListQueryBuilder: getErrorOccurrencesListQueryBuilder(this.reader), + createOccurrencesQueryBuilder: (intervalExpr: string) => + createErrorOccurrencesQueryBuilder(this.reader, intervalExpr), }; } } From 9b463db4806eb3be72ec03d93abe0fed1ea348e5 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 3 Mar 2026 16:33:22 +0000 Subject: [PATCH 11/12] Better time bucketing and nicer presenter logic --- .../v3/ErrorGroupPresenter.server.ts | 218 +++++------------- .../v3/ErrorsListPresenter.server.ts | 31 +-- .../v3/NextRunListPresenter.server.ts | 3 + .../route.tsx | 27 +-- .../route.tsx | 4 +- .../clickhouseRunsRepository.server.ts | 6 + .../runsRepository/runsRepository.server.ts | 1 + 
apps/webapp/app/utils/timeGranularity.ts | 36 +++ apps/webapp/test/timeGranularity.test.ts | 54 +++++ internal-packages/clickhouse/src/errors.ts | 91 -------- internal-packages/clickhouse/src/index.ts | 9 +- internal-packages/clickhouse/src/intervals.ts | 5 + 12 files changed, 186 insertions(+), 299 deletions(-) create mode 100644 apps/webapp/app/utils/timeGranularity.ts create mode 100644 apps/webapp/test/timeGranularity.test.ts create mode 100644 internal-packages/clickhouse/src/intervals.ts diff --git a/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts b/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts index 207cba8f126..ae8e199fe9e 100644 --- a/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts @@ -1,69 +1,41 @@ import { z } from "zod"; -import { - type ClickHouse, - type TimeGranularity, - detectTimeGranularity, - granularityToInterval, - granularityToStepMs, -} from "@internal/clickhouse"; +import { type ClickHouse, msToClickHouseInterval } from "@internal/clickhouse"; +import { TimeGranularity } from "~/utils/timeGranularity"; + +const errorGroupGranularity = new TimeGranularity([ + { max: "1h", granularity: "1m" }, + { max: "1d", granularity: "30m" }, + { max: "1w", granularity: "8h" }, + { max: "31d", granularity: "1d" }, + { max: "45d", granularity: "1w" }, + { max: "Infinity", granularity: "30d" }, +]); import { type PrismaClientOrTransaction } from "@trigger.dev/database"; -import { type Direction } from "~/components/ListPagination"; import { findDisplayableEnvironment } from "~/models/runtimeEnvironment.server"; import { ServiceValidationError } from "~/v3/services/baseService.server"; import { BasePresenter } from "~/presenters/v3/basePresenter.server"; +import { + NextRunListPresenter, + type NextRunList, +} from "~/presenters/v3/NextRunListPresenter.server"; export type ErrorGroupOptions = { userId?: string; projectId: string; fingerprint: string; - // 
pagination - direction?: Direction; - cursor?: string; - pageSize?: number; + runsPageSize?: number; }; export const ErrorGroupOptionsSchema = z.object({ userId: z.string().optional(), projectId: z.string(), fingerprint: z.string(), - direction: z.enum(["forward", "backward"]).optional(), - cursor: z.string().optional(), - pageSize: z.number().int().positive().max(1000).optional(), + runsPageSize: z.number().int().positive().max(1000).optional(), }); -const DEFAULT_PAGE_SIZE = 50; +const DEFAULT_RUNS_PAGE_SIZE = 25; export type ErrorGroupDetail = Awaited>; -export type ErrorInstance = ErrorGroupDetail["instances"][0]; - -// Cursor for error instances pagination -type ErrorInstanceCursor = { - createdAt: string; - runId: string; -}; - -const ErrorInstanceCursorSchema = z.object({ - createdAt: z.string(), - runId: z.string(), -}); - -function encodeCursor(cursor: ErrorInstanceCursor): string { - return Buffer.from(JSON.stringify(cursor)).toString("base64"); -} - -function decodeCursor(cursor: string): ErrorInstanceCursor | null { - try { - const decoded = Buffer.from(cursor, "base64").toString("utf-8"); - const parsed = JSON.parse(decoded); - const validated = ErrorInstanceCursorSchema.safeParse(parsed); - if (!validated.success) { - return null; - } - return validated.data as ErrorInstanceCursor; - } catch { - return null; - } -} function parseClickHouseDateTime(value: string): Date { const asNum = Number(value); @@ -77,7 +49,6 @@ export type ErrorGroupSummary = { fingerprint: string; errorType: string; errorMessage: string; - stackTrace?: string; taskIdentifier: string; count: number; firstSeen: Date; @@ -90,6 +61,7 @@ export type ErrorGroupActivity = ErrorGroupOccurrences["data"]; export class ErrorGroupPresenter extends BasePresenter { constructor( private readonly replica: PrismaClientOrTransaction, + private readonly logsClickhouse: ClickHouse, private readonly clickhouse: ClickHouse ) { super(undefined, replica); @@ -98,7 +70,12 @@ export class 
ErrorGroupPresenter extends BasePresenter { public async call( organizationId: string, environmentId: string, - { userId, projectId, fingerprint, cursor, pageSize = DEFAULT_PAGE_SIZE }: ErrorGroupOptions + { + userId, + projectId, + fingerprint, + runsPageSize = DEFAULT_RUNS_PAGE_SIZE, + }: ErrorGroupOptions ) { const displayableEnvironment = await findDisplayableEnvironment(environmentId, userId); @@ -106,59 +83,19 @@ export class ErrorGroupPresenter extends BasePresenter { throw new ServiceValidationError("No environment found"); } - // Run summary (aggregated) and instances queries in parallel - const [summary, instancesResult] = await Promise.all([ + const [summary, runList] = await Promise.all([ this.getSummary(organizationId, projectId, environmentId, fingerprint), - this.getInstances(organizationId, projectId, environmentId, fingerprint, cursor, pageSize), + this.getRunList(organizationId, environmentId, { + userId, + projectId, + fingerprint, + pageSize: runsPageSize, + }), ]); - // Get stack trace from the most recent instance - let stackTrace: string | undefined; - if (instancesResult.instances.length > 0) { - const firstInstance = instancesResult.instances[0]; - try { - const errorData = JSON.parse(firstInstance.error_text) as Record; - stackTrace = (errorData.stack || errorData.stacktrace) as string | undefined; - } catch { - // no stack trace available - } - } - - // Build error group combining aggregated summary with instance stack trace - let errorGroup: ErrorGroupSummary | undefined; - if (summary) { - errorGroup = { - ...summary, - stackTrace, - }; - } - - // Transform instances - const transformedInstances = instancesResult.instances.map((instance) => { - let parsedError: any; - try { - parsedError = JSON.parse(instance.error_text); - } catch { - parsedError = { message: instance.error_text }; - } - - return { - runId: instance.run_id, - friendlyId: instance.friendly_id, - taskIdentifier: instance.task_identifier, - createdAt: new 
Date(parseInt(instance.created_at) * 1000), - status: instance.status, - error: parsedError, - traceId: instance.trace_id, - taskVersion: instance.task_version, - }; - }); - return { - errorGroup, - instances: transformedInstances, - runFriendlyIds: transformedInstances.map((i) => i.friendlyId), - pagination: instancesResult.pagination, + errorGroup: summary, + runList, }; } @@ -174,14 +111,12 @@ export class ErrorGroupPresenter extends BasePresenter { from: Date, to: Date ): Promise<{ - granularity: TimeGranularity; data: Array<{ date: Date; count: number }>; }> { - const granularity = detectTimeGranularity(from, to); - const intervalExpr = granularityToInterval(granularity); - const stepMs = granularityToStepMs(granularity); + const granularityMs = errorGroupGranularity.getTimeGranularityMs(from, to); + const intervalExpr = msToClickHouseInterval(granularityMs); - const queryBuilder = this.clickhouse.errors.createOccurrencesQueryBuilder(intervalExpr); + const queryBuilder = this.logsClickhouse.errors.createOccurrencesQueryBuilder(intervalExpr); queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); queryBuilder.where("project_id = {projectId: String}", { projectId }); @@ -205,9 +140,9 @@ export class ErrorGroupPresenter extends BasePresenter { // Build time buckets covering the full range const buckets: number[] = []; - const startEpoch = Math.floor(from.getTime() / stepMs) * (stepMs / 1000); + const startEpoch = Math.floor(from.getTime() / granularityMs) * (granularityMs / 1000); const endEpoch = Math.ceil(to.getTime() / 1000); - for (let epoch = startEpoch; epoch <= endEpoch; epoch += stepMs / 1000) { + for (let epoch = startEpoch; epoch <= endEpoch; epoch += granularityMs / 1000) { buckets.push(epoch); } @@ -217,7 +152,6 @@ export class ErrorGroupPresenter extends BasePresenter { } return { - granularity, data: buckets.map((epoch) => ({ date: new Date(epoch * 1000), count: byBucket.get(epoch) ?? 
0, @@ -230,8 +164,8 @@ export class ErrorGroupPresenter extends BasePresenter { projectId: string, environmentId: string, fingerprint: string - ): Promise | undefined> { - const queryBuilder = this.clickhouse.errors.listQueryBuilder(); + ): Promise { + const queryBuilder = this.logsClickhouse.errors.listQueryBuilder(); queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); queryBuilder.where("project_id = {projectId: String}", { projectId }); @@ -263,63 +197,29 @@ export class ErrorGroupPresenter extends BasePresenter { }; } - private async getInstances( + private async getRunList( organizationId: string, - projectId: string, environmentId: string, - fingerprint: string, - cursor: string | undefined, - pageSize: number - ) { - const queryBuilder = this.clickhouse.errors.instancesQueryBuilder(); - - queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); - queryBuilder.where("project_id = {projectId: String}", { projectId }); - queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); - queryBuilder.where("error_fingerprint = {errorFingerprint: String}", { - errorFingerprint: fingerprint, - }); - queryBuilder.where("_is_deleted = 0"); - - const decodedCursor = cursor ? 
decodeCursor(cursor) : null; - if (decodedCursor) { - queryBuilder.where( - `(created_at < {cursorCreatedAt: String} OR (created_at = {cursorCreatedAt: String} AND run_id < {cursorRunId: String}))`, - { - cursorCreatedAt: decodedCursor.createdAt, - cursorRunId: decodedCursor.runId, - } - ); - } - - queryBuilder.orderBy("created_at DESC, run_id DESC"); - queryBuilder.limit(pageSize + 1); - - const [queryError, records] = await queryBuilder.execute(); - - if (queryError) { - throw queryError; + options: { + userId?: string; + projectId: string; + fingerprint: string; + pageSize: number; } + ): Promise { + const runListPresenter = new NextRunListPresenter(this.replica, this.clickhouse); + + const result = await runListPresenter.call(organizationId, environmentId, { + userId: options.userId, + projectId: options.projectId, + errorFingerprint: options.fingerprint, + pageSize: options.pageSize, + }); - const results = records || []; - const hasMore = results.length > pageSize; - const instances = results.slice(0, pageSize); - - let nextCursor: string | undefined; - if (hasMore && instances.length > 0) { - const lastInstance = instances[instances.length - 1]; - nextCursor = encodeCursor({ - createdAt: lastInstance.created_at, - runId: lastInstance.run_id, - }); + if (result.runs.length === 0) { + return undefined; } - return { - instances, - pagination: { - hasMore, - nextCursor, - }, - }; + return result; } } diff --git a/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts b/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts index 05cf6c9b619..ca5ef771f45 100644 --- a/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ErrorsListPresenter.server.ts @@ -1,11 +1,14 @@ import { z } from "zod"; -import { - type ClickHouse, - type TimeGranularity, - detectTimeGranularity, - granularityToInterval, - granularityToStepMs, -} from "@internal/clickhouse"; +import { type ClickHouse, msToClickHouseInterval } from 
"@internal/clickhouse"; +import { TimeGranularity } from "~/utils/timeGranularity"; + +const errorsListGranularity = new TimeGranularity([ + { max: "2h", granularity: "1m" }, + { max: "2d", granularity: "1h" }, + { max: "2w", granularity: "1d" }, + { max: "3 months", granularity: "1w" }, + { max: "Infinity", granularity: "30d" }, +]); import { type PrismaClientOrTransaction } from "@trigger.dev/database"; import { type Direction } from "~/components/ListPagination"; import { timeFilterFromTo } from "~/components/runs/v3/SharedFilters"; @@ -275,16 +278,14 @@ export class ErrorsListPresenter extends BasePresenter { from: Date, to: Date ): Promise<{ - granularity: TimeGranularity; data: Record>; }> { if (fingerprints.length === 0) { - return { granularity: "hours", data: {} }; + return { data: {} }; } - const granularity = detectTimeGranularity(from, to); - const intervalExpr = granularityToInterval(granularity); - const stepMs = granularityToStepMs(granularity); + const granularityMs = errorsListGranularity.getTimeGranularityMs(from, to); + const intervalExpr = msToClickHouseInterval(granularityMs); const queryBuilder = this.clickhouse.errors.createOccurrencesQueryBuilder(intervalExpr); @@ -310,9 +311,9 @@ export class ErrorsListPresenter extends BasePresenter { // Build time buckets covering the full range const buckets: number[] = []; - const startEpoch = Math.floor(from.getTime() / stepMs) * (stepMs / 1000); + const startEpoch = Math.floor(from.getTime() / granularityMs) * (granularityMs / 1000); const endEpoch = Math.ceil(to.getTime() / 1000); - for (let epoch = startEpoch; epoch <= endEpoch; epoch += stepMs / 1000) { + for (let epoch = startEpoch; epoch <= endEpoch; epoch += granularityMs / 1000) { buckets.push(epoch); } @@ -336,7 +337,7 @@ export class ErrorsListPresenter extends BasePresenter { })); } - return { granularity, data }; + return { data }; } /** diff --git a/apps/webapp/app/presenters/v3/NextRunListPresenter.server.ts 
b/apps/webapp/app/presenters/v3/NextRunListPresenter.server.ts index 2375ea161a9..766a28dcbd6 100644 --- a/apps/webapp/app/presenters/v3/NextRunListPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/NextRunListPresenter.server.ts @@ -33,6 +33,7 @@ export type RunListOptions = { runId?: string[]; queues?: string[]; machines?: MachinePresetName[]; + errorFingerprint?: string; //pagination direction?: Direction; cursor?: string; @@ -70,6 +71,7 @@ export class NextRunListPresenter { runId, queues, machines, + errorFingerprint, from, to, direction = "forward", @@ -182,6 +184,7 @@ export class NextRunListPresenter { bulkId, queues, machines, + errorFingerprint, page: { size: pageSize, cursor, diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx index 7a710c2e68c..46015c0b870 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx @@ -12,10 +12,7 @@ import { type ErrorGroupOccurrences, type ErrorGroupSummary, } from "~/presenters/v3/ErrorGroupPresenter.server"; -import { - NextRunListPresenter, - type NextRunList, -} from "~/presenters/v3/NextRunListPresenter.server"; +import { type NextRunList } from "~/presenters/v3/NextRunListPresenter.server"; import { $replica } from "~/db.server"; import { logsClickhouseClient, clickhouseClient } from "~/services/clickhouseInstance.server"; import { NavBar, PageTitle } from "~/components/primitives/PageHeader"; @@ -62,7 +59,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw new Response("Environment not found", { status: 404 }); } - const presenter = new ErrorGroupPresenter($replica, 
logsClickhouseClient); + const presenter = new ErrorGroupPresenter($replica, logsClickhouseClient, clickhouseClient); const detailPromise = presenter .call(project.organizationId, environment.id, { @@ -70,24 +67,6 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { projectId: project.id, fingerprint, }) - .then(async (result) => { - if (result.runFriendlyIds.length === 0) { - return { ...result, runList: undefined }; - } - - const runListPresenter = new NextRunListPresenter($replica, clickhouseClient); - const runList = await runListPresenter.call(project.organizationId, environment.id, { - userId, - projectId: project.id, - runId: result.runFriendlyIds, - pageSize: 25, - }); - - return { - ...result, - runList, - }; - }) .catch((error) => { if (error instanceof ServiceValidationError) { return { error: error.message }; @@ -107,7 +86,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { sevenDaysAgo, now ) - .catch(() => ({ granularity: "hours" as const, data: [] as ErrorGroupActivity })); + .catch(() => ({ data: [] as ErrorGroupActivity })); return typeddefer({ data: detailPromise, diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx index 011348a56b9..430e4cd42b5 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx @@ -111,9 +111,9 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { }); const occurrencesPromise = listPromise.then((result) => { - if ("error" in result) return { granularity: "hours" as const, data: {} }; + if ("error" in result) return { data: {} }; const fingerprints = result.errorGroups.map((g) => 
g.fingerprint); - if (fingerprints.length === 0) return { granularity: "hours" as const, data: {} }; + if (fingerprints.length === 0) return { data: {} }; return presenter.getOccurrences( project.organizationId, project.id, diff --git a/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts b/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts index 9d3a92e9110..92d5142d3a6 100644 --- a/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts +++ b/apps/webapp/app/services/runsRepository/clickhouseRunsRepository.server.ts @@ -328,4 +328,10 @@ function applyRunFiltersToQueryBuilder( machines: options.machines, }); } + + if (options.errorFingerprint) { + queryBuilder.where("error_fingerprint = {errorFingerprint: String}", { + errorFingerprint: options.errorFingerprint, + }); + } } diff --git a/apps/webapp/app/services/runsRepository/runsRepository.server.ts b/apps/webapp/app/services/runsRepository/runsRepository.server.ts index 90b58b8a980..76a1f8eedae 100644 --- a/apps/webapp/app/services/runsRepository/runsRepository.server.ts +++ b/apps/webapp/app/services/runsRepository/runsRepository.server.ts @@ -44,6 +44,7 @@ const RunListInputOptionsSchema = z.object({ bulkId: z.string().optional(), queues: z.array(z.string()).optional(), machines: MachinePresetName.array().optional(), + errorFingerprint: z.string().optional(), }); export type RunListInputOptions = z.infer; diff --git a/apps/webapp/app/utils/timeGranularity.ts b/apps/webapp/app/utils/timeGranularity.ts new file mode 100644 index 00000000000..699da3693ee --- /dev/null +++ b/apps/webapp/app/utils/timeGranularity.ts @@ -0,0 +1,36 @@ +import parseDuration from "parse-duration"; + +export type TimeGranularityBracket = { + max: string; + granularity: string; +}; + +type ParsedBracket = { + maxMs: number; + granularityMs: number; +}; + +export class TimeGranularity { + private readonly parsed: ParsedBracket[]; + + constructor(brackets: 
TimeGranularityBracket[]) { + if (brackets.length === 0) { + throw new Error("TimeGranularity requires at least one bracket"); + } + + this.parsed = brackets.map((b) => ({ + maxMs: parseDuration(b.max) ?? Infinity, + granularityMs: parseDuration(b.granularity)!, + })); + } + + getTimeGranularityMs(from: Date, to: Date): number { + const rangeMs = to.getTime() - from.getTime(); + for (const bracket of this.parsed) { + if (rangeMs <= bracket.maxMs) { + return bracket.granularityMs; + } + } + return this.parsed[this.parsed.length - 1].granularityMs; + } +} diff --git a/apps/webapp/test/timeGranularity.test.ts b/apps/webapp/test/timeGranularity.test.ts new file mode 100644 index 00000000000..dca24e7efa8 --- /dev/null +++ b/apps/webapp/test/timeGranularity.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from "vitest"; +import { TimeGranularity } from "~/utils/timeGranularity"; + +const SECOND = 1_000; +const MINUTE = 60 * SECOND; +const HOUR = 60 * MINUTE; + +function makeRange(durationMs: number): [Date, Date] { + const from = new Date("2025-01-01T00:00:00Z"); + const to = new Date(from.getTime() + durationMs); + return [from, to]; +} + +describe("TimeGranularity", () => { + const granularity = new TimeGranularity([ + { max: "1h", granularity: "10s" }, + { max: "6h", granularity: "1m" }, + { max: "Infinity", granularity: "10m" }, + ]); + + it("returns the first bracket when range is within its max", () => { + const [from, to] = makeRange(30 * MINUTE); + expect(granularity.getTimeGranularityMs(from, to)).toBe(10 * SECOND); + }); + + it("returns a middle bracket when range exceeds the first but not the second", () => { + const [from, to] = makeRange(2 * HOUR); + expect(granularity.getTimeGranularityMs(from, to)).toBe(1 * MINUTE); + }); + + it("returns the last bracket when range exceeds all non-Infinity maxes", () => { + const [from, to] = makeRange(24 * HOUR); + expect(granularity.getTimeGranularityMs(from, to)).toBe(10 * MINUTE); + }); + + it("matches a 
bracket when range exactly equals its max", () => { + const [from, to] = makeRange(1 * HOUR); + expect(granularity.getTimeGranularityMs(from, to)).toBe(10 * SECOND); + }); + + it("moves to the next bracket when range exceeds a boundary by 1ms", () => { + const [from, to] = makeRange(1 * HOUR + 1); + expect(granularity.getTimeGranularityMs(from, to)).toBe(1 * MINUTE); + }); + + it("returns the first bracket's granularity for a zero-length range", () => { + const date = new Date("2025-01-01T00:00:00Z"); + expect(granularity.getTimeGranularityMs(date, date)).toBe(10 * SECOND); + }); + + it("throws when constructed with an empty array", () => { + expect(() => new TimeGranularity([])).toThrow("at least one bracket"); + }); +}); diff --git a/internal-packages/clickhouse/src/errors.ts b/internal-packages/clickhouse/src/errors.ts index d15d07392aa..1c268d76faa 100644 --- a/internal-packages/clickhouse/src/errors.ts +++ b/internal-packages/clickhouse/src/errors.ts @@ -129,46 +129,6 @@ export const ErrorInstanceQueryParams = z.object({ export type ErrorInstanceQueryParams = z.infer; -export const ErrorInstancesListQueryResult = z.object({ - run_id: z.string(), - friendly_id: z.string(), - task_identifier: z.string(), - created_at: z.string(), - status: z.string(), - error_text: z.string(), - trace_id: z.string(), - task_version: z.string(), -}); - -export type ErrorInstancesListQueryResult = z.infer; - -/** - * Gets a query builder for listing error instances from task_runs_v2. - * Allows flexible filtering and pagination for runs with a specific error fingerprint. 
- */ -export function getErrorInstancesListQueryBuilder( - ch: ClickhouseReader, - settings?: ClickHouseSettings -) { - return ch.queryBuilder({ - name: "getErrorInstancesList", - baseQuery: ` - SELECT - run_id, - friendly_id, - task_identifier, - toString(created_at) as created_at, - status, - error_text, - trace_id, - task_version - FROM trigger_dev.task_runs_v2 FINAL - `, - schema: ErrorInstancesListQueryResult, - settings, - }); -} - export const ErrorHourlyOccurrencesQueryResult = z.object({ error_fingerprint: z.string(), hour_epoch: z.number(), @@ -325,54 +285,3 @@ export function createErrorOccurrencesQueryBuilder( ); } -// --------------------------------------------------------------------------- -// Time granularity helpers -// --------------------------------------------------------------------------- - -export type TimeGranularity = "minutes" | "hours" | "days" | "weeks" | "months"; - -const MINUTE_MS = 60 * 1000; -const HOUR_MS = 60 * MINUTE_MS; -const DAY_MS = 24 * HOUR_MS; -const WEEK_MS = 7 * DAY_MS; -const MONTH_MS = 30 * DAY_MS; - -/** - * Determines the appropriate time granularity for bucketing based on the - * span of a time range, following the same thresholds as the Query chart UI. - */ -export function detectTimeGranularity(from: Date, to: Date): TimeGranularity { - const rangeMs = to.getTime() - from.getTime(); - - if (rangeMs <= 2 * HOUR_MS) return "minutes"; - if (rangeMs <= 2 * DAY_MS) return "hours"; - if (rangeMs <= 2 * WEEK_MS) return "days"; - if (rangeMs <= 3 * MONTH_MS) return "weeks"; - return "months"; -} - -const GRANULARITY_TO_INTERVAL: Record = { - minutes: "INTERVAL 1 MINUTE", - hours: "INTERVAL 1 HOUR", - days: "INTERVAL 1 DAY", - weeks: "INTERVAL 1 WEEK", - months: "INTERVAL 1 MONTH", -}; - -const GRANULARITY_TO_STEP_MS: Record = { - minutes: MINUTE_MS, - hours: HOUR_MS, - days: DAY_MS, - weeks: WEEK_MS, - months: MONTH_MS, -}; - -/** Returns a ClickHouse INTERVAL expression for use with toStartOfInterval(). 
*/ -export function granularityToInterval(granularity: TimeGranularity): string { - return GRANULARITY_TO_INTERVAL[granularity]; -} - -/** Returns the step size in milliseconds for a granularity (approximate for months). */ -export function granularityToStepMs(granularity: TimeGranularity): number { - return GRANULARITY_TO_STEP_MS[granularity]; -} diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index 4d0ded6e7cf..653be85af49 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -31,17 +31,11 @@ import { getErrorGroups, getErrorInstances, getErrorGroupsListQueryBuilder, - getErrorInstancesListQueryBuilder, getErrorHourlyOccurrences, getErrorOccurrencesListQueryBuilder, createErrorOccurrencesQueryBuilder, } from "./errors.js"; -export { - detectTimeGranularity, - granularityToInterval, - granularityToStepMs, - type TimeGranularity, -} from "./errors.js"; +export { msToClickHouseInterval } from "./intervals.js"; import { Logger, type LogLevel } from "@trigger.dev/core/logger"; import type { Agent as HttpAgent } from "http"; import type { Agent as HttpsAgent } from "https"; @@ -252,7 +246,6 @@ export class ClickHouse { getInstances: getErrorInstances(this.reader), getHourlyOccurrences: getErrorHourlyOccurrences(this.reader), listQueryBuilder: getErrorGroupsListQueryBuilder(this.reader), - instancesQueryBuilder: getErrorInstancesListQueryBuilder(this.reader), occurrencesListQueryBuilder: getErrorOccurrencesListQueryBuilder(this.reader), createOccurrencesQueryBuilder: (intervalExpr: string) => createErrorOccurrencesQueryBuilder(this.reader, intervalExpr), diff --git a/internal-packages/clickhouse/src/intervals.ts b/internal-packages/clickhouse/src/intervals.ts new file mode 100644 index 00000000000..d21fd73815c --- /dev/null +++ b/internal-packages/clickhouse/src/intervals.ts @@ -0,0 +1,5 @@ +/** Converts a granularity in milliseconds to a ClickHouse INTERVAL 
expression. */ +export function msToClickHouseInterval(ms: number): string { + const seconds = Math.round(ms / 1000); + return `INTERVAL ${seconds} SECOND`; +} From c97fe8846d71f6ab973135508aeb750196022aa7 Mon Sep 17 00:00:00 2001 From: Matt Aitken Date: Tue, 3 Mar 2026 18:17:09 +0000 Subject: [PATCH 12/12] Time filtering on the error detail page --- .../v3/ErrorGroupPresenter.server.ts | 67 ++++++++++++++++- .../route.tsx | 72 +++++++++++++------ .../route.tsx | 40 +++++++---- apps/webapp/app/utils/semver.ts | 41 +++++++++++ internal-packages/clickhouse/src/errors.ts | 31 +++++++- internal-packages/clickhouse/src/index.ts | 2 + 6 files changed, 216 insertions(+), 37 deletions(-) create mode 100644 apps/webapp/app/utils/semver.ts diff --git a/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts b/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts index ae8e199fe9e..6c8fd5a0d63 100644 --- a/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts +++ b/apps/webapp/app/presenters/v3/ErrorGroupPresenter.server.ts @@ -11,6 +11,7 @@ const errorGroupGranularity = new TimeGranularity([ { max: "Infinity", granularity: "30d" }, ]); import { type PrismaClientOrTransaction } from "@trigger.dev/database"; +import { timeFilterFromTo } from "~/components/runs/v3/SharedFilters"; import { findDisplayableEnvironment } from "~/models/runtimeEnvironment.server"; import { ServiceValidationError } from "~/v3/services/baseService.server"; import { BasePresenter } from "~/presenters/v3/basePresenter.server"; @@ -18,12 +19,16 @@ import { NextRunListPresenter, type NextRunList, } from "~/presenters/v3/NextRunListPresenter.server"; +import { sortVersionsDescending } from "~/utils/semver"; export type ErrorGroupOptions = { userId?: string; projectId: string; fingerprint: string; runsPageSize?: number; + period?: string; + from?: number; + to?: number; }; export const ErrorGroupOptionsSchema = z.object({ @@ -31,6 +36,9 @@ export const ErrorGroupOptionsSchema = z.object({ 
projectId: z.string(), fingerprint: z.string(), runsPageSize: z.number().int().positive().max(1000).optional(), + period: z.string().optional(), + from: z.number().int().nonnegative().optional(), + to: z.number().int().nonnegative().optional(), }); const DEFAULT_RUNS_PAGE_SIZE = 25; @@ -53,6 +61,7 @@ export type ErrorGroupSummary = { count: number; firstSeen: Date; lastSeen: Date; + affectedVersions: string[]; }; export type ErrorGroupOccurrences = Awaited>; @@ -75,6 +84,9 @@ export class ErrorGroupPresenter extends BasePresenter { projectId, fingerprint, runsPageSize = DEFAULT_RUNS_PAGE_SIZE, + period, + from, + to, }: ErrorGroupOptions ) { const displayableEnvironment = await findDisplayableEnvironment(environmentId, userId); @@ -83,19 +95,37 @@ export class ErrorGroupPresenter extends BasePresenter { throw new ServiceValidationError("No environment found"); } - const [summary, runList] = await Promise.all([ + const time = timeFilterFromTo({ + period, + from, + to, + defaultPeriod: "7d", + }); + + const [summary, affectedVersions, runList] = await Promise.all([ this.getSummary(organizationId, projectId, environmentId, fingerprint), + this.getAffectedVersions(organizationId, projectId, environmentId, fingerprint), this.getRunList(organizationId, environmentId, { userId, projectId, fingerprint, pageSize: runsPageSize, + from: time.from.getTime(), + to: time.to.getTime(), }), ]); + if (summary) { + summary.affectedVersions = affectedVersions; + } + return { errorGroup: summary, runList, + filters: { + from: time.from, + to: time.to, + }, }; } @@ -194,9 +224,40 @@ export class ErrorGroupPresenter extends BasePresenter { count: record.occurrence_count, firstSeen: parseClickHouseDateTime(record.first_seen), lastSeen: parseClickHouseDateTime(record.last_seen), + affectedVersions: [], }; } + /** + * Returns the most recent distinct task_version values for an error fingerprint, + * sorted by semantic version descending (newest first). 
+ * Queries error_occurrences_v1 where task_version is part of the ORDER BY key. + */ + private async getAffectedVersions( + organizationId: string, + projectId: string, + environmentId: string, + fingerprint: string + ): Promise { + const queryBuilder = this.logsClickhouse.errors.affectedVersionsQueryBuilder(); + + queryBuilder.where("organization_id = {organizationId: String}", { organizationId }); + queryBuilder.where("project_id = {projectId: String}", { projectId }); + queryBuilder.where("environment_id = {environmentId: String}", { environmentId }); + queryBuilder.where("error_fingerprint = {fingerprint: String}", { fingerprint }); + queryBuilder.where("task_version != ''"); + queryBuilder.limit(100); + + const [queryError, records] = await queryBuilder.execute(); + + if (queryError || !records) { + return []; + } + + const versions = records.map((r) => r.task_version).filter((v) => v.length > 0); + return sortVersionsDescending(versions).slice(0, 5); + } + private async getRunList( organizationId: string, environmentId: string, @@ -205,6 +266,8 @@ export class ErrorGroupPresenter extends BasePresenter { projectId: string; fingerprint: string; pageSize: number; + from?: number; + to?: number; } ): Promise { const runListPresenter = new NextRunListPresenter(this.replica, this.clickhouse); @@ -214,6 +277,8 @@ export class ErrorGroupPresenter extends BasePresenter { projectId: options.projectId, errorFingerprint: options.fingerprint, pageSize: options.pageSize, + from: options.from, + to: options.to, }); if (result.runs.length === 0) { diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx index 46015c0b870..d74784a1f19 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx +++ 
b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors.$fingerprint/route.tsx @@ -29,6 +29,8 @@ import { TaskRunsTable } from "~/components/runs/v3/TaskRunsTable"; import { DateTime } from "~/components/primitives/DateTime"; import { ErrorId } from "@trigger.dev/core/v3/isomorphic"; import { Chart, type ChartConfig } from "~/components/primitives/charts/ChartCompound"; +import { TimeFilter, timeFilterFromTo } from "~/components/runs/v3/SharedFilters"; +import { useOptimisticLocation } from "~/hooks/useOptimisticLocation"; export const meta: MetaFunction = ({ data }) => { return [ @@ -59,6 +61,13 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw new Response("Environment not found", { status: 404 }); } + const url = new URL(request.url); + const period = url.searchParams.get("period") ?? undefined; + const fromStr = url.searchParams.get("from"); + const toStr = url.searchParams.get("to"); + const from = fromStr ? parseInt(fromStr, 10) : undefined; + const to = toStr ? 
parseInt(toStr, 10) : undefined; + const presenter = new ErrorGroupPresenter($replica, logsClickhouseClient, clickhouseClient); const detailPromise = presenter @@ -66,6 +75,9 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { userId, projectId: project.id, fingerprint, + period, + from, + to, }) .catch((error) => { if (error instanceof ServiceValidationError) { @@ -74,8 +86,7 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { throw error; }); - const now = new Date(); - const sevenDaysAgo = new Date(now.getTime() - 7 * 24 * 60 * 60 * 1000); + const time = timeFilterFromTo({ period, from, to, defaultPeriod: "7d" }); const activityPromise = presenter .getOccurrences( @@ -83,8 +94,8 @@ export const loader = async ({ request, params }: LoaderFunctionArgs) => { project.id, environment.id, fingerprint, - sevenDaysAgo, - now + time.from, + time.to ) .catch(() => ({ data: [] as ErrorGroupActivity })); @@ -102,11 +113,25 @@ export default function Page() { const { data, activity, organizationSlug, projectParam, envParam, fingerprint } = useTypedLoaderData(); - const errorsPath = v3ErrorsPath( - { slug: organizationSlug }, - { slug: projectParam }, - { slug: envParam } - ); + const location = useOptimisticLocation(); + const searchParams = new URLSearchParams(location.search); + + const errorsPath = useMemo(() => { + const base = v3ErrorsPath( + { slug: organizationSlug }, + { slug: projectParam }, + { slug: envParam } + ); + const carry = new URLSearchParams(); + const period = searchParams.get("period"); + const from = searchParams.get("from"); + const to = searchParams.get("to"); + if (period) carry.set("period", period); + if (from) carry.set("from", from); + if (to) carry.set("to", to); + const qs = carry.toString(); + return qs ? `${base}?${qs}` : base; + }, [organizationSlug, projectParam, envParam, searchParams.toString()]); return ( @@ -203,6 +228,10 @@ function ErrorGroupDetail({
{errorGroup.errorMessage} +
+ +
+
@@ -224,6 +253,16 @@ function ErrorGroupDetail({ Occurrences {formatNumberCompact(errorGroup.count)} + {errorGroup.affectedVersions.length > 0 && ( + + Affected versions + + + {errorGroup.affectedVersions.join(", ")} + + + + )} @@ -241,22 +280,11 @@ function ErrorGroupDetail({
- - {errorGroup.stackTrace && ( -
- - Stack Trace - -
-              {errorGroup.stackTrace}
-            
-
- )}
- {/* Activity over past 7 days */} + {/* Activity chart */}
- Activity (past 7 days) + Activity }> }> {(result) => diff --git a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx index 430e4cd42b5..fee960191ca 100644 --- a/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx +++ b/apps/webapp/app/routes/_app.orgs.$organizationSlug.projects.$projectParam.env.$envParam.errors._index/route.tsx @@ -2,7 +2,7 @@ import { XMarkIcon } from "@heroicons/react/20/solid"; import { Form, type MetaFunction } from "@remix-run/react"; import { type LoaderFunctionArgs } from "@remix-run/server-runtime"; import { ErrorId } from "@trigger.dev/core/v3/isomorphic"; -import { Suspense } from "react"; +import { Suspense, useMemo } from "react"; import { Bar, BarChart, @@ -243,7 +243,11 @@ function FiltersBar({ {list ? ( <> - + {hasFilters && (
@@ -344,12 +348,26 @@ function ErrorGroupRow({ projectParam: string; envParam: string; }) { - const errorPath = v3ErrorPath( - { slug: organizationSlug }, - { slug: projectParam }, - { slug: envParam }, - { fingerprint: errorGroup.fingerprint } - ); + const location = useOptimisticLocation(); + const searchParams = new URLSearchParams(location.search); + + const errorPath = useMemo(() => { + const base = v3ErrorPath( + { slug: organizationSlug }, + { slug: projectParam }, + { slug: envParam }, + { fingerprint: errorGroup.fingerprint } + ); + const carry = new URLSearchParams(); + const period = searchParams.get("period"); + const from = searchParams.get("from"); + const to = searchParams.get("to"); + if (period) carry.set("period", period); + if (from) carry.set("from", from); + if (to) carry.set("to", to); + const qs = carry.toString(); + return qs ? `${base}?${qs}` : base; + }, [organizationSlug, projectParam, envParam, errorGroup.fingerprint, searchParams.toString()]); const errorMessage = `${errorGroup.errorMessage}`; @@ -387,11 +405,7 @@ function ErrorGroupRow({ ); } -function ErrorActivityGraph({ - activity, -}: { - activity: ErrorOccurrenceActivity; -}) { +function ErrorActivityGraph({ activity }: { activity: ErrorOccurrenceActivity }) { const maxCount = Math.max(...activity.map((d) => d.count)); return ( diff --git a/apps/webapp/app/utils/semver.ts b/apps/webapp/app/utils/semver.ts new file mode 100644 index 00000000000..e53abf9ee09 --- /dev/null +++ b/apps/webapp/app/utils/semver.ts @@ -0,0 +1,41 @@ +/** + * Parses a version string into comparable numeric parts. + * Handles formats like "1.2.3", "20240115.1", "v1.0.0", plain timestamps, etc. + * Non-numeric pre-release suffixes (e.g. "-beta.1") are stripped for ordering purposes. + */ +function parseVersionParts(version: string): number[] { + const cleaned = version.replace(/^v/i, "").replace(/[-+].*$/, ""); + return cleaned.split(".").map((p) => { + const n = parseInt(p, 10); + return isNaN(n) ? 
0 : n; + }); +} + +/** + * Compares two version strings using numeric segment comparison (descending). + * Falls back to lexicographic comparison when segments are equal. + * Returns a negative number if `a` should come before `b` (i.e. `a` is newer). + */ +export function compareVersionsDescending(a: string, b: string): number { + const partsA = parseVersionParts(a); + const partsB = parseVersionParts(b); + const maxLen = Math.max(partsA.length, partsB.length); + + for (let i = 0; i < maxLen; i++) { + const segA = partsA[i] ?? 0; + const segB = partsB[i] ?? 0; + if (segA !== segB) { + return segB - segA; + } + } + + return b.localeCompare(a); +} + +/** + * Sorts an array of version strings in descending order (newest first). + * Non-destructive – returns a new array. + */ +export function sortVersionsDescending(versions: string[]): string[] { + return [...versions].sort(compareVersionsDescending); +} diff --git a/internal-packages/clickhouse/src/errors.ts b/internal-packages/clickhouse/src/errors.ts index 1c268d76faa..326a4defd82 100644 --- a/internal-packages/clickhouse/src/errors.ts +++ b/internal-packages/clickhouse/src/errors.ts @@ -214,6 +214,36 @@ export function getErrorInstances(ch: ClickhouseReader, settings?: ClickHouseSet }); } +// --------------------------------------------------------------------------- +// Affected versions – distinct task_version from error_occurrences_v1 +// --------------------------------------------------------------------------- + +export const ErrorAffectedVersionsQueryResult = z.object({ + task_version: z.string(), +}); + +export type ErrorAffectedVersionsQueryResult = z.infer; + +/** + * Query builder for fetching distinct task_version values for an error fingerprint + * from the error_occurrences_v1 SummingMergeTree table. + * task_version is part of the ORDER BY key, so this is efficient. 
+ */ +export function getErrorAffectedVersionsQueryBuilder( + ch: ClickhouseReader, + settings?: ClickHouseSettings +) { + return ch.queryBuilder({ + name: "getErrorAffectedVersions", + baseQuery: ` + SELECT DISTINCT task_version + FROM trigger_dev.error_occurrences_v1 + `, + schema: ErrorAffectedVersionsQueryResult, + settings, + }); +} + // --------------------------------------------------------------------------- // error_occurrences_v1 – per-minute bucketed error counts // --------------------------------------------------------------------------- @@ -284,4 +314,3 @@ export function createErrorOccurrencesQueryBuilder( settings ); } - diff --git a/internal-packages/clickhouse/src/index.ts b/internal-packages/clickhouse/src/index.ts index 653be85af49..b6fbd92177b 100644 --- a/internal-packages/clickhouse/src/index.ts +++ b/internal-packages/clickhouse/src/index.ts @@ -34,6 +34,7 @@ import { getErrorHourlyOccurrences, getErrorOccurrencesListQueryBuilder, createErrorOccurrencesQueryBuilder, + getErrorAffectedVersionsQueryBuilder, } from "./errors.js"; export { msToClickHouseInterval } from "./intervals.js"; import { Logger, type LogLevel } from "@trigger.dev/core/logger"; @@ -245,6 +246,7 @@ export class ClickHouse { getGroups: getErrorGroups(this.reader), getInstances: getErrorInstances(this.reader), getHourlyOccurrences: getErrorHourlyOccurrences(this.reader), + affectedVersionsQueryBuilder: getErrorAffectedVersionsQueryBuilder(this.reader), listQueryBuilder: getErrorGroupsListQueryBuilder(this.reader), occurrencesListQueryBuilder: getErrorOccurrencesListQueryBuilder(this.reader), createOccurrencesQueryBuilder: (intervalExpr: string) =>