Commit 1b8129a
feat: add failed_permanent metric for worker monitoring (#2107)
* feat: add last failure timestamp metric for worker monitoring

  Add a Prometheus Gauge metric to track the timestamp of the last failure for
  each worker. This complements the existing failed job counter by providing
  visibility into when failures last occurred, for monitoring and alerting
  purposes.

  Changes:
  - Added workerLastFailureGauge metric in metrics.ts
  - Updated all 9 workers to set the gauge on failure:
    - crawler, feed, webhook, assetPreProcessing
    - inference, adminMaintenance, ruleEngine
    - video, search

* refactor: track both all failures and permanent failures with counter

  Remove the gauge metric and use the existing counter to track both:
  - All failures (including retry attempts): status="failed"
  - Permanent failures (retries exhausted): status="failed_permanent"

  This provides better visibility into retry behavior and into permanent vs.
  temporary failures without adding a separate metric.

  Changes:
  - Removed workerLastFailureGauge from metrics.ts
  - Updated all 9 workers to track failed_permanent when numRetriesLeft == 0
  - Maintained the existing failed counter for all failure attempts

* style: format worker files with prettier

---------

Co-authored-by: Claude <[email protected]>
1 parent d9ef832 commit 1b8129a
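For context: metrics.ts itself is not part of this diff, but the call shape used throughout the workers (workerStatsCounter.labels(worker, status).inc()) matches a prom-client Counter with two labels. A minimal sketch of what that definition presumably looks like; the metric name here is an assumption, since only the identifier workerStatsCounter and the "failed" / "failed_permanent" statuses appear in the commit:

    import { Counter } from "prom-client";

    // Sketch only: the real definition lives in metrics.ts and its metric
    // name is not shown in this commit. The label order (worker first, then
    // status) is inferred from the .labels(...) calls in the diffs below.
    export const workerStatsCounter = new Counter({
      name: "worker_stats_total", // assumed name
      help: "Worker job outcomes by worker and status",
      labelNames: ["worker", "status"],
    });

Because both statuses share one counter, a permanent failure increments both series: "failed" counts every attempt, while "failed_permanent" fires only on the attempt where numRetriesLeft reaches 0.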

File tree

9 files changed: +32 −0 lines changed

apps/workers/workers/adminMaintenanceWorker.ts

Lines changed: 8 additions & 0 deletions
@@ -34,6 +34,14 @@ export class AdminMaintenanceWorker {
         workerStatsCounter
           .labels(`adminMaintenance:${job.data?.type}`, "failed")
           .inc();
+        if (job.numRetriesLeft == 0) {
+          workerStatsCounter
+            .labels(
+              `adminMaintenance:${job.data?.type}`,
+              "failed_permanent",
+            )
+            .inc();
+        }
         logger.error(
           `[adminMaintenance:${job.data?.type}][${job.id}] Job failed: ${job.error}\n${job.error.stack}`,
         );

apps/workers/workers/assetPreprocessingWorker.ts

Lines changed: 5 additions & 0 deletions
@@ -47,6 +47,11 @@ export class AssetPreprocessingWorker {
       },
       onError: async (job) => {
         workerStatsCounter.labels("assetPreProcessing", "failed").inc();
+        if (job.numRetriesLeft == 0) {
+          workerStatsCounter
+            .labels("assetPreProcessing", "failed_permanent")
+            .inc();
+        }
         const jobId = job.id;
         logger.error(
           `[assetPreprocessing][${jobId}] Asset preprocessing failed: ${job.error}\n${job.error.stack}`,

apps/workers/workers/crawlerWorker.ts

Lines changed: 3 additions & 0 deletions
@@ -313,6 +313,9 @@ export class CrawlerWorker {
       },
       onError: async (job) => {
         workerStatsCounter.labels("crawler", "failed").inc();
+        if (job.numRetriesLeft == 0) {
+          workerStatsCounter.labels("crawler", "failed_permanent").inc();
+        }
         const jobId = job.id;
         logger.error(
           `[Crawler][${jobId}] Crawling job failed: ${job.error}\n${job.error.stack}`,

apps/workers/workers/feedWorker.ts

Lines changed: 3 additions & 0 deletions
@@ -67,6 +67,9 @@ export class FeedWorker {
       },
       onError: async (job) => {
         workerStatsCounter.labels("feed", "failed").inc();
+        if (job.numRetriesLeft == 0) {
+          workerStatsCounter.labels("feed", "failed_permanent").inc();
+        }
         const jobId = job.id;
         logger.error(
           `[feed][${jobId}] Feed fetch job failed: ${job.error}\n${job.error.stack}`,

apps/workers/workers/inference/inferenceWorker.ts

Lines changed: 1 addition & 0 deletions
@@ -56,6 +56,7 @@ export class OpenAiWorker {
           `[inference][${jobId}] inference job failed: ${job.error}\n${job.error.stack}`,
         );
         if (job.numRetriesLeft == 0) {
+          workerStatsCounter.labels("inference", "failed_permanent").inc();
           await attemptMarkStatus(job?.data, "failure");
         }
       },

apps/workers/workers/ruleEngineWorker.ts

Lines changed: 3 additions & 0 deletions
@@ -29,6 +29,9 @@ export class RuleEngineWorker {
       },
       onError: (job) => {
         workerStatsCounter.labels("ruleEngine", "failed").inc();
+        if (job.numRetriesLeft == 0) {
+          workerStatsCounter.labels("ruleEngine", "failed_permanent").inc();
+        }
         const jobId = job.id;
         logger.error(
           `[ruleEngine][${jobId}] rule engine job failed: ${job.error}\n${job.error.stack}`,

apps/workers/workers/searchWorker.ts

Lines changed: 3 additions & 0 deletions
@@ -34,6 +34,9 @@ export class SearchIndexingWorker {
       },
       onError: (job) => {
         workerStatsCounter.labels("search", "failed").inc();
+        if (job.numRetriesLeft == 0) {
+          workerStatsCounter.labels("search", "failed_permanent").inc();
+        }
         const jobId = job.id;
         logger.error(
           `[search][${jobId}] search job failed: ${job.error}\n${job.error.stack}`,

apps/workers/workers/videoWorker.ts

Lines changed: 3 additions & 0 deletions
@@ -46,6 +46,9 @@ export class VideoWorker {
       },
       onError: async (job) => {
         workerStatsCounter.labels("video", "failed").inc();
+        if (job.numRetriesLeft == 0) {
+          workerStatsCounter.labels("video", "failed_permanent").inc();
+        }
         const jobId = job.id;
         logger.error(
           `[VideoCrawler][${jobId}] Video Download job failed: ${job.error}`,

apps/workers/workers/webhookWorker.ts

Lines changed: 3 additions & 0 deletions
@@ -28,6 +28,9 @@ export class WebhookWorker {
       },
       onError: async (job) => {
         workerStatsCounter.labels("webhook", "failed").inc();
+        if (job.numRetriesLeft == 0) {
+          workerStatsCounter.labels("webhook", "failed_permanent").inc();
+        }
         const jobId = job.id;
         logger.error(
           `[webhook][${jobId}] webhook job failed: ${job.error}\n${job.error.stack}`,
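As a design note, the same guard is inlined in each of the nine workers above. The repetition could be factored into a shared helper; a hypothetical sketch, where trackJobFailure and the import path are not part of this commit:

    import { workerStatsCounter } from "../metrics"; // path assumed

    // Hypothetical helper, not in this commit: each worker currently inlines
    // these lines in its onError handler.
    export function trackJobFailure(worker: string, numRetriesLeft: number): void {
      // Count every failed attempt, including ones that will be retried.
      workerStatsCounter.labels(worker, "failed").inc();
      // Count separately when retries are exhausted and the failure is permanent.
      if (numRetriesLeft == 0) {
        workerStatsCounter.labels(worker, "failed_permanent").inc();
      }
    }

With both statuses on one counter, the transient-failure rate falls out in PromQL as the difference between the two series, e.g. rate(worker_stats_total{status="failed"}[5m]) - rate(worker_stats_total{status="failed_permanent"}[5m]), using the assumed metric name from the sketch near the top; the real name depends on how workerStatsCounter is registered in metrics.ts.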
