Phase 2d — gateway_jobs retention (Janitor goroutine)
Periodic cleanup goroutine, started alongside the worker when DATABASE_URL is set. Three concerns: - DELETE rows with status='done' older than QUEUE_DONE_RETENTION (default 168h / 7 days). Past success rows have no value beyond debug runway. - UPDATE rows stuck in status='running' for more than QUEUE_STUCK_TIMEOUT (default 30m) back to 'pending' so a worker can retry. Handles the case of a pod crashing mid-job (without this, jobs stay orphaned forever). - 'dead' rows are NEVER auto-purged (volume negligible, kept for forensics). Configurable via env: - QUEUE_DONE_RETENTION (default 168h) - QUEUE_STUCK_TIMEOUT (default 30m) - QUEUE_JANITOR_INTERVAL (default 1h) The janitor runs once immediately at startup (recovers anything orphaned by the previous pod before opening for new traffic), then ticks on the interval. Queue interface gains PurgeDone + RecoverStuck — both use Postgres' make_interval(secs) for safe parameterization. 4 new unit tests via fakeQueue mock (47 total, race clean).
This commit is contained in:
34
queue.go
34
queue.go
@@ -37,6 +37,13 @@ type Queue interface {
|
||||
// MarkFailed schedules a retry (status back to 'pending', next_retry_at
|
||||
// in the future). After maxAttempts the row is set to 'dead'.
|
||||
MarkFailed(ctx context.Context, id int64, attempt int, err error, maxAttempts int) error
|
||||
// PurgeDone removes rows with status='done' older than `olderThan`.
|
||||
// Returns the number of rows deleted. Used by the Janitor goroutine.
|
||||
PurgeDone(ctx context.Context, olderThan time.Duration) (int64, error)
|
||||
// RecoverStuck reverts rows stuck in status='running' for more than
|
||||
// `stuckFor` back to 'pending' so a worker can retry. Handles the case
|
||||
// of a pod dying mid-job. Returns the number of rows reverted.
|
||||
RecoverStuck(ctx context.Context, stuckFor time.Duration) (int64, error)
|
||||
}
|
||||
|
||||
type PostgresQueue struct {
|
||||
@@ -80,6 +87,17 @@ UPDATE gateway_jobs
|
||||
last_error = $2,
|
||||
updated_at = NOW()
|
||||
WHERE id = $1
|
||||
`
|
||||
purgeDoneSQL = `
|
||||
DELETE FROM gateway_jobs
|
||||
WHERE status = 'done'
|
||||
AND updated_at < NOW() - make_interval(secs => $1::int)
|
||||
`
|
||||
recoverStuckSQL = `
|
||||
UPDATE gateway_jobs
|
||||
SET status = 'pending', updated_at = NOW()
|
||||
WHERE status = 'running'
|
||||
AND updated_at < NOW() - make_interval(secs => $1::int)
|
||||
`
|
||||
)
|
||||
|
||||
@@ -123,6 +141,22 @@ func (q *PostgresQueue) MarkFailed(ctx context.Context, id int64, attempt int, j
|
||||
return err
|
||||
}
|
||||
|
||||
func (q *PostgresQueue) PurgeDone(ctx context.Context, olderThan time.Duration) (int64, error) {
|
||||
res, err := q.db.ExecContext(ctx, purgeDoneSQL, int(olderThan.Seconds()))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return res.RowsAffected()
|
||||
}
|
||||
|
||||
func (q *PostgresQueue) RecoverStuck(ctx context.Context, stuckFor time.Duration) (int64, error) {
|
||||
res, err := q.db.ExecContext(ctx, recoverStuckSQL, int(stuckFor.Seconds()))
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return res.RowsAffected()
|
||||
}
|
||||
|
||||
// Backoff returns the delay before the next attempt of a failed job.
|
||||
// 30s, 2m, 10m, 1h (capped). Spans roughly one night by attempt 5.
|
||||
func Backoff(attempt int) time.Duration {
|
||||
|
||||
Reference in New Issue
Block a user