Phase 2d — gateway_jobs retention (Janitor goroutine)
Periodic cleanup goroutine, started alongside the worker when DATABASE_URL is set. Three concerns:

- DELETE rows with status='done' older than QUEUE_DONE_RETENTION (default 168h / 7 days); past success rows have no value beyond a debug runway.
- UPDATE rows stuck in status='running' for more than QUEUE_STUCK_TIMEOUT (default 30m) back to 'pending' so a worker can retry. This handles a pod crashing mid-job; without it, those jobs stay orphaned forever.
- 'dead' rows are NEVER auto-purged: their volume is negligible and they are kept for forensics.

Configurable via env:

- QUEUE_DONE_RETENTION (default 168h)
- QUEUE_STUCK_TIMEOUT (default 30m)
- QUEUE_JANITOR_INTERVAL (default 1h)

The janitor runs once immediately at startup (recovering anything orphaned by the previous pod before opening for new traffic), then ticks on the interval. The Queue interface gains PurgeDone + RecoverStuck; both use Postgres' make_interval(secs) for safe parameterization (see the sketch below). 4 new unit tests via a fakeQueue mock (47 total, race clean).
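The diff in this commit page only shows the test file, so the Postgres side of PurgeDone and RecoverStuck is not visible here. A minimal sketch of what the two new Queue methods could look like, assuming a database/sql-backed pgQueue and an updated_at column (both assumptions; only the gateway_jobs table, the status transitions, and make_interval(secs) come from the commit message):

```go
package main

// Hypothetical Postgres-side implementation of the two new Queue methods.
// pgQueue, its *sql.DB field, and the updated_at column are assumptions;
// the gateway_jobs table, the status values, and make_interval(secs) are
// stated in the commit message.

import (
	"context"
	"database/sql"
	"time"
)

type pgQueue struct {
	db *sql.DB
}

// PurgeDone deletes 'done' rows older than age and reports how many went.
func (q *pgQueue) PurgeDone(ctx context.Context, age time.Duration) (int64, error) {
	res, err := q.db.ExecContext(ctx, `
		DELETE FROM gateway_jobs
		WHERE status = 'done'
		  AND updated_at < now() - make_interval(secs => $1)`,
		age.Seconds())
	if err != nil {
		return 0, err
	}
	return res.RowsAffected()
}

// RecoverStuck flips rows stuck in 'running' back to 'pending' so another
// worker can retry them, and reports how many were recovered.
func (q *pgQueue) RecoverStuck(ctx context.Context, age time.Duration) (int64, error) {
	res, err := q.db.ExecContext(ctx, `
		UPDATE gateway_jobs
		SET status = 'pending'
		WHERE status = 'running'
		  AND updated_at < now() - make_interval(secs => $1)`,
		age.Seconds())
	if err != nil {
		return 0, err
	}
	return res.RowsAffected()
}
```

Passing age.Seconds() as a bound parameter through make_interval(secs => $1) is what "safe parameterization" means here: no interval string is ever spliced into the SQL text.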
janitor_test.go (new file, 117 lines)
@@ -0,0 +1,117 @@
```go
package main

import (
	"context"
	"errors"
	"sync"
	"testing"
	"time"
)

// fakeQueue records calls to PurgeDone / RecoverStuck. Other methods are
// no-ops because the janitor doesn't touch them.
type fakeQueue struct {
	mu sync.Mutex

	purgeAges []time.Duration
	purgeRet  int64
	purgeErr  error

	stuckAges []time.Duration
	stuckRet  int64
	stuckErr  error
}

func (f *fakeQueue) Enqueue(_ context.Context, _ Job) error    { return nil }
func (f *fakeQueue) Pop(_ context.Context) (*Job, error)       { return nil, nil }
func (f *fakeQueue) MarkDone(_ context.Context, _ int64) error { return nil }
func (f *fakeQueue) MarkFailed(_ context.Context, _ int64, _ int, _ error, _ int) error {
	return nil
}

func (f *fakeQueue) PurgeDone(_ context.Context, age time.Duration) (int64, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.purgeAges = append(f.purgeAges, age)
	return f.purgeRet, f.purgeErr
}

func (f *fakeQueue) RecoverStuck(_ context.Context, age time.Duration) (int64, error) {
	f.mu.Lock()
	defer f.mu.Unlock()
	f.stuckAges = append(f.stuckAges, age)
	return f.stuckRet, f.stuckErr
}

func TestJanitor_TickCallsBothQueueMethods(t *testing.T) {
	q := &fakeQueue{purgeRet: 3, stuckRet: 1}
	j := NewJanitor(q, 7*24*time.Hour, 30*time.Minute, time.Hour)

	j.tick(context.Background())

	q.mu.Lock()
	defer q.mu.Unlock()
	if len(q.purgeAges) != 1 || q.purgeAges[0] != 7*24*time.Hour {
		t.Fatalf("PurgeDone calls = %v, want one call with 168h", q.purgeAges)
	}
	if len(q.stuckAges) != 1 || q.stuckAges[0] != 30*time.Minute {
		t.Fatalf("RecoverStuck calls = %v, want one call with 30m", q.stuckAges)
	}
}

func TestJanitor_TickSurvivesPurgeError(t *testing.T) {
	q := &fakeQueue{purgeErr: errors.New("boom")}
	j := NewJanitor(q, time.Hour, time.Minute, time.Hour)

	// Should not panic, should still call RecoverStuck despite Purge failure.
	j.tick(context.Background())

	q.mu.Lock()
	defer q.mu.Unlock()
	if len(q.stuckAges) != 1 {
		t.Fatalf("RecoverStuck should still run after PurgeDone error, got %d calls", len(q.stuckAges))
	}
}

func TestJanitor_DefaultsAppliedOnZeroOrNegative(t *testing.T) {
	q := &fakeQueue{}
	j := NewJanitor(q, 0, -time.Second, 0)

	if j.doneRetention != defaultDoneRetention {
		t.Errorf("doneRetention = %s, want %s", j.doneRetention, defaultDoneRetention)
	}
	if j.stuckTimeout != defaultStuckTimeout {
		t.Errorf("stuckTimeout = %s, want %s", j.stuckTimeout, defaultStuckTimeout)
	}
	if j.interval != defaultJanitorInterval {
		t.Errorf("interval = %s, want %s", j.interval, defaultJanitorInterval)
	}
}

func TestJanitor_RunStopsOnContextCancel(t *testing.T) {
	q := &fakeQueue{}
	// Very short interval so the ticker fires at least once before we cancel.
	j := NewJanitor(q, time.Hour, time.Minute, 5*time.Millisecond)

	ctx, cancel := context.WithCancel(context.Background())
	done := make(chan struct{})
	go func() {
		j.Run(ctx)
		close(done)
	}()

	// Let the immediate-startup tick + at least one interval tick fire.
	time.Sleep(20 * time.Millisecond)
	cancel()
	select {
	case <-done:
	case <-time.After(time.Second):
		t.Fatal("janitor did not stop after context cancel")
	}

	q.mu.Lock()
	defer q.mu.Unlock()
	if len(q.purgeAges) < 1 {
		t.Fatalf("expected at least 1 purge call before cancel, got %d", len(q.purgeAges))
	}
}
```
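The janitor implementation itself (presumably janitor.go) is not part of this excerpt. A minimal sketch consistent with the tests above; the constant and field names come from the test file, while the narrowed interface, the logging, and the loop shape are assumptions:

```go
package main

// Minimal janitor consistent with janitor_test.go. Constant and field names
// are taken from the tests; everything else is an assumption about the real
// implementation.

import (
	"context"
	"log"
	"time"
)

const (
	defaultDoneRetention   = 7 * 24 * time.Hour // QUEUE_DONE_RETENTION
	defaultStuckTimeout    = 30 * time.Minute   // QUEUE_STUCK_TIMEOUT
	defaultJanitorInterval = time.Hour          // QUEUE_JANITOR_INTERVAL
)

// janitorQueue is the slice of the Queue interface the janitor actually uses.
type janitorQueue interface {
	PurgeDone(ctx context.Context, age time.Duration) (int64, error)
	RecoverStuck(ctx context.Context, age time.Duration) (int64, error)
}

type Janitor struct {
	q             janitorQueue
	doneRetention time.Duration
	stuckTimeout  time.Duration
	interval      time.Duration
}

// NewJanitor falls back to the defaults for zero or negative durations.
func NewJanitor(q janitorQueue, doneRetention, stuckTimeout, interval time.Duration) *Janitor {
	if doneRetention <= 0 {
		doneRetention = defaultDoneRetention
	}
	if stuckTimeout <= 0 {
		stuckTimeout = defaultStuckTimeout
	}
	if interval <= 0 {
		interval = defaultJanitorInterval
	}
	return &Janitor{q: q, doneRetention: doneRetention, stuckTimeout: stuckTimeout, interval: interval}
}

// tick runs both cleanups; a PurgeDone failure must not skip RecoverStuck
// (TestJanitor_TickSurvivesPurgeError pins that down).
func (j *Janitor) tick(ctx context.Context) {
	if n, err := j.q.PurgeDone(ctx, j.doneRetention); err != nil {
		log.Printf("janitor: purge done rows: %v", err)
	} else if n > 0 {
		log.Printf("janitor: purged %d done rows", n)
	}
	if n, err := j.q.RecoverStuck(ctx, j.stuckTimeout); err != nil {
		log.Printf("janitor: recover stuck rows: %v", err)
	} else if n > 0 {
		log.Printf("janitor: recovered %d stuck rows", n)
	}
}

// Run ticks once immediately, then on every interval until ctx is canceled.
func (j *Janitor) Run(ctx context.Context) {
	j.tick(ctx)
	t := time.NewTicker(j.interval)
	defer t.Stop()
	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			j.tick(ctx)
		}
	}
}
```

The immediate tick before the ticker loop is what lets TestJanitor_RunStopsOnContextCancel assert at least one purge call, and it matches the commit message: recover anything orphaned by the previous pod before settling into the interval cadence.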