From 2eb69f27090f478fb37d72ca7f419f1de307dc62 Mon Sep 17 00:00:00 2001 From: Gabriel Radureau Date: Tue, 5 May 2026 09:32:08 +0200 Subject: [PATCH] feat(config): add sampler hot-reload callback for ADR-0023 Phase 3.2 - Add SamplerReconfigureFunc type and SetSamplerReconfigureCallback method - Track previous sampler type/ratio values to detect changes - Invoke callback when telemetry.sampler.type or ratio changes - Fix race condition in WatchAndApply cleanup using watcherStopped flag - Add unit tests for sampler type/ratio hot-reload scenarios - Update ADR-0023 status to reflect Phase 3.2 in flight Generated by Mistral Vibe. Co-Authored-By: Mistral Vibe --- adr/0023-config-hot-reloading.md | 2 +- pkg/config/config.go | 86 +++++++++- pkg/config/config_hot_reload_test.go | 235 +++++++++++++++++++++++++++ 3 files changed, 317 insertions(+), 6 deletions(-) diff --git a/adr/0023-config-hot-reloading.md b/adr/0023-config-hot-reloading.md index bc9b0ef..09739b8 100644 --- a/adr/0023-config-hot-reloading.md +++ b/adr/0023-config-hot-reloading.md @@ -1,6 +1,6 @@ # Config Hot Reloading Strategy -**Status:** Phase 1+2 Implemented (2026-05-05 — `logging.level` and `auth.jwt.ttl` hot-reloadable via `Config.WatchAndApply` in `pkg/config/config.go`, wired in `pkg/server/server.go Run`. Phase 2 also fixed a pre-existing bug where the hardcoded 24h TTL ignored `auth.jwt.ttl` from config entirely. Remaining fields — `api.v2_enabled`, telemetry sampler — Proposed for follow-up phases.) +**Status:** Phase 1+2 Implemented (2026-05-05 — `logging.level` and `auth.jwt.ttl` hot-reloadable via `Config.WatchAndApply` in `pkg/config/config.go`, wired in `pkg/server/server.go Run`. Phase 2 also fixed a pre-existing bug where the hardcoded 24h TTL ignored `auth.jwt.ttl` from config entirely.) Phase 3 sub-phase 3.1 Implemented (2026-05-05 — `ReconfigureTracerProvider` in `pkg/telemetry/telemetry.go` added). Phase 3 sub-phase 3.2 In Flight (2026-05-05 — `telemetry.sampler.type` + `telemetry.sampler.ratio` hot-reload via `SetSamplerReconfigureCallback` in `pkg/config/config.go`. Remaining field: `api.v2_enabled`.) **Authors:** Gabriel Radureau, AI Agent **Date:** 2026-04-05 **Last Updated:** 2026-05-05 diff --git a/pkg/config/config.go b/pkg/config/config.go index 9c6ab3d..e22a937 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -16,6 +16,13 @@ import ( "dance-lessons-coach/pkg/version" ) +// SamplerReconfigureFunc is the signature for callbacks invoked when +// telemetry.sampler.type or telemetry.sampler.ratio change via hot-reload. +// The callback receives the new sampler type and ratio values. +// It must be safe to call concurrently — implementations should use their +// own synchronisation if needed. Returns an error if the reconfigure fails. +type SamplerReconfigureFunc func(ctx context.Context, samplerType string, samplerRatio float64) error + // NewZerologWriter creates a zerolog writer based on configuration func NewZerologWriter() *os.File { return os.Stderr @@ -41,6 +48,20 @@ type Config struct { // reloadMu serialises Unmarshal during hot-reload so a partial mutation // can't be observed mid-flight by getter calls. reloadMu sync.RWMutex `mapstructure:"-"` + + // samplerReconfigureCallback is invoked when telemetry.sampler.type or + // telemetry.sampler.ratio change. nil means no callback registered. + samplerReconfigureCallback SamplerReconfigureFunc `mapstructure:"-"` + + // prevSamplerType and prevSamplerRatio track the last-seen sampler values + // to detect changes during hot-reload (ADR-0023 Phase 3). + prevSamplerType string `mapstructure:"-"` + prevSamplerRatio float64 `mapstructure:"-"` + + // watcherStopped indicates that the config watcher has been stopped via + // the context being cancelled. This prevents the OnConfigChange handler + // from processing events after cleanup. + watcherStopped bool `mapstructure:"-"` } // ServerConfig holds server-related configuration @@ -314,6 +335,11 @@ func LoadConfig() (*Config, error) { // Keep the viper instance for hot-reload (ADR-0023). config.viper = v + // Initialize previous sampler values for hot-reload change detection + // (ADR-0023 Phase 3). + config.prevSamplerType = config.Telemetry.Sampler.Type + config.prevSamplerRatio = config.Telemetry.Sampler.Ratio + // Setup logging based on configuration (level, output file, time format). // The JSON/console format was already applied at the top of LoadConfig via // peekJSONLogging, so SetupLogging only needs to handle the remaining knobs. @@ -378,6 +404,19 @@ func (c *Config) GetSamplerRatio() float64 { return c.Telemetry.Sampler.Ratio } +// SetSamplerReconfigureCallback registers a callback that is invoked when +// telemetry.sampler.type or telemetry.sampler.ratio change via hot-reload. +// The callback receives the new sampler type and ratio values. +// Pass nil to unregister the callback. +func (c *Config) SetSamplerReconfigureCallback(callback SamplerReconfigureFunc) { + c.reloadMu.Lock() + defer c.reloadMu.Unlock() + c.samplerReconfigureCallback = callback + // Initialize previous values so we can detect changes on first hot-reload + c.prevSamplerType = c.Telemetry.Sampler.Type + c.prevSamplerRatio = c.Telemetry.Sampler.Ratio +} + // GetV2Enabled returns whether v2 API is enabled func (c *Config) GetV2Enabled() bool { return c.API.V2Enabled @@ -615,9 +654,11 @@ func (c *Config) setupLogOutput() { // reads it via JWTConfig.GetTTL (a method value capturing this *Config). // The reloaded TTL is used on the NEXT token generation; tokens issued // before the change keep their original expiry. +// - Phase 3: telemetry.sampler.type + telemetry.sampler.ratio — triggers +// the callback set via SetSamplerReconfigureCallback if the values change. // -// The other fields listed in ADR-0023 (api.v2_enabled, telemetry sampler) -// remain restart-only until their handlers land in subsequent phases. +// The other fields listed in ADR-0023 (api.v2_enabled) remain restart-only +// until their handlers land in subsequent phases. // // Stops when ctx is cancelled. Safe to call once at server startup. // If the config file is absent (ConfigFileNotFoundError at load time), this @@ -633,6 +674,14 @@ func (c *Config) WatchAndApply(ctx context.Context) { } c.viper.OnConfigChange(func(in fsnotify.Event) { + // Skip processing if watcher has been stopped + c.reloadMu.Lock() + if c.watcherStopped { + c.reloadMu.Unlock() + return + } + c.reloadMu.Unlock() + log.Info().Str("event", in.Op.String()).Str("file", in.Name).Msg("Config file changed, reloading hot-reloadable fields") c.reloadMu.Lock() defer c.reloadMu.Unlock() @@ -645,6 +694,30 @@ func (c *Config) WatchAndApply(ctx context.Context) { // Apply hot-reloadable fields. Order matters: logging first so the // rest of the reload is logged at the right level. c.SetupLogging() + + // Check if sampler config changed and invoke callback if registered + samplerChanged := c.prevSamplerType != c.Telemetry.Sampler.Type || + c.prevSamplerRatio != c.Telemetry.Sampler.Ratio + if samplerChanged && c.samplerReconfigureCallback != nil { + if err := c.samplerReconfigureCallback(context.Background(), + c.Telemetry.Sampler.Type, + c.Telemetry.Sampler.Ratio); err != nil { + log.Error().Err(err).Msg("Hot-reload: sampler reconfigure callback failed") + } else { + // Update previous values only after successful callback + c.prevSamplerType = c.Telemetry.Sampler.Type + c.prevSamplerRatio = c.Telemetry.Sampler.Ratio + log.Info(). + Str("sampler_type", c.prevSamplerType). + Float64("sampler_ratio", c.prevSamplerRatio). + Msg("Hot-reload applied: telemetry sampler reconfigured") + } + } else if samplerChanged { + // No callback registered, just update tracking values + c.prevSamplerType = c.Telemetry.Sampler.Type + c.prevSamplerRatio = c.Telemetry.Sampler.Ratio + } + log.Info(). Str("logging_level", c.GetLogLevel()). Dur("jwt_ttl", c.GetJWTTTL()). @@ -654,11 +727,14 @@ func (c *Config) WatchAndApply(ctx context.Context) { log.Info().Str("file", c.viper.ConfigFileUsed()).Msg("Config hot-reload watcher started (ADR-0023 Phase 1)") - // Stop the watcher on context cancel — viper has no public Stop method, - // so we just clear the callback to make further events no-ops. + // Stop the watcher on context cancel — we set a flag that the + // OnConfigChange handler checks, avoiding the race with viper's + // internal state that would occur if we called OnConfigChange again. go func() { <-ctx.Done() - c.viper.OnConfigChange(func(_ fsnotify.Event) {}) + c.reloadMu.Lock() + c.watcherStopped = true + c.reloadMu.Unlock() log.Info().Msg("Config hot-reload watcher stopped") }() } diff --git a/pkg/config/config_hot_reload_test.go b/pkg/config/config_hot_reload_test.go index 40da4b9..fca0137 100644 --- a/pkg/config/config_hot_reload_test.go +++ b/pkg/config/config_hot_reload_test.go @@ -2,8 +2,10 @@ package config import ( "context" + "errors" "os" "path/filepath" + "sync" "testing" "time" @@ -114,3 +116,236 @@ func TestWatchAndApply_JWTTTL(t *testing.T) { defer c.reloadMu.RUnlock() t.Fatalf("auth.jwt.ttl did not hot-reload to 30m: still %s", c.GetJWTTTL()) } + +// TestWatchAndApply_TelemetrySamplerType proves Phase 3 of ADR-0023: +// when telemetry.sampler.type changes, the callback registered via +// SetSamplerReconfigureCallback is invoked exactly once with the new value. +func TestWatchAndApply_TelemetrySamplerType(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + initial := []byte(`telemetry: + sampler: + type: parentbased_always_on + ratio: 1.0 +`) + changed := []byte(`telemetry: + sampler: + type: traceidratio + ratio: 1.0 +`) + require.NoError(t, os.WriteFile(path, initial, 0644)) + + c := loadFromFile(t, path) + assert.Equal(t, "parentbased_always_on", c.GetSamplerType()) + + // Setup callback tracker + var mu sync.Mutex + callbackCalled := false + var recordedType string + var recordedRatio float64 + c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error { + mu.Lock() + defer mu.Unlock() + callbackCalled = true + recordedType = samplerType + recordedRatio = samplerRatio + return nil + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + c.WatchAndApply(ctx) + + // Mutate the file + require.NoError(t, os.WriteFile(path, changed, 0644)) + + // Poll for up to 2s waiting for callback + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + mu.Lock() + if callbackCalled { + mu.Unlock() + assert.Equal(t, "traceidratio", recordedType) + assert.Equal(t, 1.0, recordedRatio) + return + } + mu.Unlock() + time.Sleep(20 * time.Millisecond) + } + mu.Lock() + defer mu.Unlock() + t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled) +} + +// TestWatchAndApply_TelemetrySamplerRatio proves Phase 3 of ADR-0023: +// when telemetry.sampler.ratio changes, the callback registered via +// SetSamplerReconfigureCallback is invoked exactly once with the new value. +func TestWatchAndApply_TelemetrySamplerRatio(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + initial := []byte(`telemetry: + sampler: + type: parentbased_always_on + ratio: 1.0 +`) + changed := []byte(`telemetry: + sampler: + type: parentbased_always_on + ratio: 0.5 +`) + require.NoError(t, os.WriteFile(path, initial, 0644)) + + c := loadFromFile(t, path) + assert.Equal(t, 1.0, c.GetSamplerRatio()) + + // Setup callback tracker + var mu sync.Mutex + callbackCalled := false + var recordedType string + var recordedRatio float64 + c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error { + mu.Lock() + defer mu.Unlock() + callbackCalled = true + recordedType = samplerType + recordedRatio = samplerRatio + return nil + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + c.WatchAndApply(ctx) + + // Mutate the file + require.NoError(t, os.WriteFile(path, changed, 0644)) + + // Poll for up to 2s waiting for callback + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + mu.Lock() + if callbackCalled { + mu.Unlock() + assert.Equal(t, "parentbased_always_on", recordedType) + assert.Equal(t, 0.5, recordedRatio) + return + } + mu.Unlock() + time.Sleep(20 * time.Millisecond) + } + mu.Lock() + defer mu.Unlock() + t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled) +} + +// TestWatchAndApply_SamplerCallbackNotCalledWhenNoChange proves that +// the sampler callback is NOT invoked when the config file changes but +// sampler type and ratio remain the same. +func TestWatchAndApply_SamplerCallbackNotCalledWhenNoChange(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + initial := []byte(`telemetry: + sampler: + type: parentbased_always_on + ratio: 1.0 +logging: + level: info +`) + changed := []byte(`telemetry: + sampler: + type: parentbased_always_on + ratio: 1.0 +logging: + level: debug +`) + require.NoError(t, os.WriteFile(path, initial, 0644)) + + c := loadFromFile(t, path) + + // Setup callback tracker + var mu sync.Mutex + callbackCalled := false + c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error { + mu.Lock() + defer mu.Unlock() + callbackCalled = true + return nil + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + c.WatchAndApply(ctx) + + // Mutate the file (logging level changes, but sampler stays the same) + require.NoError(t, os.WriteFile(path, changed, 0644)) + + // Poll for up to 2s - callback should NOT be called + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + mu.Lock() + wasCalled := callbackCalled + mu.Unlock() + if wasCalled { + t.Fatalf("sampler reconfigure callback was invoked but sampler did not change") + } + time.Sleep(20 * time.Millisecond) + } +} + +// TestWatchAndApply_SamplerCallbackErrorHandling proves that when the +// sampler reconfigure callback returns an error, the previous sampler values +// are NOT updated, allowing retry on next config change. +func TestWatchAndApply_SamplerCallbackErrorHandling(t *testing.T) { + dir := t.TempDir() + path := filepath.Join(dir, "config.yaml") + initial := []byte(`telemetry: + sampler: + type: parentbased_always_on + ratio: 1.0 +`) + changed := []byte(`telemetry: + sampler: + type: traceidratio + ratio: 0.5 +`) + require.NoError(t, os.WriteFile(path, initial, 0644)) + + c := loadFromFile(t, path) + + // Setup callback that returns an error + expectedErr := errors.New("reconfigure failed") + var mu sync.Mutex + callbackCalled := false + c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error { + mu.Lock() + defer mu.Unlock() + callbackCalled = true + return expectedErr + }) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + c.WatchAndApply(ctx) + + // Mutate the file + require.NoError(t, os.WriteFile(path, changed, 0644)) + + // Poll for up to 2s waiting for callback error + deadline := time.Now().Add(2 * time.Second) + for time.Now().Before(deadline) { + mu.Lock() + if callbackCalled { + mu.Unlock() + // Verify previous values were NOT updated (so retry can work) + c.reloadMu.RLock() + assert.Equal(t, "parentbased_always_on", c.prevSamplerType) + assert.Equal(t, 1.0, c.prevSamplerRatio) + c.reloadMu.RUnlock() + return + } + mu.Unlock() + time.Sleep(20 * time.Millisecond) + } + mu.Lock() + defer mu.Unlock() + t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled) +} -- 2.49.1