Compare commits
1 Commits
fix/should
...
feat/adr-0
| Author | SHA1 | Date | |
|---|---|---|---|
| 2eb69f2709 |
@@ -1,6 +1,6 @@
|
||||
# Config Hot Reloading Strategy
|
||||
|
||||
**Status:** Phase 1+2 Implemented (2026-05-05 — `logging.level` and `auth.jwt.ttl` hot-reloadable via `Config.WatchAndApply` in `pkg/config/config.go`, wired in `pkg/server/server.go Run`. Phase 2 also fixed a pre-existing bug where the hardcoded 24h TTL ignored `auth.jwt.ttl` from config entirely. Remaining fields — `api.v2_enabled`, telemetry sampler — Proposed for follow-up phases.)
|
||||
**Status:** Phase 1+2 Implemented (2026-05-05 — `logging.level` and `auth.jwt.ttl` hot-reloadable via `Config.WatchAndApply` in `pkg/config/config.go`, wired in `pkg/server/server.go Run`. Phase 2 also fixed a pre-existing bug where the hardcoded 24h TTL ignored `auth.jwt.ttl` from config entirely.) Phase 3 sub-phase 3.1 Implemented (2026-05-05 — `ReconfigureTracerProvider` in `pkg/telemetry/telemetry.go` added). Phase 3 sub-phase 3.2 In Flight (2026-05-05 — `telemetry.sampler.type` + `telemetry.sampler.ratio` hot-reload via `SetSamplerReconfigureCallback` in `pkg/config/config.go`. Remaining field: `api.v2_enabled`.)
|
||||
**Authors:** Gabriel Radureau, AI Agent
|
||||
**Date:** 2026-04-05
|
||||
**Last Updated:** 2026-05-05
|
||||
|
||||
@@ -16,6 +16,13 @@ import (
|
||||
"dance-lessons-coach/pkg/version"
|
||||
)
|
||||
|
||||
// SamplerReconfigureFunc is the signature for callbacks invoked when
|
||||
// telemetry.sampler.type or telemetry.sampler.ratio change via hot-reload.
|
||||
// The callback receives the new sampler type and ratio values.
|
||||
// It must be safe to call concurrently — implementations should use their
|
||||
// own synchronisation if needed. Returns an error if the reconfigure fails.
|
||||
type SamplerReconfigureFunc func(ctx context.Context, samplerType string, samplerRatio float64) error
|
||||
|
||||
// NewZerologWriter creates a zerolog writer based on configuration
|
||||
func NewZerologWriter() *os.File {
|
||||
return os.Stderr
|
||||
@@ -41,6 +48,20 @@ type Config struct {
|
||||
// reloadMu serialises Unmarshal during hot-reload so a partial mutation
|
||||
// can't be observed mid-flight by getter calls.
|
||||
reloadMu sync.RWMutex `mapstructure:"-"`
|
||||
|
||||
// samplerReconfigureCallback is invoked when telemetry.sampler.type or
|
||||
// telemetry.sampler.ratio change. nil means no callback registered.
|
||||
samplerReconfigureCallback SamplerReconfigureFunc `mapstructure:"-"`
|
||||
|
||||
// prevSamplerType and prevSamplerRatio track the last-seen sampler values
|
||||
// to detect changes during hot-reload (ADR-0023 Phase 3).
|
||||
prevSamplerType string `mapstructure:"-"`
|
||||
prevSamplerRatio float64 `mapstructure:"-"`
|
||||
|
||||
// watcherStopped indicates that the config watcher has been stopped via
|
||||
// the context being cancelled. This prevents the OnConfigChange handler
|
||||
// from processing events after cleanup.
|
||||
watcherStopped bool `mapstructure:"-"`
|
||||
}
|
||||
|
||||
// ServerConfig holds server-related configuration
|
||||
@@ -314,6 +335,11 @@ func LoadConfig() (*Config, error) {
|
||||
// Keep the viper instance for hot-reload (ADR-0023).
|
||||
config.viper = v
|
||||
|
||||
// Initialize previous sampler values for hot-reload change detection
|
||||
// (ADR-0023 Phase 3).
|
||||
config.prevSamplerType = config.Telemetry.Sampler.Type
|
||||
config.prevSamplerRatio = config.Telemetry.Sampler.Ratio
|
||||
|
||||
// Setup logging based on configuration (level, output file, time format).
|
||||
// The JSON/console format was already applied at the top of LoadConfig via
|
||||
// peekJSONLogging, so SetupLogging only needs to handle the remaining knobs.
|
||||
@@ -378,6 +404,19 @@ func (c *Config) GetSamplerRatio() float64 {
|
||||
return c.Telemetry.Sampler.Ratio
|
||||
}
|
||||
|
||||
// SetSamplerReconfigureCallback registers a callback that is invoked when
|
||||
// telemetry.sampler.type or telemetry.sampler.ratio change via hot-reload.
|
||||
// The callback receives the new sampler type and ratio values.
|
||||
// Pass nil to unregister the callback.
|
||||
func (c *Config) SetSamplerReconfigureCallback(callback SamplerReconfigureFunc) {
|
||||
c.reloadMu.Lock()
|
||||
defer c.reloadMu.Unlock()
|
||||
c.samplerReconfigureCallback = callback
|
||||
// Initialize previous values so we can detect changes on first hot-reload
|
||||
c.prevSamplerType = c.Telemetry.Sampler.Type
|
||||
c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
|
||||
}
|
||||
|
||||
// GetV2Enabled returns whether v2 API is enabled
|
||||
func (c *Config) GetV2Enabled() bool {
|
||||
return c.API.V2Enabled
|
||||
@@ -615,9 +654,11 @@ func (c *Config) setupLogOutput() {
|
||||
// reads it via JWTConfig.GetTTL (a method value capturing this *Config).
|
||||
// The reloaded TTL is used on the NEXT token generation; tokens issued
|
||||
// before the change keep their original expiry.
|
||||
// - Phase 3: telemetry.sampler.type + telemetry.sampler.ratio — triggers
|
||||
// the callback set via SetSamplerReconfigureCallback if the values change.
|
||||
//
|
||||
// The other fields listed in ADR-0023 (api.v2_enabled, telemetry sampler)
|
||||
// remain restart-only until their handlers land in subsequent phases.
|
||||
// The other fields listed in ADR-0023 (api.v2_enabled) remain restart-only
|
||||
// until their handlers land in subsequent phases.
|
||||
//
|
||||
// Stops when ctx is cancelled. Safe to call once at server startup.
|
||||
// If the config file is absent (ConfigFileNotFoundError at load time), this
|
||||
@@ -633,6 +674,14 @@ func (c *Config) WatchAndApply(ctx context.Context) {
|
||||
}
|
||||
|
||||
c.viper.OnConfigChange(func(in fsnotify.Event) {
|
||||
// Skip processing if watcher has been stopped
|
||||
c.reloadMu.Lock()
|
||||
if c.watcherStopped {
|
||||
c.reloadMu.Unlock()
|
||||
return
|
||||
}
|
||||
c.reloadMu.Unlock()
|
||||
|
||||
log.Info().Str("event", in.Op.String()).Str("file", in.Name).Msg("Config file changed, reloading hot-reloadable fields")
|
||||
c.reloadMu.Lock()
|
||||
defer c.reloadMu.Unlock()
|
||||
@@ -645,6 +694,30 @@ func (c *Config) WatchAndApply(ctx context.Context) {
|
||||
// Apply hot-reloadable fields. Order matters: logging first so the
|
||||
// rest of the reload is logged at the right level.
|
||||
c.SetupLogging()
|
||||
|
||||
// Check if sampler config changed and invoke callback if registered
|
||||
samplerChanged := c.prevSamplerType != c.Telemetry.Sampler.Type ||
|
||||
c.prevSamplerRatio != c.Telemetry.Sampler.Ratio
|
||||
if samplerChanged && c.samplerReconfigureCallback != nil {
|
||||
if err := c.samplerReconfigureCallback(context.Background(),
|
||||
c.Telemetry.Sampler.Type,
|
||||
c.Telemetry.Sampler.Ratio); err != nil {
|
||||
log.Error().Err(err).Msg("Hot-reload: sampler reconfigure callback failed")
|
||||
} else {
|
||||
// Update previous values only after successful callback
|
||||
c.prevSamplerType = c.Telemetry.Sampler.Type
|
||||
c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
|
||||
log.Info().
|
||||
Str("sampler_type", c.prevSamplerType).
|
||||
Float64("sampler_ratio", c.prevSamplerRatio).
|
||||
Msg("Hot-reload applied: telemetry sampler reconfigured")
|
||||
}
|
||||
} else if samplerChanged {
|
||||
// No callback registered, just update tracking values
|
||||
c.prevSamplerType = c.Telemetry.Sampler.Type
|
||||
c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
|
||||
}
|
||||
|
||||
log.Info().
|
||||
Str("logging_level", c.GetLogLevel()).
|
||||
Dur("jwt_ttl", c.GetJWTTTL()).
|
||||
@@ -654,11 +727,14 @@ func (c *Config) WatchAndApply(ctx context.Context) {
|
||||
|
||||
log.Info().Str("file", c.viper.ConfigFileUsed()).Msg("Config hot-reload watcher started (ADR-0023 Phase 1)")
|
||||
|
||||
// Stop the watcher on context cancel — viper has no public Stop method,
|
||||
// so we just clear the callback to make further events no-ops.
|
||||
// Stop the watcher on context cancel — we set a flag that the
|
||||
// OnConfigChange handler checks, avoiding the race with viper's
|
||||
// internal state that would occur if we called OnConfigChange again.
|
||||
go func() {
|
||||
<-ctx.Done()
|
||||
c.viper.OnConfigChange(func(_ fsnotify.Event) {})
|
||||
c.reloadMu.Lock()
|
||||
c.watcherStopped = true
|
||||
c.reloadMu.Unlock()
|
||||
log.Info().Msg("Config hot-reload watcher stopped")
|
||||
}()
|
||||
}
|
||||
|
||||
@@ -2,8 +2,10 @@ package config
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -114,3 +116,236 @@ func TestWatchAndApply_JWTTTL(t *testing.T) {
|
||||
defer c.reloadMu.RUnlock()
|
||||
t.Fatalf("auth.jwt.ttl did not hot-reload to 30m: still %s", c.GetJWTTTL())
|
||||
}
|
||||
|
||||
// TestWatchAndApply_TelemetrySamplerType proves Phase 3 of ADR-0023:
|
||||
// when telemetry.sampler.type changes, the callback registered via
|
||||
// SetSamplerReconfigureCallback is invoked exactly once with the new value.
|
||||
func TestWatchAndApply_TelemetrySamplerType(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "config.yaml")
|
||||
initial := []byte(`telemetry:
|
||||
sampler:
|
||||
type: parentbased_always_on
|
||||
ratio: 1.0
|
||||
`)
|
||||
changed := []byte(`telemetry:
|
||||
sampler:
|
||||
type: traceidratio
|
||||
ratio: 1.0
|
||||
`)
|
||||
require.NoError(t, os.WriteFile(path, initial, 0644))
|
||||
|
||||
c := loadFromFile(t, path)
|
||||
assert.Equal(t, "parentbased_always_on", c.GetSamplerType())
|
||||
|
||||
// Setup callback tracker
|
||||
var mu sync.Mutex
|
||||
callbackCalled := false
|
||||
var recordedType string
|
||||
var recordedRatio float64
|
||||
c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callbackCalled = true
|
||||
recordedType = samplerType
|
||||
recordedRatio = samplerRatio
|
||||
return nil
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
c.WatchAndApply(ctx)
|
||||
|
||||
// Mutate the file
|
||||
require.NoError(t, os.WriteFile(path, changed, 0644))
|
||||
|
||||
// Poll for up to 2s waiting for callback
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
if callbackCalled {
|
||||
mu.Unlock()
|
||||
assert.Equal(t, "traceidratio", recordedType)
|
||||
assert.Equal(t, 1.0, recordedRatio)
|
||||
return
|
||||
}
|
||||
mu.Unlock()
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
|
||||
}
|
||||
|
||||
// TestWatchAndApply_TelemetrySamplerRatio proves Phase 3 of ADR-0023:
|
||||
// when telemetry.sampler.ratio changes, the callback registered via
|
||||
// SetSamplerReconfigureCallback is invoked exactly once with the new value.
|
||||
func TestWatchAndApply_TelemetrySamplerRatio(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "config.yaml")
|
||||
initial := []byte(`telemetry:
|
||||
sampler:
|
||||
type: parentbased_always_on
|
||||
ratio: 1.0
|
||||
`)
|
||||
changed := []byte(`telemetry:
|
||||
sampler:
|
||||
type: parentbased_always_on
|
||||
ratio: 0.5
|
||||
`)
|
||||
require.NoError(t, os.WriteFile(path, initial, 0644))
|
||||
|
||||
c := loadFromFile(t, path)
|
||||
assert.Equal(t, 1.0, c.GetSamplerRatio())
|
||||
|
||||
// Setup callback tracker
|
||||
var mu sync.Mutex
|
||||
callbackCalled := false
|
||||
var recordedType string
|
||||
var recordedRatio float64
|
||||
c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callbackCalled = true
|
||||
recordedType = samplerType
|
||||
recordedRatio = samplerRatio
|
||||
return nil
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
c.WatchAndApply(ctx)
|
||||
|
||||
// Mutate the file
|
||||
require.NoError(t, os.WriteFile(path, changed, 0644))
|
||||
|
||||
// Poll for up to 2s waiting for callback
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
if callbackCalled {
|
||||
mu.Unlock()
|
||||
assert.Equal(t, "parentbased_always_on", recordedType)
|
||||
assert.Equal(t, 0.5, recordedRatio)
|
||||
return
|
||||
}
|
||||
mu.Unlock()
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
|
||||
}
|
||||
|
||||
// TestWatchAndApply_SamplerCallbackNotCalledWhenNoChange proves that
|
||||
// the sampler callback is NOT invoked when the config file changes but
|
||||
// sampler type and ratio remain the same.
|
||||
func TestWatchAndApply_SamplerCallbackNotCalledWhenNoChange(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "config.yaml")
|
||||
initial := []byte(`telemetry:
|
||||
sampler:
|
||||
type: parentbased_always_on
|
||||
ratio: 1.0
|
||||
logging:
|
||||
level: info
|
||||
`)
|
||||
changed := []byte(`telemetry:
|
||||
sampler:
|
||||
type: parentbased_always_on
|
||||
ratio: 1.0
|
||||
logging:
|
||||
level: debug
|
||||
`)
|
||||
require.NoError(t, os.WriteFile(path, initial, 0644))
|
||||
|
||||
c := loadFromFile(t, path)
|
||||
|
||||
// Setup callback tracker
|
||||
var mu sync.Mutex
|
||||
callbackCalled := false
|
||||
c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callbackCalled = true
|
||||
return nil
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
c.WatchAndApply(ctx)
|
||||
|
||||
// Mutate the file (logging level changes, but sampler stays the same)
|
||||
require.NoError(t, os.WriteFile(path, changed, 0644))
|
||||
|
||||
// Poll for up to 2s - callback should NOT be called
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
wasCalled := callbackCalled
|
||||
mu.Unlock()
|
||||
if wasCalled {
|
||||
t.Fatalf("sampler reconfigure callback was invoked but sampler did not change")
|
||||
}
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
// TestWatchAndApply_SamplerCallbackErrorHandling proves that when the
|
||||
// sampler reconfigure callback returns an error, the previous sampler values
|
||||
// are NOT updated, allowing retry on next config change.
|
||||
func TestWatchAndApply_SamplerCallbackErrorHandling(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
path := filepath.Join(dir, "config.yaml")
|
||||
initial := []byte(`telemetry:
|
||||
sampler:
|
||||
type: parentbased_always_on
|
||||
ratio: 1.0
|
||||
`)
|
||||
changed := []byte(`telemetry:
|
||||
sampler:
|
||||
type: traceidratio
|
||||
ratio: 0.5
|
||||
`)
|
||||
require.NoError(t, os.WriteFile(path, initial, 0644))
|
||||
|
||||
c := loadFromFile(t, path)
|
||||
|
||||
// Setup callback that returns an error
|
||||
expectedErr := errors.New("reconfigure failed")
|
||||
var mu sync.Mutex
|
||||
callbackCalled := false
|
||||
c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
callbackCalled = true
|
||||
return expectedErr
|
||||
})
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
c.WatchAndApply(ctx)
|
||||
|
||||
// Mutate the file
|
||||
require.NoError(t, os.WriteFile(path, changed, 0644))
|
||||
|
||||
// Poll for up to 2s waiting for callback error
|
||||
deadline := time.Now().Add(2 * time.Second)
|
||||
for time.Now().Before(deadline) {
|
||||
mu.Lock()
|
||||
if callbackCalled {
|
||||
mu.Unlock()
|
||||
// Verify previous values were NOT updated (so retry can work)
|
||||
c.reloadMu.RLock()
|
||||
assert.Equal(t, "parentbased_always_on", c.prevSamplerType)
|
||||
assert.Equal(t, 1.0, c.prevSamplerRatio)
|
||||
c.reloadMu.RUnlock()
|
||||
return
|
||||
}
|
||||
mu.Unlock()
|
||||
time.Sleep(20 * time.Millisecond)
|
||||
}
|
||||
mu.Lock()
|
||||
defer mu.Unlock()
|
||||
t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user