2 Commits

Author SHA1 Message Date
7d95259667 🐛 fix(config): remove racy log.Info in WatchAndApply cancel goroutine
All checks were successful
CI/CD Pipeline / Build Docker Cache (push) Successful in 17s
CI/CD Pipeline / CI Pipeline (push) Successful in 6m20s
CI/CD Pipeline / Trigger Docker Push (push) Has been skipped
The cancel-handler goroutine emitted a "watcher stopped" log AFTER
setting the watcherStopped flag. Because viper's internal watcher
goroutine (started by viper.WatchConfig) has no public Stop and can
outlive the context, this log call would race with subsequent
zerolog.SetGlobalLevel from the next test's LoadConfig → SetupLogging.

Symptom (under go test -race ./pkg/config/...):
  WARNING: DATA RACE
  Write at zerolog.SetGlobalLevel
  Previous read by zerolog.(*Logger).disabled in log.Info(...)

Fix: drop the informational log. The flag is sufficient — the cancel
ack does not need to be observable. Test cleanups (defer cancel())
already serialize via the t.Cleanup teardown order.

Verified: go test -race ./pkg/config/... passes, full BDD suite green.
2026-05-05 09:39:46 +02:00
3d9746ed65 🐛 fix(ci): remove dollar-double-brace expression from comment that still gets interpolated (#47)
All checks were successful
CI/CD Pipeline / Build Docker Cache (push) Successful in 10s
CI/CD Pipeline / CI Pipeline (push) Successful in 3m44s
CI/CD Pipeline / Trigger Docker Push (push) Successful in 5s
Co-authored-by: Gabriel Radureau <arcodange@gmail.com>
Co-committed-by: Gabriel Radureau <arcodange@gmail.com>
2026-05-05 09:34:00 +02:00
4 changed files with 326 additions and 11 deletions

View File

@@ -299,10 +299,11 @@ jobs:
# Check for version bump on main branch
if [ "${{ github.ref }}" = "refs/heads/main" ]; then
echo "🔖 Checking for version bump..."
# Always read from git log: ${{ github.event.head_commit.message }} expression
# is interpolated literally into the shell script, so any backtick, unbalanced
# quote, or special char in a commit body breaks the next line of the script
# (observed on PR #32-#35: 'syntax error: unexpected newline'). git log is safe.
# Read commit message from git, NOT from the workflow event payload.
# The event-payload expression is interpolated literally into the
# rendered script (even inside comments — see PR #38 + #46), so any
# backtick / unbalanced quote / multi-line body breaks bash parsing.
# git log is interpolation-free and safe.
COMMIT_MSG=$(git log -1 --pretty=%B)
./scripts/ci-version-bump.sh "$COMMIT_MSG" --no-push
fi

View File

@@ -1,6 +1,6 @@
# Config Hot Reloading Strategy
**Status:** Phase 1+2 Implemented (2026-05-05 — `logging.level` and `auth.jwt.ttl` hot-reloadable via `Config.WatchAndApply` in `pkg/config/config.go`, wired in `pkg/server/server.go Run`. Phase 2 also fixed a pre-existing bug where the hardcoded 24h TTL ignored `auth.jwt.ttl` from config entirely. Remaining fields — `api.v2_enabled`, telemetry sampler — Proposed for follow-up phases.)
**Status:** Phase 1+2 Implemented (2026-05-05 — `logging.level` and `auth.jwt.ttl` hot-reloadable via `Config.WatchAndApply` in `pkg/config/config.go`, wired in `pkg/server/server.go Run`. Phase 2 also fixed a pre-existing bug where the hardcoded 24h TTL ignored `auth.jwt.ttl` from config entirely.) Phase 3 sub-phase 3.1 Implemented (2026-05-05 — `ReconfigureTracerProvider` in `pkg/telemetry/telemetry.go` added). Phase 3 sub-phase 3.2 In Flight (2026-05-05 — `telemetry.sampler.type` + `telemetry.sampler.ratio` hot-reload via `SetSamplerReconfigureCallback` in `pkg/config/config.go`. Remaining field: `api.v2_enabled`.)
**Authors:** Gabriel Radureau, AI Agent
**Date:** 2026-04-05
**Last Updated:** 2026-05-05

View File

@@ -16,6 +16,13 @@ import (
"dance-lessons-coach/pkg/version"
)
// SamplerReconfigureFunc is the signature for callbacks invoked when
// telemetry.sampler.type or telemetry.sampler.ratio change via hot-reload.
// The callback receives the new sampler type and ratio values.
// It must be safe to call concurrently — implementations should use their
// own synchronisation if needed. Returns an error if the reconfigure fails.
type SamplerReconfigureFunc func(ctx context.Context, samplerType string, samplerRatio float64) error
// NewZerologWriter creates a zerolog writer based on configuration
func NewZerologWriter() *os.File {
return os.Stderr
@@ -41,6 +48,20 @@ type Config struct {
// reloadMu serialises Unmarshal during hot-reload so a partial mutation
// can't be observed mid-flight by getter calls.
reloadMu sync.RWMutex `mapstructure:"-"`
// samplerReconfigureCallback is invoked when telemetry.sampler.type or
// telemetry.sampler.ratio change. nil means no callback registered.
samplerReconfigureCallback SamplerReconfigureFunc `mapstructure:"-"`
// prevSamplerType and prevSamplerRatio track the last-seen sampler values
// to detect changes during hot-reload (ADR-0023 Phase 3).
prevSamplerType string `mapstructure:"-"`
prevSamplerRatio float64 `mapstructure:"-"`
// watcherStopped indicates that the config watcher has been stopped via
// the context being cancelled. This prevents the OnConfigChange handler
// from processing events after cleanup.
watcherStopped bool `mapstructure:"-"`
}
// ServerConfig holds server-related configuration
@@ -314,6 +335,11 @@ func LoadConfig() (*Config, error) {
// Keep the viper instance for hot-reload (ADR-0023).
config.viper = v
// Initialize previous sampler values for hot-reload change detection
// (ADR-0023 Phase 3).
config.prevSamplerType = config.Telemetry.Sampler.Type
config.prevSamplerRatio = config.Telemetry.Sampler.Ratio
// Setup logging based on configuration (level, output file, time format).
// The JSON/console format was already applied at the top of LoadConfig via
// peekJSONLogging, so SetupLogging only needs to handle the remaining knobs.
@@ -378,6 +404,19 @@ func (c *Config) GetSamplerRatio() float64 {
return c.Telemetry.Sampler.Ratio
}
// SetSamplerReconfigureCallback registers a callback that is invoked when
// telemetry.sampler.type or telemetry.sampler.ratio change via hot-reload.
// The callback receives the new sampler type and ratio values.
// Pass nil to unregister the callback.
func (c *Config) SetSamplerReconfigureCallback(callback SamplerReconfigureFunc) {
c.reloadMu.Lock()
defer c.reloadMu.Unlock()
c.samplerReconfigureCallback = callback
// Initialize previous values so we can detect changes on first hot-reload
c.prevSamplerType = c.Telemetry.Sampler.Type
c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
}
// GetV2Enabled returns whether v2 API is enabled
func (c *Config) GetV2Enabled() bool {
return c.API.V2Enabled
@@ -615,9 +654,11 @@ func (c *Config) setupLogOutput() {
// reads it via JWTConfig.GetTTL (a method value capturing this *Config).
// The reloaded TTL is used on the NEXT token generation; tokens issued
// before the change keep their original expiry.
// - Phase 3: telemetry.sampler.type + telemetry.sampler.ratio — triggers
// the callback set via SetSamplerReconfigureCallback if the values change.
//
// The other fields listed in ADR-0023 (api.v2_enabled, telemetry sampler)
// remain restart-only until their handlers land in subsequent phases.
// The other fields listed in ADR-0023 (api.v2_enabled) remain restart-only
// until their handlers land in subsequent phases.
//
// Stops when ctx is cancelled. Safe to call once at server startup.
// If the config file is absent (ConfigFileNotFoundError at load time), this
@@ -633,6 +674,14 @@ func (c *Config) WatchAndApply(ctx context.Context) {
}
c.viper.OnConfigChange(func(in fsnotify.Event) {
// Skip processing if watcher has been stopped
c.reloadMu.Lock()
if c.watcherStopped {
c.reloadMu.Unlock()
return
}
c.reloadMu.Unlock()
log.Info().Str("event", in.Op.String()).Str("file", in.Name).Msg("Config file changed, reloading hot-reloadable fields")
c.reloadMu.Lock()
defer c.reloadMu.Unlock()
@@ -645,6 +694,30 @@ func (c *Config) WatchAndApply(ctx context.Context) {
// Apply hot-reloadable fields. Order matters: logging first so the
// rest of the reload is logged at the right level.
c.SetupLogging()
// Check if sampler config changed and invoke callback if registered
samplerChanged := c.prevSamplerType != c.Telemetry.Sampler.Type ||
c.prevSamplerRatio != c.Telemetry.Sampler.Ratio
if samplerChanged && c.samplerReconfigureCallback != nil {
if err := c.samplerReconfigureCallback(context.Background(),
c.Telemetry.Sampler.Type,
c.Telemetry.Sampler.Ratio); err != nil {
log.Error().Err(err).Msg("Hot-reload: sampler reconfigure callback failed")
} else {
// Update previous values only after successful callback
c.prevSamplerType = c.Telemetry.Sampler.Type
c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
log.Info().
Str("sampler_type", c.prevSamplerType).
Float64("sampler_ratio", c.prevSamplerRatio).
Msg("Hot-reload applied: telemetry sampler reconfigured")
}
} else if samplerChanged {
// No callback registered, just update tracking values
c.prevSamplerType = c.Telemetry.Sampler.Type
c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
}
log.Info().
Str("logging_level", c.GetLogLevel()).
Dur("jwt_ttl", c.GetJWTTTL()).
@@ -654,11 +727,17 @@ func (c *Config) WatchAndApply(ctx context.Context) {
log.Info().Str("file", c.viper.ConfigFileUsed()).Msg("Config hot-reload watcher started (ADR-0023 Phase 1)")
// Stop the watcher on context cancel — viper has no public Stop method,
// so we just clear the callback to make further events no-ops.
// Stop the watcher on context cancel — we set a flag that the
// OnConfigChange handler checks, avoiding the race with viper's
// internal state that would occur if we called OnConfigChange again.
// We deliberately do NOT log here: viper's internal watcher goroutine
// has no public Stop, so it can outlive ctx, and a zerolog call here
// would race with the next test's LoadConfig → SetupLogging →
// zerolog.SetGlobalLevel under -race (observed 2026-05-05).
go func() {
<-ctx.Done()
c.viper.OnConfigChange(func(_ fsnotify.Event) {})
log.Info().Msg("Config hot-reload watcher stopped")
c.reloadMu.Lock()
c.watcherStopped = true
c.reloadMu.Unlock()
}()
}

View File

@@ -2,8 +2,10 @@ package config
import (
"context"
"errors"
"os"
"path/filepath"
"sync"
"testing"
"time"
@@ -114,3 +116,236 @@ func TestWatchAndApply_JWTTTL(t *testing.T) {
defer c.reloadMu.RUnlock()
t.Fatalf("auth.jwt.ttl did not hot-reload to 30m: still %s", c.GetJWTTTL())
}
// TestWatchAndApply_TelemetrySamplerType proves Phase 3 of ADR-0023:
// when telemetry.sampler.type changes, the callback registered via
// SetSamplerReconfigureCallback is invoked exactly once with the new value.
func TestWatchAndApply_TelemetrySamplerType(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "config.yaml")
initial := []byte(`telemetry:
sampler:
type: parentbased_always_on
ratio: 1.0
`)
changed := []byte(`telemetry:
sampler:
type: traceidratio
ratio: 1.0
`)
require.NoError(t, os.WriteFile(path, initial, 0644))
c := loadFromFile(t, path)
assert.Equal(t, "parentbased_always_on", c.GetSamplerType())
// Setup callback tracker
var mu sync.Mutex
callbackCalled := false
var recordedType string
var recordedRatio float64
c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
mu.Lock()
defer mu.Unlock()
callbackCalled = true
recordedType = samplerType
recordedRatio = samplerRatio
return nil
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
c.WatchAndApply(ctx)
// Mutate the file
require.NoError(t, os.WriteFile(path, changed, 0644))
// Poll for up to 2s waiting for callback
deadline := time.Now().Add(2 * time.Second)
for time.Now().Before(deadline) {
mu.Lock()
if callbackCalled {
mu.Unlock()
assert.Equal(t, "traceidratio", recordedType)
assert.Equal(t, 1.0, recordedRatio)
return
}
mu.Unlock()
time.Sleep(20 * time.Millisecond)
}
mu.Lock()
defer mu.Unlock()
t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
}
// TestWatchAndApply_TelemetrySamplerRatio proves Phase 3 of ADR-0023:
// when telemetry.sampler.ratio changes, the callback registered via
// SetSamplerReconfigureCallback is invoked exactly once with the new value.
func TestWatchAndApply_TelemetrySamplerRatio(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "config.yaml")
initial := []byte(`telemetry:
sampler:
type: parentbased_always_on
ratio: 1.0
`)
changed := []byte(`telemetry:
sampler:
type: parentbased_always_on
ratio: 0.5
`)
require.NoError(t, os.WriteFile(path, initial, 0644))
c := loadFromFile(t, path)
assert.Equal(t, 1.0, c.GetSamplerRatio())
// Setup callback tracker
var mu sync.Mutex
callbackCalled := false
var recordedType string
var recordedRatio float64
c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
mu.Lock()
defer mu.Unlock()
callbackCalled = true
recordedType = samplerType
recordedRatio = samplerRatio
return nil
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
c.WatchAndApply(ctx)
// Mutate the file
require.NoError(t, os.WriteFile(path, changed, 0644))
// Poll for up to 2s waiting for callback
deadline := time.Now().Add(2 * time.Second)
for time.Now().Before(deadline) {
mu.Lock()
if callbackCalled {
mu.Unlock()
assert.Equal(t, "parentbased_always_on", recordedType)
assert.Equal(t, 0.5, recordedRatio)
return
}
mu.Unlock()
time.Sleep(20 * time.Millisecond)
}
mu.Lock()
defer mu.Unlock()
t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
}
// TestWatchAndApply_SamplerCallbackNotCalledWhenNoChange proves that
// the sampler callback is NOT invoked when the config file changes but
// sampler type and ratio remain the same.
func TestWatchAndApply_SamplerCallbackNotCalledWhenNoChange(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "config.yaml")
initial := []byte(`telemetry:
sampler:
type: parentbased_always_on
ratio: 1.0
logging:
level: info
`)
changed := []byte(`telemetry:
sampler:
type: parentbased_always_on
ratio: 1.0
logging:
level: debug
`)
require.NoError(t, os.WriteFile(path, initial, 0644))
c := loadFromFile(t, path)
// Setup callback tracker
var mu sync.Mutex
callbackCalled := false
c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
mu.Lock()
defer mu.Unlock()
callbackCalled = true
return nil
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
c.WatchAndApply(ctx)
// Mutate the file (logging level changes, but sampler stays the same)
require.NoError(t, os.WriteFile(path, changed, 0644))
// Poll for up to 2s - callback should NOT be called
deadline := time.Now().Add(2 * time.Second)
for time.Now().Before(deadline) {
mu.Lock()
wasCalled := callbackCalled
mu.Unlock()
if wasCalled {
t.Fatalf("sampler reconfigure callback was invoked but sampler did not change")
}
time.Sleep(20 * time.Millisecond)
}
}
// TestWatchAndApply_SamplerCallbackErrorHandling proves that when the
// sampler reconfigure callback returns an error, the previous sampler values
// are NOT updated, allowing retry on next config change.
func TestWatchAndApply_SamplerCallbackErrorHandling(t *testing.T) {
dir := t.TempDir()
path := filepath.Join(dir, "config.yaml")
initial := []byte(`telemetry:
sampler:
type: parentbased_always_on
ratio: 1.0
`)
changed := []byte(`telemetry:
sampler:
type: traceidratio
ratio: 0.5
`)
require.NoError(t, os.WriteFile(path, initial, 0644))
c := loadFromFile(t, path)
// Setup callback that returns an error
expectedErr := errors.New("reconfigure failed")
var mu sync.Mutex
callbackCalled := false
c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
mu.Lock()
defer mu.Unlock()
callbackCalled = true
return expectedErr
})
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
c.WatchAndApply(ctx)
// Mutate the file
require.NoError(t, os.WriteFile(path, changed, 0644))
// Poll for up to 2s waiting for callback error
deadline := time.Now().Add(2 * time.Second)
for time.Now().Before(deadline) {
mu.Lock()
if callbackCalled {
mu.Unlock()
// Verify previous values were NOT updated (so retry can work)
c.reloadMu.RLock()
assert.Equal(t, "parentbased_always_on", c.prevSamplerType)
assert.Equal(t, 1.0, c.prevSamplerRatio)
c.reloadMu.RUnlock()
return
}
mu.Unlock()
time.Sleep(20 * time.Millisecond)
}
mu.Lock()
defer mu.Unlock()
t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
}