feat(config): add sampler hot-reload callback for ADR-0023 Phase 3.2

- Add SamplerReconfigureFunc type and SetSamplerReconfigureCallback method - Track previous sampler type/ratio values to detect changes - Invoke callback when telemetry.sampler.type or ratio changes - Fix race condition in WatchAndApply cleanup using watcherStopped flag - Add unit tests for sampler type/ratio hot-reload scenarios - Update ADR-0023 status to reflect Phase 3.2 in flight Generated by Mistral Vibe. Co-Authored-By: Mistral Vibe <vibe@mistral.ai>
✨ feat(telemetry): ReconfigureTracerProvider for sampler hot-reload (ADR-0023 Phase 3, sub-phase 3.1) (#45 )
2026-05-05 09:32:08 +02:00 · 2026-05-05 09:27:20 +02:00 · 2026-05-05 09:09:22 +02:00 · 2026-05-05 09:04:48 +02:00 · 2026-05-05 08:45:19 +02:00 · 2026-05-05 08:40:27 +02:00
96 changed files with 17772 additions and 1862 deletions
--- a/.gitea/workflows/ci-cd.yaml
+++ b/.gitea/workflows/ci-cd.yaml
@@ -219,6 +219,12 @@ jobs:
          export DLC_DATABASE_PASSWORD=postgres
          export DLC_DATABASE_NAME=dance_lessons_coach_bdd_test
          export DLC_DATABASE_SSL_MODE=disable
+          # T12: per-package isolated Postgres schema with migrations (re-enables what
+          # PR #26 attempted but couldn't deliver because the empty schemas had no tables).
+          # The fix: testserver Start() now builds a per-package isolated repo via
+          # user.NewPostgresRepositoryFromDSN which DOES run AutoMigrate against the new
+          # schema. Packages then run in parallel (~2.85x speedup observed locally).
+          export BDD_SCHEMA_ISOLATION=true
          ./scripts/run-bdd-tests.sh
          
          # Generate BDD coverage report
@@ -293,7 +299,12 @@ jobs:
          # Check for version bump on main branch
          if [ "${{ github.ref }}" = "refs/heads/main" ]; then
            echo "🔖 Checking for version bump..."
-            ./scripts/ci-version-bump.sh "${{ github.event.head_commit.message }}" --no-push
+            # Always read from git log: ${{ github.event.head_commit.message }} expression
+            # is interpolated literally into the shell script, so any backtick, unbalanced
+            # quote, or special char in a commit body breaks the next line of the script
+            # (observed on PR #32-#35: 'syntax error: unexpected newline'). git log is safe.
+            COMMIT_MSG=$(git log -1 --pretty=%B)
+            ./scripts/ci-version-bump.sh "$COMMIT_MSG" --no-push
          fi
          
          # Single push for all commits (this is the ONLY push in the entire workflow)
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,14 @@ config/runner
 coverage.txt
 trigger.txt
 test_trigger.txt
+
+# Frontend
+frontend/node_modules/
+frontend/.nuxt/
+frontend/.output/
+frontend/dist/
+frontend/.env
+frontend/.cache/
+frontend/storybook-static/
+frontend/test-results/
+frontend/playwright-report/
--- a/.vibe/skills/gitea-client/scripts/gitea-client.sh
+++ b/.vibe/skills/gitea-client/scripts/gitea-client.sh
@@ -203,6 +203,31 @@ cmd_wait_job() {
 }

 # Comment on PR
+# Create a pull request
+cmd_create_pr() {
+    local owner="$1"
+    local repo="$2"
+    local title="$3"
+    local body="$4"
+    local head="$5"
+    local base="${6:-main}"
+
+    if [[ -z "$owner" || -z "$repo" || -z "$title" || -z "$head" ]]; then
+        echo "Usage: $0 create-pr <owner> <repo> <title> <body> <head_branch> [base_branch]" >&2
+        exit 1
+    fi
+
+    local endpoint="/repos/${owner}/${repo}/pulls"
+    local data
+    data=$(jq -n \
+        --arg title "$title" \
+        --arg body "$body" \
+        --arg head "$head" \
+        --arg base "$base" \
+        '{title: $title, body: $body, head: $head, base: $base}')
+    api_request "POST" "$endpoint" "$data"
+}
+
 cmd_comment_pr() {
    local owner="$1"
    local repo="$2"
@@ -215,7 +240,8 @@ cmd_comment_pr() {
    fi
    
    local endpoint="/repos/${owner}/${repo}/issues/${pr_number}/comments"
-    local data="{\"body\": \"${comment}\"}"
+    local data
+    data=$(jq -n --arg body "$comment" '{body: $body}')
    api_request "POST" "$endpoint" "$data"
 }

@@ -250,6 +276,7 @@ main() {
        monitor-workflow) cmd_monitor_workflow "$@" ;;
        diagnose-job) cmd_diagnose_job "$@" ;;
        recent-workflows) cmd_recent_workflows "$@" ;;
+        create-pr) cmd_create_pr "$@" ;;
        comment-pr) cmd_comment_pr "$@" ;;
        pr-status) cmd_pr_status "$@" ;;
        list-issues) cmd_list_issues "$@" ;;
@@ -274,6 +301,7 @@ main() {
            echo "  monitor-workflow <owner> <repo> <workflow_run_id> [interval_seconds]" >&2
            echo "  diagnose-job <owner> <repo> <job_id>" >&2
            echo "  recent-workflows <owner> <repo> [limit] [status_filter]" >&2
+            echo "  create-pr <owner> <repo> <title> <body> <head_branch> [base_branch]" >&2
            echo "  comment-pr <owner> <repo> <pr_number> <comment>" >&2
            echo "  pr-status <owner> <repo> <pr_number>" >&2
            echo "  list-issues <owner> <repo> [state]" >&2
--- a/AGENTS.md
+++ b/AGENTS.md
--- a/README.md
+++ b/README.md
@@ -1,421 +1,101 @@
 # dance-lessons-coach

-[![Build Status](https://gitea.arcodange.fr/arcodange/dance-lessons-coach/actions/workflows/ci-cd.yaml/badge.svg)](https://gitea.arcodange.fr/arcodange/dance-lessons-coach/actions/workflows/ci-cd.yaml/badge.svg)
+[![Build Status](https://gitea.arcodange.fr/arcodange/dance-lessons-coach/actions/workflows/ci-cd.yaml/badge.svg)](https://gitea.arcodange.fr/arcodange/dance-lessons-coach/actions/workflows/ci-cd.yaml)
 [![Go Report Card](https://goreportcard.com/badge/github.com/arcodange/dance-lessons-coach)](https://goreportcard.com/report/github.com/arcodange/dance-lessons-coach)
 [![Version](https://img.shields.io/badge/version-1.4.0-blue.svg)](https://gitea.arcodange.fr/arcodange/dance-lessons-coach/releases)
 [![License](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
 [![BDD Coverage](https://img.shields.io/badge/BDD_Coverage-51.1%%-red?style=flat-square)](https://gitea.arcodange.lab/arcodange/dance-lessons-coach)
 [![UNIT Coverage](https://img.shields.io/badge/UNIT_Coverage-8.9%%-red?style=flat-square)](https://gitea.arcodange.lab/arcodange/dance-lessons-coach)

-A Go project demonstrating idiomatic package structure, CLI implementation, and JSON API with Chi router.
-=======
+Go web service demonstrating idiomatic package structure, versioned JSON API, and production-ready features.

 ## Features

- Greet function with default behavior
- Command-line interface
- JSON API with versioned endpoints
- Chi router integration
- Zerolog for high-performance logging
- Viper for configuration management
- Graceful shutdown with context
- Readiness endpoint for Kubernetes/service mesh integration
- OpenTelemetry integration with Jaeger support
- OpenAPI/Swagger documentation
- Unit tests
- Go 1.26.1 compatible
+- Versioned JSON API (`/api/v1`, `/api/v2`)
+- Chi router with graceful shutdown
+- Zerolog structured logging (console and JSON modes)
+- Viper configuration (file + env vars)
+- Readiness endpoint for Kubernetes / service mesh
+- OpenTelemetry / Jaeger distributed tracing
+- OpenAPI / Swagger UI (embedded in binary)
+- PostgreSQL user service with JWT auth
+- BDD + unit tests

-## Installation
+## Quick Start

 ```bash
-# Clone the repository
 git clone https://gitea.arcodange.lab/arcodange/dance-lessons-coach.git
 cd dance-lessons-coach
-
-# Build all binaries
-./scripts/build.sh
-
-# Use the new Cobra CLI
-./bin/dance-lessons-coach --help
-
-# Or use the legacy greet CLI
-go run ./cmd/greet
+./scripts/build.sh          # produces ./bin/server and ./bin/greet
+./scripts/start-server.sh start
 ```

-## CI/CD Pipeline
-
-dance-lessons-coach features an optimized CI/CD pipeline using GitHub Actions with container/services architecture:
-
-### Key Features
- ✅ **Container-based execution**: All steps run in pre-built Docker cache images
- ✅ **Service-based PostgreSQL**: Automatic database service provisioning
- ✅ **Smart caching**: Dependency-aware cache invalidation
- ✅ **Multi-platform**: Compatible with Gitea, GitHub, and GitLab
- ✅ **Fast execution**: No Docker Compose overhead
- ✅ **Reliable testing**: Full database connectivity with proper environment setup
-
-### Architecture
-
-The pipeline uses GitHub Actions' native `container` and `services` directives instead of Docker Compose:
-
-```yaml
-jobs:
-  ci-pipeline:
-    container:
-      image: gitea.arcodange.lab/arcodange/dance-lessons-coach-build-cache:${{ needs.build-cache.outputs.deps_hash }}
-    
-    services:
-      postgres:
-        image: postgres:15
-        env:
-          POSTGRES_USER: postgres
-          POSTGRES_PASSWORD: postgres
-          POSTGRES_DB: dance_lessons_coach_bdd_test
-```
-
-### Benefits
-
-1. **Performance**: Direct container execution without compose overhead
-2. **Reliability**: Service containers managed by GitHub Actions
-3. **Simplicity**: Cleaner workflow definition
-4. **Portability**: Works across CI platforms
-5. **Caching**: Intelligent dependency-based cache rebuilding
-
-### Workflow Steps
-
-1. **Build Cache**: Creates Docker image with Go tools and dependencies
-2. **CI Pipeline**: Runs tests, builds binaries, and generates documentation
-3. **Database Tests**: Connects to PostgreSQL service container
-4. **Coverage Reporting**: Updates coverage badges automatically
-5. **Artifact Publishing**: Builds and pushes Docker images (main branch only)
-
-### Environment Configuration
-
-The pipeline automatically sets up database environment variables:
-
 ```bash
-echo "DLC_DATABASE_HOST=postgres" >> $GITHUB_ENV
-echo "DLC_DATABASE_PORT=5432" >> $GITHUB_ENV
-echo "DLC_DATABASE_USER=postgres" >> $GITHUB_ENV
-echo "DLC_DATABASE_PASSWORD=postgres" >> $GITHUB_ENV
-echo "DLC_DATABASE_NAME=dance_lessons_coach_bdd_test" >> $GITHUB_ENV
-echo "DLC_DATABASE_SSL_MODE=disable" >> $GITHUB_ENV
+curl http://localhost:8080/api/health
+curl http://localhost:8080/api/v1/greet/Alice
 ```

-### Status
+Stop: `./scripts/start-server.sh stop`

-[![Build Status](https://gitea.arcodange.fr/api/badges/arcodange/dance-lessons-coach/status)](https://gitea.arcodange.fr/arcodange/dance-lessons-coach)
+## Greet CLI

-=======
- ✅ **Linting**: Code quality checks with `go fmt` and `go vet`
- ✅ **Version Management**: Automatic version detection
- ✅ **Portable**: Uses standard GitHub Actions workflow format
-
-### Workflow File
-```yaml
-# .github/workflows/main.yml
-jobs:
-  build-test:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-go@v4
-        with:
-          go-version: '1.26.1'
-      - run: go build ./...
-      - run: go test ./... -cover
-
-  lint-format:
-    runs-on: ubuntu-latest
-    steps:
-      - run: go fmt ./...
-      - run: go vet ./...
+```bash
+go run ./cmd/greet           # Hello world!
+go run ./cmd/greet Alice     # Hello Alice!
 ```

-### Setup Instructions
-1. **Gitea**: Enable GitHub Actions compatibility in repo settings
-2. **GitHub**: Push to mirror repository (workflow runs automatically)
-3. **GitLab**: Convert workflow to `.gitlab-ci.yml` or use compatibility mode
-
-**See [ADR 0016](adr/0016-ci-cd-pipeline-design.md) for complete CI/CD design and [STATUS_BADGES.md](STATUS_BADGES.md) for badge setup.**
-
 ## Configuration

-Basic configuration options:
+All options are available via `config.yaml` or `DLC_*` environment variables.

-```bash
-# Start with default configuration
-./scripts/start-server.sh start
+| Env var | Default | Description |
+|---------|---------|-------------|
+| `DLC_SERVER_PORT` | `8080` | Listening port |
+| `DLC_SERVER_HOST` | `0.0.0.0` | Bind address |
+| `DLC_LOGGING_JSON` | `false` | JSON log format |
+| `DLC_LOGGING_OUTPUT` | stderr | Log file path |
+| `DLC_SHUTDOWN_TIMEOUT` | `30s` | Graceful shutdown window |
+| `DLC_API_V2_ENABLED` | `false` | Enable `/api/v2` routes |
+| `DLC_CONFIG_FILE` | `./config.yaml` | Override config path |

-# Custom port
-export DLC_SERVER_PORT=9090
-./scripts/start-server.sh start
+See `config.example.yaml` for a full template.

-# JSON logging
-export DLC_LOGGING_JSON=true
-./scripts/start-server.sh start
-```
+## API

-**See [AGENTS.md](AGENTS.md#configuration-management) for comprehensive configuration guide including:**
- File-based configuration
- Environment variables
- Configuration priority rules
- OpenTelemetry setup
- Advanced scenarios
-
-## Usage
-
-### New Cobra CLI (Recommended)
-
-```bash
-# Show help
-./bin/dance-lessons-coach --help
-
-# Show version
-./bin/dance-lessons-coach version
-
-# Greet someone
-./bin/dance-lessons-coach greet John
-
-# Start server
-./bin/dance-lessons-coach server
-```
-
-### Legacy CLI (Deprecated)
-
-```bash
-# Default greeting
-go run ./cmd/greet
-# Output: Hello world!
-
-# Custom greeting
-go run ./cmd/greet John
-# Output: Hello John!
-```
-
-### Web Server
-
-**Using the server control script (recommended):**
-
-```bash
-# Start the server
-./scripts/start-server.sh start
-
-# Test API endpoints
-./scripts/start-server.sh test
-
-# Access OpenAPI documentation
-# Swagger UI: http://localhost:8080/swagger/
-# OpenAPI spec: http://localhost:8080/swagger/doc.json
-
-# Stop the server
-./scripts/start-server.sh stop
-```
-
-**Manual server management:**
-
-```bash
-# Start the server
-go run ./cmd/server
-
-# Test API endpoints
-curl http://localhost:8080/api/health
-# Output: {"status":"healthy"}
-
-curl http://localhost:8080/api/ready
-# Output: {"ready":true}
-
-curl http://localhost:8080/api/v1/greet
-# Output: {"message":"Hello world!"}
-
-curl http://localhost:8080/api/v1/greet/John
-# Output: {"message":"Hello John!"}
-```
+| Method | Path | Description |
+|--------|------|-------------|
+| GET | `/api/health` | Liveness check |
+| GET | `/api/ready` | Readiness check (503 during shutdown) |
+| GET | `/api/version` | Version info (`?format=plain\|full\|json`) |
+| GET | `/api/v1/greet/` | Default greeting |
+| GET | `/api/v1/greet/{name}` | Named greeting |
+| POST | `/api/v2/greet` | V2 greeting with validation |
+| GET | `/swagger/` | Swagger UI |

 ## Testing

 ```bash
-# Run all tests
-go test ./...
-
-# Run specific package tests
-go test ./pkg/greet/
+go test ./...                          # unit + integration tests
+./scripts/test-graceful-shutdown.sh    # lifecycle + JSON logging validation
+./scripts/test-opentelemetry.sh        # tracing end-to-end
 ```

-## CI/CD
+## Gitea Client

-dance-lessons-coach includes a comprehensive CI/CD pipeline with multiple testing options:
+AI agent helper script at `.vibe/skills/gitea-client/scripts/gitea-client.sh`.

-### Local Testing (No Gitea Required)
+Auth setup:
 ```bash
-# Validate workflow structure
-./scripts/cicd.sh validate
-
-# Test workflow steps locally
-./scripts/cicd.sh test-simple
+echo "your_token" > ~/.gitea_token
+chmod 600 ~/.gitea_token
+export GITEA_API_TOKEN_FILE="$HOME/.gitea_token"
 ```

-### Gitea Integration
-```bash
-# Test local setup with Gitea configuration
-./scripts/cicd.sh test-local
-
-# Check pipeline status on Gitea
-./scripts/cicd.sh check-status
-```
-
-### Full CI/CD Testing
-```bash
-# Test with docker compose (requires Gitea runner)
-./scripts/cicd.sh test-docker
-```
-
-**See [adr/0016-ci-cd-pipeline-design.md](adr/0016-ci-cd-pipeline-design.md) for complete CI/CD architecture.**
-
-## Project Structure
-
-```
-dance-lessons-coach/
-├── adr/                    # Architecture Decision Records
-├── cmd/                    # Entry points (greet CLI, server)
-├── pkg/                    # Core packages (config, greet, server, telemetry)
-│   └── server/docs/        # Generated OpenAPI documentation (gitignored)
-├── config.yaml             # Configuration file
-├── scripts/                # Management scripts
-└── go.mod                   # Go module definition
-```
-
-**See [AGENTS.md](AGENTS.md#project-structure) for detailed structure and component explanations.**
-```
-
-## Development
-
-### Generate OpenAPI Documentation
-
-The project uses [swaggo/swag](https://github.com/swaggo/swag) to generate OpenAPI/Swagger documentation from code annotations:
-
-```bash
-# Generate documentation
-go generate ./pkg/server/
-
-# This creates:
-# - pkg/server/docs/docs.go (swagger template)
-# - pkg/server/docs/swagger.json (OpenAPI spec)
-# - pkg/server/docs/swagger.yaml (YAML version)
-```
-
-**Note:** `pkg/server/docs/` is gitignored. Documentation is embedded in the binary at build time.
-
-### Documentation Annotations
-
-Add swagger annotations to handlers and models:
-
-```go
-// @Summary Get personalized greeting
-// @Description Returns a greeting with the specified name
-// @Tags greet
-// @Accept json
-// @Produce json
-// @Param name path string true "Name to greet"
-// @Success 200 {object} GreetResponse "Successful response"
-// @Failure 400 {object} ErrorResponse "Invalid name parameter"
-// @Router /v1/greet/{name} [get]
-func (h *apiV1GreetHandler) handleGreetPath(w http.ResponseWriter, r *http.Request) {
-    // handler implementation
-}
-```
+Get a token at https://gitea.arcodange.lab → Profile → Settings → Applications.

 ## Architecture

-This project uses Architecture Decision Records (ADRs) to document key technical choices. See [adr/](adr/) for complete documentation including decisions on Go 1.26.1, Chi router, Zerolog, OpenTelemetry, interface-based design, graceful shutdown, configuration management, testing strategies, and OpenAPI documentation.
-
-**Adding new decisions?** See [adr/README.md](adr/README.md) for guidelines.
-
-## Gitea Integration
-
-dance-lessons-coach includes AI agent skills for Gitea integration to monitor CI/CD jobs and interact with pull requests.
-
-### Gitea Client Skill Setup
-
-The Gitea client skill enables AI agents to:
- Monitor CI/CD job status
- Fetch job logs for debugging
- Comment on pull requests
- Track PR status
-
-**Setup Instructions:**
-
-1. **Create a Personal Access Token:**
-   - Log in to https://gitea.arcodange.lab
-   - Go to Profile → Settings → Applications
-   - Generate token with `read:repository`, `write:repository`, and `read:user` scopes
-
-2. **Configure Authentication:**
-   ```bash
-   # Option 1: Environment variable
-   export GITEA_API_TOKEN="your_token"
-   
-   # Option 2: Token file (recommended)
-   echo "your_token" > ~/.gitea_token
-   chmod 600 ~/.gitea_token
-   export GITEA_API_TOKEN_FILE="$HOME/.gitea_token"
-   ```
-
-3. **Add to shell configuration:**
-   ```bash
-   echo 'export GITEA_API_TOKEN_FILE="$HOME/.gitea_token"' >> ~/.bashrc
-   source ~/.bashrc
-   ```
-
-**Usage Examples:**
-```bash
-# List recent jobs
-.vibe/skills/gitea-client/scripts/gitea-client.sh list-jobs owner repo workflow_id 5
-
-# Wait for job completion
-.vibe/skills/gitea-client/scripts/gitea-client.sh wait-job owner repo job_id 300
-
-# Comment on PR
-.vibe/skills/gitea-client/scripts/gitea-client.sh comment-pr owner repo 42 "Build completed!"
-```
-
-**Documentation:** See [.vibe/skills/gitea-client/README.md](.vibe/skills/gitea-client/README.md) for complete setup and usage guide.
-
-## 🤖 AI Agent Usage
-
-### Quick Launch Commands
-
-**Programmer Agent** (for code implementation, testing, CI/CD):
-```bash
-vibe start --agent dancelessonscoachprogrammer
-```
-
-**Product Owner Agent** (for requirements, interviews, documentation):
-```bash
-vibe start --agent dancelessonscoach-product-owner
-```
-
-### Full Documentation
-
-For complete agent usage guide including:
- Agent selection guidance
- Common workflow examples
- Configuration reference
- Best practices
- Troubleshooting tips
-
-See: [AGENT_USAGE_GUIDE.md](documentation/AGENT_USAGE_GUIDE.md)
-
-### Gitmoji Cheatsheet
-
-Quick reference for commit messages:
- **📝 `:memo:` docs** - Documentation
- **✨ `:sparkles:` feat** - New feature
- **🐛 `:bug:` fix** - Bug fix
- **♻️ `:recycle:` refactor** - Code refactoring
- **🔧 `:wrench:` chore** - Build/config changes
-
-Full cheatsheet: [GITMOJI_CHEATSHEET.md](documentation/GITMOJI_CHEATSHEET.md)
+Key decisions are documented in [adr/](adr/). See [AGENTS.md](AGENTS.md) for the full development reference (commands, config, ADR index, commit conventions).

 ## License

--- a/adr/0001-go-1.26.1-standard.md
+++ b/adr/0001-go-1.26.1-standard.md
@@ -1,8 +1,8 @@
 # Use Go 1.26.1 as the standard Go version

-* Status: Accepted
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-01
+**Status:** Accepted
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-01

 ## Context and Problem Statement

--- a/adr/0002-chi-router.md
+++ b/adr/0002-chi-router.md
@@ -1,8 +1,8 @@
 # Use Chi router for HTTP routing

-* Status: Accepted
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-02
+**Status:** Accepted
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-02

 ## Context and Problem Statement

--- a/adr/0003-zerolog-logging.md
+++ b/adr/0003-zerolog-logging.md
@@ -1,8 +1,8 @@
 # Use Zerolog for structured logging

-* Status: Accepted
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-02
+**Status:** Accepted
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-02

 ## Context and Problem Statement

--- a/adr/0004-interface-based-design.md
+++ b/adr/0004-interface-based-design.md
@@ -1,8 +1,8 @@
 # Adopt interface-based design pattern

-* Status: Accepted
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-02
+**Status:** Accepted
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-02

 ## Context and Problem Statement

--- a/adr/0005-graceful-shutdown.md
+++ b/adr/0005-graceful-shutdown.md
@@ -1,8 +1,8 @@
 # Implement graceful shutdown with readiness endpoints

-* Status: Accepted
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-03
+**Status:** Accepted
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-03

 ## Context and Problem Statement

--- a/adr/0006-configuration-management.md
+++ b/adr/0006-configuration-management.md
@@ -1,8 +1,8 @@
 # Use Viper for configuration management

-* Status: Accepted
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-03
+**Status:** Accepted
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-03

 ## Context and Problem Statement

--- a/adr/0007-opentelemetry-integration.md
+++ b/adr/0007-opentelemetry-integration.md
@@ -1,8 +1,8 @@
 # Integrate OpenTelemetry for distributed tracing

-* Status: Accepted
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-04
+**Status:** Accepted
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-04

 ## Context and Problem Statement

--- a/adr/0008-bdd-testing.md
+++ b/adr/0008-bdd-testing.md
@@ -1,8 +1,8 @@
 # Adopt BDD with Godog for behavioral testing

-* Status: Accepted
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-05
+**Status:** Accepted
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-05

 ## Context and Problem Statement

--- a/adr/0009-hybrid-testing-approach.md
+++ b/adr/0009-hybrid-testing-approach.md
@@ -1,10 +1,9 @@
 # Combine BDD and Swagger-based testing

-* Status: ✅ Partially Implemented (BDD + Documentation only)
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-05
-* Last Updated: 2026-04-05
-* Implementation Status: BDD testing and OpenAPI documentation completed, SDK generation deferred
+**Status:** Implemented (BDD + OpenAPI documentation operational; SDK generation explicitly out of scope — would require a fresh ADR if reopened)
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-05
+**Last Updated:** 2026-05-05

 ## Context and Problem Statement

@@ -36,7 +35,7 @@ Chosen option: "Hybrid approach" because it provides the best combination of beh

 ## Implementation Status

-**Status**: ✅ Partially Implemented (BDD + Documentation only)
+**Status**: ✅ Implemented (BDD + OpenAPI documentation operational; SDK generation explicitly out of scope)

 ### What We Actually Have

@@ -329,7 +328,7 @@ If we need SDK generation in the future:
 - Add SDK-based BDD tests
 - Implement true hybrid testing approach

-**Current Status:** ✅ Partially Implemented (BDD + Documentation)
+**Current Status:** ✅ Implemented (BDD + OpenAPI documentation; SDK generation out of scope)
 **BDD Tests:** http://localhost:8080/api/health (all passing)
 **OpenAPI Docs:** http://localhost:8080/swagger/
 **OpenAPI Spec:** http://localhost:8080/swagger/doc.json
--- a/adr/0013-openapi-swagger-toolchain.md
+++ b/adr/0013-openapi-swagger-toolchain.md
@@ -1,11 +1,10 @@
 # 13. OpenAPI/Swagger Toolchain Selection

 **Date:** 2026-04-05
-**Status:** ✅ Partially Implemented (Documentation only)
+**Status:** Implemented (OpenAPI documentation operational; SDK generation explicitly out of scope, see ADR-0009)
 **Authors:** Arcodange Team
 **Implementation Date:** 2026-04-05
-**Last Updated:** 2026-04-05
-**Status:** OpenAPI documentation operational, SDK generation deferred
+**Last Updated:** 2026-05-05

 ## Context

@@ -983,7 +982,7 @@ If we need SDK generation in the future:
 4. Implement request validation middleware
 5. Migrate to OpenAPI 3.0 if needed

-**Current Status:** ✅ Partially Implemented (Documentation only)
+**Current Status:** ✅ Implemented (OpenAPI documentation; SDK generation out of scope)
 **Implementation:** swaggo/swag with embedded documentation
 **Documentation:** http://localhost:8080/swagger/
 **OpenAPI Spec:** http://localhost:8080/swagger/doc.json
--- a/adr/0015-cli-subcommands-cobra.md
+++ b/adr/0015-cli-subcommands-cobra.md
@@ -1,7 +1,7 @@
 # 15. CLI Subcommands and Flag Management with Cobra

 **Date:** 2026-04-05
-**Status:** ✅ Implemented
+**Status:** Implemented
 **Authors:** Arcodange Team
 **Decision Date:** 2026-04-05
 **Implementation Status:** Phase 1 Complete
@@ -222,7 +222,7 @@ dance-lessons-coach config validate

 ---

-**Status:** Proposed  
+**Status:** Proposed
 **Next Review:** 2026-04-12  
 **Implementation Owner:** Arcodange Team  
 **Approvers Needed:** @gabrielradureau
--- a/adr/0016-ci-cd-pipeline-design.md
+++ b/adr/0016-ci-cd-pipeline-design.md
@@ -1,10 +1,10 @@
 # 16. CI/CD Pipeline Design for Multi-Platform Compatibility

 **Date:** 2026-04-05
-**Status:** ✅ Accepted
+**Status:** Accepted
 **Authors:** Arcodange Team
 **Decision Date:** 2026-04-08
-**Implementation Status:** ✅ Completed
+**Implementation Status:** Completed

 ## Context

@@ -832,7 +832,7 @@ jobs:
 - ✅ **Coverage reporting**: Badges updating automatically
 - ✅ **Binary builds**: Scripts executing properly in container environment

-**Status:** ✅ Accepted   
+**Status:** Accepted
 **Implementation Date:** 2026-04-08   
 **Implementation Owner:** Arcodange Team   
 **Reviewers:** @gabrielradureau
--- a/adr/0017-trunk-based-development-workflow.md
+++ b/adr/0017-trunk-based-development-workflow.md
@@ -1,10 +1,10 @@
 # 17. Trunk-Based Development Workflow for CI/CD Safety

 **Date:** 2026-04-05
-**Status:** 🟢 Approved
+**Status:** Approved
 **Authors:** Arcodange Team
 **Decision Date:** 2026-04-05
-**Implementation Status:** ✅ Implemented
+**Implementation Status:** Implemented

 ## Context

--- a/adr/0018-user-management-auth-system.md
+++ b/adr/0018-user-management-auth-system.md
@@ -1,7 +1,7 @@
 # 18. User Management and Authentication System

-**Date:** 2024-04-06
-**Status:** Proposed
+**Date:** 2026-04-06
+**Status:** Implemented (user model, JWT auth, password-reset workflow, admin endpoints, greet personalization, BDD coverage all live; future enhancements like 2FA / email verification belong in separate ADRs)
 **Authors:** Product Owner
 **Decision Drivers:** Security, User Personalization, Admin Functionality

--- a/adr/0019-postgresql-integration.md
+++ b/adr/0019-postgresql-integration.md
@@ -1,7 +1,7 @@
 # 19. PostgreSQL Database Integration

-**Date:** 2024-04-07
-**Status:** Proposed
+**Date:** 2026-04-07
+**Status:** Implemented (core integration; performance tuning + extended monitoring tracked as future work)
 **Authors:** Product Owner
 **Decision Drivers:** Data Persistence, Scalability, Production Readiness

@@ -359,8 +359,6 @@ The PostgreSQL integration follows established dance-lessons-coach patterns:
 2. **Configuration Updates:** New database configuration structure
 3. **Development Workflow:** Docker-based database for local development

-
-
 ## Alternatives Considered

 ### Alternative 1: Keep SQLite with File Persistence
@@ -673,10 +671,10 @@ func AfterScenario(ctx context.Context, sc *godog.Scenario, err error) (context.
 ## Future Considerations

 ### Immediate Next Steps (Post-Migration)
-1. **CI/CD Integration:** Add PostgreSQL to CI pipeline
-2. **Performance Tuning:** Query optimization
-3. **Monitoring:** Database health metrics
-4. **Backup Strategy:** Regular database backups
+1. **CI/CD Integration:** Add PostgreSQL to CI pipeline — ✅ Implemented (`postgres:15` service in `.gitea/workflows/ci-cd.yaml`, all BDD tests run against real Postgres)
+2. **Performance Tuning:** Query optimization — Deferred. No production hot path identified. Reopen as separate ADR if/when latency budget exceeded.
+3. **Monitoring:** Database health metrics — Partial. `/api/healthz` reports DB connectivity. Deeper metrics (slow query log, pool stats) deferred until ADR-0022 cache Phase 2 lands.
+4. **Backup Strategy:** Regular database backups — Deferred. No production data yet. Will require separate ADR before any production data lands.

 ### Long-Term Enhancements
 1. **Database Sharding:** For horizontal scaling
--- a/adr/0020-docker-build-strategy.md
+++ b/adr/0020-docker-build-strategy.md
@@ -1,7 +1,6 @@
 # ADR 0020: Docker Build Strategy - Traditional vs Buildx

-## Status
-**Accepted** ✅
+**Status:** Accepted

 ## Context

--- a/adr/0021-jwt-secret-retention-policy.md
+++ b/adr/0021-jwt-secret-retention-policy.md
@@ -1,7 +1,6 @@
-# 10. JWT Secret Retention Policy
+# 21. JWT Secret Retention Policy

-## Status
-**Proposed** 🟡
+**Status:** Implemented (2026-05-05 — `pkg/user/jwt_manager.go` `RemoveExpiredSecrets` + `StartCleanupLoop`, wired in `pkg/server/server.go` `Run`; admin endpoint `/api/v1/admin/jwt/secrets` remains explicitly out of scope and tracked under @todo BDD scenarios)

 ## Context

--- a/adr/0022-rate-limiting-cache-strategy.md
+++ b/adr/0022-rate-limiting-cache-strategy.md
@@ -1,7 +1,6 @@
 # ADR 0022: Rate Limiting and Cache Strategy

-## Status
-**Proposed** 🟡
+**Status:** Implemented (Phase 1) - Phase 2 still Proposed

 ## Context

--- a/adr/0023-config-hot-reloading.md
+++ b/adr/0023-config-hot-reloading.md
@@ -1,8 +1,9 @@
 # Config Hot Reloading Strategy

-* Status: Proposed
-* Deciders: Gabriel Radureau, AI Agent
-* Date: 2026-04-05
+**Status:** Phase 1+2 Implemented (2026-05-05 — `logging.level` and `auth.jwt.ttl` hot-reloadable via `Config.WatchAndApply` in `pkg/config/config.go`, wired in `pkg/server/server.go Run`. Phase 2 also fixed a pre-existing bug where the hardcoded 24h TTL ignored `auth.jwt.ttl` from config entirely.) Phase 3 sub-phase 3.1 Implemented (2026-05-05 — `ReconfigureTracerProvider` in `pkg/telemetry/telemetry.go` added). Phase 3 sub-phase 3.2 In Flight (2026-05-05 — `telemetry.sampler.type` + `telemetry.sampler.ratio` hot-reload via `SetSamplerReconfigureCallback` in `pkg/config/config.go`. Remaining field: `api.v2_enabled`.)
+**Authors:** Gabriel Radureau, AI Agent
+**Date:** 2026-04-05
+**Last Updated:** 2026-05-05

 ## Context and Problem Statement

--- a/adr/0024-bdd-test-organization-and-isolation.md
+++ b/adr/0024-bdd-test-organization-and-isolation.md
@@ -1,7 +1,6 @@
 # ADR 0024: BDD Test Organization and Isolation Strategy

-## Status
-**Proposed** 🟡
+**Status:** Implemented (Phase 1 + Phase 2 + Phase 3 — parallel testing via [PR #35](https://gitea.arcodange.lab/arcodange/dance-lessons-coach/pulls/35), isolation strategy detailed in [ADR-0025](0025-bdd-scenario-isolation-strategies.md))

 ## Context

@@ -285,20 +284,22 @@ func CleanupFeatureData(featureName string) {

 ## Implementation Plan

-### Phase 1: Refactor Current Tests (1-2 weeks)
-1. Split monolithic feature files into feature directories
-2. Create feature-specific test scripts
-3. Implement basic isolation (config files, database names)
+### Phase 1: Refactor Current Tests — ✅ Implemented
+1. Split monolithic feature files into feature directories — done (see `features/<domain>/` layout)
+2. Create feature-specific test scripts — done
+3. Implement basic isolation (config files, database names) — done

-### Phase 2: Enhance Test Infrastructure (2-3 weeks)
-1. Add synchronization helpers to test framework
-2. Implement server lifecycle management
-3. Create comprehensive cleanup routines
+### Phase 2: Enhance Test Infrastructure — ✅ Implemented
+1. Add synchronization helpers to test framework — done
+2. Implement server lifecycle management — done (`pkg/bdd/testserver/server.go`)
+3. Create comprehensive cleanup routines — done

-### Phase 3: Parallel Testing (Optional)
-1. Add parallel test execution capability
-2. Implement port management for parallel runs
-3. Add resource monitoring
+### Phase 3: Parallel Testing — ✅ Implemented (PR #35, 2026-05-03)
+1. Add parallel test execution capability — done (schema-per-package isolation, **2.85x speedup**)
+2. Implement port management for parallel runs — done (`pkg/bdd/parallel/port_manager.go`)
+3. Add resource monitoring — deferred (not blocking; can be reopened as separate ADR if/when CI flakiness re-emerges)
+
+The strategy choice between alternatives (TRUNCATE vs schema isolation vs container-per-test) is documented in [ADR-0025](0025-bdd-scenario-isolation-strategies.md). Default behavior in CI is `BDD_SCHEMA_ISOLATION=true` (cf. `documentation/BDD_TEST_ENV.md`).

 ## Alternatives Considered

--- a/adr/0025-bdd-scenario-isolation-strategies.md
+++ b/adr/0025-bdd-scenario-isolation-strategies.md
@@ -1,7 +1,6 @@
 # ADR 0025: BDD Scenario Isolation Strategies

-## Status
-**Proposed** 🟡
+**Status:** Implemented (per-package schema isolation since T12 stage 2/2 - 2026-05-03)

 ## Context

--- a/adr/0026-composite-info-endpoint.md
+++ b/adr/0026-composite-info-endpoint.md
@@ -0,0 +1,197 @@
+# ADR 0026: Composite Info Endpoint vs Separate Calls
+
+**Status:** Implemented (2026-05-05 — PR pending)
+
+## Context
+
+The application currently exposes several endpoints that provide system information:
+- `/api/version` - returns version, commit, build date, Go version (cached 60s)
+- `/api/health` - returns `{"status":"healthy"}` (simple liveness)
+- `/api/healthz` - returns rich health info: status, version, uptime_seconds, timestamp
+- `/api/ready` - returns readiness with connection details
+
+Frontend components like `HealthDashboard` currently call `/api/healthz` to display server info. However, there is a need for a **composite endpoint** that aggregates:
+1. Version information (from `/api/version`)
+2. Build metadata (commit hash, build date)
+3. Uptime information (from `/api/healthz`)
+4. Cache status (enabled/disabled)
+5. Health status
+
+This raises an architectural question: **Should we create a new composite `/api/info` endpoint, or should frontend components make multiple separate API calls?**
+
+### The Problem with Separate Calls
+
+If the frontend makes individual calls to `/api/version`, `/api/healthz`, and checks cache config separately:
+
+1. **Multiple network requests**: 3-4 HTTP round trips per page load
+2. **Inconsistent data**: Responses may come from different moments in time
+3. **No caching coordination**: Each endpoint has its own cache key and TTL
+4. **Complex frontend logic**: Need to merge data from multiple sources
+5. **Poor user experience**: Slower page loads, multiple loading states
+
+### Current State Analysis
+
+| Endpoint | Data Provided | Cache TTL | Use Case |
+|----------|---------------|-----------|----------|
+| `/api/version` | version, commit, built, go | 60s | Version info |
+| `/api/healthz` | status, version, uptime_seconds, timestamp | None | K8s probes, health dashboard |
+| `/api/health` | status: "healthy" | None | Simple liveness |
+| `/api/ready` | ready, connections, reason | None | Readiness probes |
+
+The `/api/healthz` endpoint already combines some data (status + version + uptime + timestamp), but it:
+- Doesn't include commit_short
+- Doesn't include build_date separately
+- Doesn't include cache_enabled
+- Is not cached
+- Has Kubernetes-specific field naming (`healthz`)
+
+## Decision Drivers
+
+* **Performance**: Minimize network round trips for frontend
+* **Consistency**: All data should reflect the same point-in-time
+* **Maintainability**: Single source of truth for system info
+* **Caching**: Reuse existing cache infrastructure (ADR-0022)
+* **API Design**: Follow REST principles and existing patterns
+* **Backward Compatibility**: Existing endpoints must remain unchanged
+
+## Considered Options
+
+### Option 1: Composite `/api/info` Endpoint (Chosen)
+
+Create a new endpoint that aggregates all required data in a single call.
+
+**Pros:**
+- ✅ Single network request for frontend
+- ✅ Consistent point-in-time data
+- ✅ Can leverage existing cache infrastructure with key `info:json`
+- ✅ Follows existing pattern of `/api/version` caching
+- ✅ Clean API design - one endpoint, one purpose
+- ✅ Reduces frontend complexity
+- ✅ Better UX - faster page loads
+- ✅ Aligns with ADR-0022 cache strategy (reusable cache key pattern)
+
+**Cons:**
+- ⚠️ Duplicates some data from `/api/healthz` and `/api/version`
+- ⚠️ Requires new endpoint implementation
+- ⚠️ Need to maintain consistency if source endpoints change
+
+### Option 2: Frontend Aggregation with Multiple Calls
+
+Frontend makes separate calls to `/api/version`, `/api/healthz`, and introspects config.
+
+**Pros:**
+- ✅ No backend changes required
+- ✅ Uses existing endpoints
+
+**Cons:**
+- ❌ Multiple network requests (3-4 round trips)
+- ❌ Inconsistent data timing
+- ❌ Complex error handling in frontend
+- ❌ Poor UX - multiple loading states, slower
+- ❌ Each endpoint has different caching behavior
+- ❌ Violates DRY - same data fetched multiple times
+
+### Option 3: Extend `/api/healthz` Endpoint
+
+Add `commit_short`, `build_date`, and `cache_enabled` fields to existing `/api/healthz`.
+
+**Pros:**
+- ✅ Reuses existing endpoint
+- ✅ Single request
+
+**Cons:**
+- ❌ Breaks backward compatibility (response schema change)
+- ❌ `/api/healthz` is Kubernetes-focused (naming convention)
+- ❌ Not cached currently
+- ❌ Mixes health probe concerns with version info
+- ❌ Violates single responsibility
+
+### Option 4: GraphQL / Query Parameters
+
+Allow clients to specify which fields they want via query parameters.
+
+**Pros:**
+- ✅ Flexible - clients get exactly what they need
+- ✅ Single endpoint
+
+**Cons:**
+- ❌ Overkill for this use case
+- ❌ Not consistent with existing REST API design
+- ❌ Complex implementation
+- ❌ Not aligned with project architecture (Chi router, REST style)
+
+## Decision Outcome
+
+**Chosen: Option 1 - Composite `/api/info` Endpoint**
+
+We will implement a new `GET /api/info` endpoint that returns a JSON object with all required fields in a single call. This endpoint will:
+
+1. Aggregate data from existing sources (`version` package, `config`, server uptime)
+2. Be cached using the existing cache service with key `info:json`
+3. Use TTL from `config.cache.default_ttl_seconds` (consistent with ADR-0022)
+4. Return `X-Cache: HIT/MISS` headers for debugging
+5. Follow existing Go handler patterns from `pkg/server/server.go`
+
+### Response Schema
+
+```json
+{
+  "version": "1.4.0",
+  "commit_short": "a3f7b2c1",
+  "build_date": "2026-05-04T08:00:00Z",
+  "uptime_seconds": 1234,
+  "cache_enabled": true,
+  "healthz_status": "healthy"
+}
+```
+
+### Rationale
+
+1. **Performance**: Single HTTP request instead of 3-4 separate calls
+2. **Consistency**: All data reflects the same moment in time
+3. **Caching**: Leverages existing cache infrastructure (ADR-0022) with predictable key pattern
+4. **API Design**: Clean, RESTful endpoint with single responsibility
+5. **Maintainability**: Clear separation of concerns - info aggregation is a distinct use case
+6. **Backward Compatibility**: Existing endpoints remain unchanged
+7. **Frontend Simplicity**: Reduces complexity and improves UX
+
+### Cache Strategy
+
+Following ADR-0022 pattern:
+- Cache key: `info:json` (consistent with `version:format` pattern)
+- TTL: `config.cache.default_ttl_seconds` (default 300 seconds)
+- Cache service: `pkg/cache/cache.go` InMemoryService
+- Headers: `X-Cache: HIT` or `X-Cache: MISS`
+
+This allows the endpoint to be fast even under load, while maintaining data freshness.
+
+## Consequences
+
+### Positive
+
+1. **Improved frontend performance**: Single request instead of multiple
+2. **Better UX**: Faster page loads, simpler loading states
+3. **Consistent data**: All fields reflect the same point-in-time
+4. **Cache efficiency**: Reuses existing cache infrastructure
+5. **Clean separation**: Info endpoint handles aggregation, source endpoints unchanged
+6. **Easy to test**: Single endpoint with predictable response
+
+### Negative
+
+1. **Data duplication**: Some fields appear in multiple endpoints
+2. **Maintenance burden**: If source data changes, endpoint must be updated
+3. **New endpoint**: Increases API surface area (though minimal)
+
+### Mitigation
+
+1. Data duplication is acceptable - it's read-only system info
+2. Source the data from the same packages/functions used by other endpoints
+3. The new endpoint has a clear, focused purpose
+
+## Links
+
+- [ADR-0002: Chi Router](adr/0002-chi-router.md) - Routing foundation
+- [ADR-0022: Rate Limiting Cache Strategy](adr/0022-rate-limiting-cache-strategy.md) - Cache pattern reference
+- [pkg/server/server.go](pkg/server/server.go) - Handler patterns
+- [pkg/cache/cache.go](pkg/cache/cache.go) - Cache service
+- [pkg/version/version.go](pkg/version/version.go) - Version data source
--- a/adr/README.md
+++ b/adr/README.md
@@ -1,129 +1,113 @@
 # Architecture Decision Records (ADRs)

-This directory contains Architecture Decision Records (ADRs) for the dance-lessons-coach project.
+This directory contains the Architecture Decision Records (ADRs) for the dance-lessons-coach project. Each ADR captures a structurally important decision, its context, and its consequences.

-## Index of ADRs
+## Index

-| Number | Title | Status |
-|--------|-------|--------|
-| 0001 | Go 1.26.1 Standard | ✅ Accepted |
-| 0002 | Chi Router | ✅ Accepted |
-| 0003 | Zerolog Logging | ✅ Accepted |
-| 0004 | Interface-Based Design | ✅ Accepted |
-| 0005 | Graceful Shutdown | ✅ Accepted |
-| 0006 | Configuration Management | ✅ Accepted |
-| 0007 | OpenTelemetry Integration | ✅ Accepted |
-| 0008 | BDD Testing | ✅ Accepted |
-| 0009 | Hybrid Testing Approach | ✅ Accepted |
-| 0010 | CI/CD Pipeline Design | ✅ Accepted |
-| 0011 | Trunk-Based Development | ✅ Accepted |
-| 0012 | Commit Message Conventions | ✅ Accepted |
-| 0013 | Version Management Lifecycle | ✅ Accepted |
-| 0014 | Swagger Documentation | ✅ Accepted |
-| 0015 | Rate Limiting Strategy | ✅ Accepted |
-| 0016 | Cache Invalidation Strategy | ✅ Accepted |
-| 0017 | JWT Secret Rotation | ✅ Accepted |
-| 0018 | Configuration Hot Reloading | ✅ Accepted |
-| 0019 | BDD Feature Structure | ✅ Accepted |
-| 0020 | Database Migration Strategy | ✅ Accepted |
-| 0021 | API Versioning Strategy | ✅ Accepted |
-| 0022 | Rate Limiting and Cache Strategy | ✅ Accepted |
-| 0023 | Config Hot Reloading | 🟡 Proposed |
-| 0024 | BDD Test Organization and Isolation | 🟡 Proposed |
-| 0025 | BDD Scenario Isolation Strategies | 🟡 Proposed |
+| ADR | Title | Status |
+|-----|-------|--------|
+| [0001](0001-go-1.26.1-standard.md) | Use Go 1.26.1 as the standard Go version | Accepted |
+| [0002](0002-chi-router.md) | Use Chi router for HTTP routing | Accepted |
+| [0003](0003-zerolog-logging.md) | Use Zerolog for structured logging | Accepted |
+| [0004](0004-interface-based-design.md) | Adopt interface-based design pattern | Accepted |
+| [0005](0005-graceful-shutdown.md) | Implement graceful shutdown with readiness endpoints | Accepted |
+| [0006](0006-configuration-management.md) | Use Viper for configuration management | Accepted |
+| [0007](0007-opentelemetry-integration.md) | Integrate OpenTelemetry for distributed tracing | Accepted |
+| [0008](0008-bdd-testing.md) | Adopt BDD with Godog for behavioral testing | Accepted |
+| [0009](0009-hybrid-testing-approach.md) | Combine BDD and Swagger-based testing | Partially Implemented |
+| [0010](0010-api-v2-feature-flag.md) | API v2 Feature Flag Implementation | Accepted |
+| [0012](0012-git-hooks-staged-only-formatting.md) | Git Hooks: Staged-Only Formatting | Accepted |
+| [0013](0013-openapi-swagger-toolchain.md) | OpenAPI/Swagger Toolchain Selection | Partially Implemented |
+| [0015](0015-cli-subcommands-cobra.md) | CLI Subcommands and Flag Management with Cobra | Implemented |
+| [0016](0016-ci-cd-pipeline-design.md) | CI/CD Pipeline Design for Multi-Platform Compatibility | Accepted |
+| [0017](0017-trunk-based-development-workflow.md) | Trunk-Based Development Workflow for CI/CD Safety | Approved |
+| [0018](0018-user-management-auth-system.md) | User Management and Authentication System | Proposed |
+| [0019](0019-postgresql-integration.md) | PostgreSQL Database Integration | Proposed |
+| [0020](0020-docker-build-strategy.md) | Docker Build Strategy: Traditional vs Buildx | Accepted |
+| [0021](0021-jwt-secret-retention-policy.md) | JWT Secret Retention Policy | Proposed |
+| [0022](0022-rate-limiting-cache-strategy.md) | Rate Limiting and Cache Strategy | Proposed |
+| [0023](0023-config-hot-reloading.md) | Config Hot Reloading Strategy | Proposed |
+| [0024](0024-bdd-test-organization-and-isolation.md) | BDD Test Organization and Isolation Strategy | Proposed |
+| [0025](0025-bdd-scenario-isolation-strategies.md) | BDD Scenario Isolation Strategies | Proposed |
+
+> **Note** : numbers `0011` and `0014` are not currently in use. Reserved for future ADRs or representing previously deleted entries.

 ## What is an ADR?

-An ADR is a document that captures an important architectural decision made along with its context and consequences.
+An ADR is a document capturing one significant architectural decision: the **context** that motivated it, the **decision** itself, and its **consequences**. ADRs are append-only — once published, an ADR is not edited (except for typo / status updates). New decisions that supersede previous ones are recorded as new ADRs that explicitly link back.

-## Format
+## Canonical Format

-Each ADR follows this structure:
+All ADRs follow the canonical format below (homogenized 2026-05-03):

 ```markdown
-# [Short title is a few words]
+# NN. Short title summarising the decision

-* Status: [Proposed | Accepted | Deprecated | Superseded]
-* Deciders: [List of decision makers]
-* Date: [YYYY-MM-DD]
+**Status:** <Proposed | Accepted | Implemented | Partially Implemented | Approved | Rejected | Deferred | Deprecated | Superseded by ADR-NNNN>
+**Date:** YYYY-MM-DD
+**Authors:** Name(s)
+
+[Optional fields, all in `**Field:** value` format:]
+**Decision Drivers:** ...
+**Implementation Status:** ...
+**Implementation Date:** ...
+**Last Updated:** ...

 ## Context and Problem Statement

-[Describe the context and problem statement]
+[Describe the context and problem statement.]

 ## Decision Drivers

-* [Driver 1]
-* [Driver 2]
-* [Driver 3]
+* Driver 1
+* Driver 2

 ## Considered Options

-* [Option 1]
-* [Option 2]
-* [Option 3]
+* Option 1
+* Option 2

 ## Decision Outcome

-Chosen option: "[Option 1]" because [justification]
+Chosen option: "Option 1" because [justification].

 ## Pros and Cons of the Options

-### [Option 1]
+### Option 1

-* Good, because [argument a]
-* Good, because [argument b]
-* Bad, because [argument c]
+* Good, because [argument].
+* Bad, because [argument].

-### [Option 2]
+### Option 2

-* Good, because [argument a]
-* Good, because [argument b]
-* Bad, because [argument c]
+* Good, because [argument].
+* Bad, because [argument].

 ## Links

-* [Link type] [Link to ADR]
-* [Link type] [Link to ADR]
+* Related ADR: [ADR-NNNN](NNNN-slug.md)
+* Issue: [#NN](https://gitea.arcodange.lab/arcodange/dance-lessons-coach/issues/NN)
 ```

-## ADR List
-
-* [0001-go-1.26.1-standard.md](0001-go-1.26.1-standard.md) - Use Go 1.26.1 as the standard Go version
-* [0002-chi-router.md](0002-chi-router.md) - Use Chi router for HTTP routing
-* [0003-zerolog-logging.md](0003-zerolog-logging.md) - Use Zerolog for structured logging
-* [0004-interface-based-design.md](0004-interface-based-design.md) - Adopt interface-based design pattern
-* [0005-graceful-shutdown.md](0005-graceful-shutdown.md) - Implement graceful shutdown with readiness endpoints
-* [0006-configuration-management.md](0006-configuration-management.md) - Use Viper for configuration management
-* [0007-opentelemetry-integration.md](0007-opentelemetry-integration.md) - Integrate OpenTelemetry for distributed tracing
-* [0008-bdd-testing.md](0008-bdd-testing.md) - Adopt BDD with Godog for behavioral testing
-* [0009-hybrid-testing-approach.md](0009-hybrid-testing-approach.md) - Combine BDD and Swagger-based testing
-* [0010-api-v2-feature-flag.md](0010-api-v2-feature-flag.md) - API v2 implementation with feature flag control
-* [0011-validation-library-selection.md](0011-validation-library-selection.md) - Selection of go-playground/validator for input validation
-* [0012-git-hooks-staged-only-formatting.md](0012-git-hooks-staged-only-formatting.md) - Git hooks format only staged Go files
-* [0013-openapi-swagger-toolchain.md](0013-openapi-swagger-toolchain.md) - ✅ OpenAPI/Swagger documentation with swaggo/swag (Implemented)
-* [0014-grpc-adoption-strategy.md](0014-grpc-adoption-strategy.md) - Hybrid REST/gRPC adoption strategy
-* [0015-cli-subcommands-cobra.md](0015-cli-subcommands-cobra.md) - Cobra CLI framework adoption
-* [0016-ci-cd-pipeline-design.md](0016-ci-cd-pipeline-design.md) - CI/CD pipeline architecture
-* [0017-trunk-based-development-workflow.md](0017-trunk-based-development-workflow.md) - Trunk-based development workflow
-* [0018-user-management-auth-system.md](0018-user-management-auth-system.md) - User management and authentication system
-* [0019-postgresql-integration.md](0019-postgresql-integration.md) - PostgreSQL database integration
-* [0020-docker-build-strategy.md](0020-docker-build-strategy.md) - Docker Build Strategy: Traditional vs Buildx
-* [0021-jwt-secret-retention-policy.md](0021-jwt-secret-retention-policy.md) - JWT Secret Retention Policy with Configurable TTL and Retention
-* [0022-rate-limiting-cache-strategy.md](0022-rate-limiting-cache-strategy.md) - Rate Limiting and Cache Strategy with Multi-Phase Implementation
-* [0023-config-hot-reloading.md](0023-config-hot-reloading.md) - Config Hot Reloading Strategy
-* [0025-bdd-scenario-isolation-strategies.md](0025-bdd-scenario-isolation-strategies.md) - Schema-per-scenario isolation for BDD tests
-
-## How to Add a New ADR
-
-1. Create a new file with the next available number (e.g., `0010-new-decision.md`)
-2. Follow the template format
-3. Update this README.md with the new ADR
-4. Commit the changes
-
 ## Status Legend

-* **Proposed**: Decision is being discussed
-* **Accepted**: Decision has been made and implemented
-* **Deprecated**: Decision is no longer relevant
-* **Superseded**: Decision has been replaced by another ADR
+| Status | Meaning |
+|---|---|
+| **Proposed** | Decision is being discussed; no implementation yet. |
+| **Accepted** | Decision has been made; implementation may be pending or in progress. |
+| **Approved** | Same as Accepted; alternative term used in some legacy ADRs. |
+| **Implemented** | Decision is fully implemented and in production. |
+| **Partially Implemented** | Decision is partly implemented; remainder is deferred or pending. |
+| **Rejected** | Decision considered and explicitly rejected. The ADR documents why. |
+| **Deferred** | Decision postponed; revisit later. |
+| **Deprecated** | Decision is no longer relevant; system has moved on. |
+| **Superseded by ADR-NNNN** | Decision has been replaced by another ADR. Always include the link. |
+
+## How to Add a New ADR
+
+1. Pick the next available number (currently next would be `0026`).
+2. Copy an existing ADR (e.g., `0001-go-1.26.1-standard.md`) as a starting template.
+3. Edit the title, status, date, authors, and content.
+4. Update this `README.md` index with the new ADR.
+5. Commit using gitmoji convention (e.g., `📝 docs(adr): add ADR-0026 about ...`).
+6. Open a PR for review.
--- a/cmd/server/main.go
+++ b/cmd/server/main.go
@@ -48,8 +48,10 @@ func main() {
 		log.Fatal().Err(err).Msg("Failed to load configuration")
 	}

-	// Create readiness context to control readiness state
-	readyCtx, readyCancel := context.WithCancel(context.Background())
+	// Create readiness context to control readiness state.
+	// CancelableContext exposes Cancel() so that Server.Run() can cancel
+	// readiness at the start of graceful shutdown (before the propagation sleep).
+	readyCtx, readyCancel := server.NewCancelableContext(context.Background())
 	defer readyCancel()

 	// Create and run server
@@ -57,4 +59,5 @@ func main() {
 	if err := server.Run(); err != nil {
 		log.Fatal().Err(err).Msg("Server failed")
 	}
+	log.Trace().Msg("Server exited")
 }
--- a/config.yaml
+++ b/config.yaml
@@ -87,4 +87,15 @@ database:
  
  # Maximum lifetime of connections (default: "1h")
  # Format: number + unit (s, m, h)
-  conn_max_lifetime: 1h
+  conn_max_lifetime: 1h
+
+# Cache configuration (in-memory)
+cache:
+  # Enable in-memory cache (default: true)
+  enabled: true
+  
+  # Default TTL in seconds for cache items (default: 300 = 5 minutes)
+  default_ttl_seconds: 300
+  
+  # Cleanup interval in seconds for expired items (default: 600 = 10 minutes)
+  cleanup_interval_seconds: 600
--- a/documentation/API.md
+++ b/documentation/API.md
@@ -0,0 +1,106 @@
+# API endpoints
+
+Reference document for all HTTP endpoints exposed by `dance-lessons-coach` server. The authoritative source is the swag-generated Swagger UI at `/swagger/index.html` (served by the Go binary). This markdown is the human-readable index, intentionally short — when in doubt, run the server and open Swagger.
+
+## Conventions
+
+- All paths under `/api/` (no other prefix is used)
+- Versioned API under `/api/v1/<resource>` and `/api/v2/<resource>` (cf. ADR-0010 v2 feature flag)
+- System / Health / Version endpoints at root (`/api/<endpoint>`, no version)
+- Admin endpoints under `/api/admin/<action>` (require master admin password header)
+- Response Content-Type: `application/json` unless documented otherwise
+- Error envelope: `{"error":"<code>","message":"<text>"}` (HTTP 4xx/5xx)
+
+## System endpoints (no auth)
+
+| Method | Path | Purpose | Cf. |
+|---|---|---|---|
+| GET | `/api/health` | Liveness check (legacy, returns `{"status":"healthy"}`) | `pkg/server/server.go` |
+| GET | `/api/healthz` | **Kubernetes-style** rich health: status / version / uptime_seconds / timestamp | PR #20 — handler with swag `@Router /healthz [get]` |
+| GET | `/api/ready` | Readiness check (DB connection + service deps) | `pkg/server/server.go handleReadiness` |
+| GET | `/api/version` | Version info (cached 60s, since PR #29) | `pkg/server/server.go handleVersion` |
+| GET | `/api/info` | **Composite info aggregator**: version / commit_short / build_date / uptime_seconds / cache_enabled / healthz_status. Cached when cache is enabled (X-Cache: HIT/MISS header) | ADR-0026 — `pkg/server/server.go handleInfo` |
+
+`/api/info` body schema (`InfoResponse`):
+
+```json
+{
+  "version": "1.0.0",
+  "commit_short": "abc12345",
+  "build_date": "2026-05-05",
+  "uptime_seconds": 1234,
+  "cache_enabled": true,
+  "healthz_status": "healthy"
+}
+```
+
+Use `/api/info` from a frontend footer or status page when you need version + uptime + cache state in a single round trip. The composite design avoids 3-4 chatty calls (`/version`, `/healthz`, `/ready`) when only a snapshot is needed.
+
+`/api/healthz` body schema (`HealthzResponse`):
+
+```json
+{
+  "status": "healthy",
+  "version": "1.4.0",
+  "uptime_seconds": 1234,
+  "timestamp": "2026-05-04T08:00:00Z"
+}
+```
+
+Use `/api/healthz` for kubelet liveness probes — richer than `/api/health` and stable.
+
+## Admin endpoints (require X-Admin-Password header)
+
+| Method | Path | Purpose | Cf. |
+|---|---|---|---|
+| POST | `/api/admin/cache/flush` | Flush the entire in-memory cache. Returns `{"flushed":true,"items_flushed":N,"timestamp":"..."}` (200) or `{"error":"unauthorized"}` (401) or `{"error":"cache_disabled"}` (503) | PR #29 — `pkg/server/server.go handleAdminCacheFlush` |
+
+Auth: header `X-Admin-Password: <master-password>` (matches `auth.admin_master_password` in config / `DLC_AUTH_ADMIN_MASTER_PASSWORD` env var). Default `admin123` for local dev — **change in production**.
+
+## v1 API (auth + greeting)
+
+Mounted at `/api/v1/...` with the rate-limit middleware (cf. ADR-0022 Phase 1, since PR #22). Cached responses on greet (since PR #29).
+
+### Auth (`/api/v1/auth/...`)
+
+| Method | Path | Purpose |
+|---|---|---|
+| POST | `/api/v1/auth/register` | User registration |
+| POST | `/api/v1/auth/login` | Login with username + password, returns JWT |
+| POST | `/api/v1/auth/validate` | Validate a JWT token |
+| POST | `/api/v1/auth/password-reset/request` | Request password reset (admin-flagged users only) |
+| POST | `/api/v1/auth/password-reset/complete` | Complete password reset |
+
+JWT secret rotation policies: cf. ADR-0021 + JWT secrets endpoints under `/api/v1/admin/jwt/secrets` (admin-only).
+
+### Greet (`/api/v1/greet/...`)
+
+| Method | Path | Purpose |
+|---|---|---|
+| GET | `/api/v1/greet?name=X` | Greeting (cached per name 60s, header `X-Cache: HIT/MISS`) |
+| GET | `/api/v1/greet/{name}` | Greeting (path param variant, same caching) |
+
+### Admin under v1 (`/api/v1/admin/...`)
+
+JWT secret management endpoints. Cf. swag annotations in handlers + features/jwt/ BDD scenarios for the exact contract.
+
+## v2 API
+
+Enabled via `api.v2_enabled` config (cf. ADR-0010 v2 feature flag).
+
+| Method | Path | Purpose |
+|---|---|---|
+| POST | `/api/v2/greet` | v2 greeting (JSON body, more validation) |
+
+## Swagger UI
+
+Served at `/swagger/index.html` (and `/swagger/doc.json` for the embedded spec). Always reflects what the running binary exposes — when in doubt, prefer Swagger over this markdown.
+
+## Cross-references
+
+- [ADR-0002](../adr/0002-chi-router.md) — Chi router choice
+- [ADR-0010](../adr/0010-api-v2-feature-flag.md) — v2 feature flag
+- [ADR-0013](../adr/0013-openapi-swagger-toolchain.md) — OpenAPI / Swagger toolchain
+- [ADR-0018](../adr/0018-user-management-auth-system.md) — User management & auth
+- [ADR-0021](../adr/0021-jwt-secret-retention-policy.md) — JWT secret retention
+- [ADR-0022](../adr/0022-rate-limiting-cache-strategy.md) — Rate limiting + cache
--- a/documentation/BDD_TEST_ENV.md
+++ b/documentation/BDD_TEST_ENV.md
@@ -0,0 +1,89 @@
+# BDD test environment
+
+Environment variables and tooling specific to running BDD scenarios locally and in CI. Companion to [BDD_GUIDE.md](BDD_GUIDE.md) (which covers the BDD authoring workflow itself).
+
+## Required env vars (database connection)
+
+The BDD test server needs a Postgres instance reachable via:
+
+| Var | Default | Notes |
+|---|---|---|
+| `DLC_DATABASE_HOST` | `localhost` | Host of the Postgres instance |
+| `DLC_DATABASE_PORT` | `5432` | |
+| `DLC_DATABASE_USER` | `postgres` | Test-only credentials (NOT production) |
+| `DLC_DATABASE_PASSWORD` | `postgres` | |
+| `DLC_DATABASE_NAME` | `dance_lessons_coach_bdd_test` | Dedicated test DB |
+| `DLC_DATABASE_SSL_MODE` | `disable` | Tests run without TLS |
+
+Local setup:
+
+```bash
+docker compose up -d                                                # Postgres container
+docker exec dance-lessons-coach-postgres psql -U postgres \
+  -c "CREATE DATABASE dance_lessons_coach_bdd_test;"               # one-time
+```
+
+In CI: `.gitea/workflows/ci-cd.yaml` provisions a Postgres service container and exports the same vars.
+
+## Optional env vars
+
+### `BDD_SCHEMA_ISOLATION` (since [PR #35](https://gitea.arcodange.lab/arcodange/dance-lessons-coach/pulls/35) — T12 stage 2/2)
+
+| Value | Behaviour |
+|---|---|
+| `true` | Each test PACKAGE (process) gets its own isolated PostgreSQL schema with migrations. Packages run in **parallel** safely. **~2.85x speedup observed locally.** This is the new default in CI. |
+| (unset / `false`) | Falls back to single shared `public` schema with `CleanupDatabase` (TRUNCATE) between scenarios. Forces sequential package execution (`-p 1`). Slower but simpler. |
+
+Implementation: `pkg/bdd/testserver/server.go Start()` builds a per-package isolated repo via `user.NewPostgresRepositoryFromDSN` (PR #34). `Stop()` drops the schema + closes the per-package pool.
+
+ADR-0025 documents the isolation strategy ("Implemented" since PR #35).
+
+### `FEATURE` (per-package selector)
+
+When set, `pkg/bdd/testserver/server.go shouldEnableV2()` reads it. Used to scope per-feature behaviour (e.g. enable v2 endpoints only when `FEATURE=greet` AND `GODOG_TAGS` includes `@v2`).
+
+Without `FEATURE` set, falls back to `bdd` (generic).
+
+### `GODOG_TAGS` (scenario filter)
+
+Standard godog env var. The default suite excludes flaky/todo/skip/v2 tags:
+```
+GODOG_TAGS="~@flaky && ~@todo && ~@skip && ~@v2"
+```
+
+Scoped runs (e.g. `@critical` only): set `GODOG_TAGS="@critical"` and run.
+
+### `BDD_ENABLE_CLEANUP_LOGS` (debug)
+
+Set `=true` to log each scenario's CLEANUP / ISOLATION operation. Useful when debugging flakiness.
+
+## Recommended local commands
+
+Run all BDD with isolation (parallel, fast):
+```bash
+DLC_DATABASE_HOST=localhost DLC_DATABASE_PORT=5432 \
+DLC_DATABASE_USER=postgres DLC_DATABASE_PASSWORD=postgres \
+DLC_DATABASE_NAME=dance_lessons_coach_bdd_test DLC_DATABASE_SSL_MODE=disable \
+BDD_SCHEMA_ISOLATION=true \
+go test ./features/...
+```
+
+Run one feature with v2 enabled:
+```bash
+DLC_DATABASE_HOST=... \
+BDD_SCHEMA_ISOLATION=true FEATURE=greet GODOG_TAGS="@v2" \
+go test ./features/greet/...
+```
+
+Repro CI conditions (sequential, no isolation):
+```bash
+DLC_DATABASE_HOST=... \
+go test ./features/... -p 1
+```
+
+## Cross-references
+
+- [BDD_GUIDE.md](BDD_GUIDE.md) — authoring scenarios + steps
+- [ADR-0008](../adr/0008-bdd-testing.md) — choice of Godog
+- [ADR-0024](../adr/0024-bdd-test-organization-and-isolation.md) — feature directory organization
+- [ADR-0025](../adr/0025-bdd-scenario-isolation-strategies.md) — isolation strategies (Implemented since PR #35)
--- a/features/greet/greet.feature
+++ b/features/greet/greet.feature
@@ -21,17 +21,35 @@ Feature: Greet Service
    When I send a POST request to v2 greet with name "John"
    Then the response should be "{\"message\":\"Hello my friend John!\"}"

+  @v2 @api
  Scenario: v2 default greeting with empty name
    Given the server is running with v2 enabled
    When I send a POST request to v2 greet with name ""
    Then the response should be "{\"message\":\"Hello my friend!\"}"

+  @v2 @api
  Scenario: v2 greeting with missing name field
    Given the server is running with v2 enabled
    When I send a POST request to v2 greet with invalid JSON "{}"
    Then the response should be "{\"message\":\"Hello my friend!\"}"

+  @v2 @api
  Scenario: v2 greeting with name that is too long
    Given the server is running with v2 enabled
    When I send a POST request to v2 greet with name "ThisNameIsWayTooLongAndShouldFailValidationBecauseItExceedsTheMaximumAllowedLengthOf100Characters!!!!"
-    Then the response should contain error "validation_failed"
+    Then the response should contain error "validation_failed"
+
+  @ratelimit @skip @bdd-deferred
+  # NOTE: Functional behavior validated by unit tests in pkg/middleware/ratelimit_test.go.
+  # BDD scenario currently skipped: env-var-based rate limit config does not reach the
+  # already-started test server (architectural limitation of testsetup, not the middleware).
+  # TODO: rework testserver to allow per-scenario rate limit config (admin endpoint or
+  # per-scenario fresh server), then re-enable this scenario.
+  Scenario: Greet endpoint rejects requests over the rate limit
+    Given the server is running with rate limit set to 3 requests per minute and burst 3
+    When I make 3 requests to "/api/v1/greet/Alice"
+    Then all responses should have status 200
+    When I make 1 more request to "/api/v1/greet/Alice"
+    Then the response should have status 429
+    And the response body should contain "rate_limited"
+    And the response should have header "Retry-After"
--- a/features/health/health.feature
+++ b/features/health/health.feature
@@ -7,4 +7,12 @@ Feature: Health Endpoint
  Scenario: Health check returns healthy status
    Given the server is running
    When I request the health endpoint
-    Then the response should be "{\"status\":\"healthy\"}"
+    Then the response should be "{\"status\":\"healthy\"}"
+
+  @basic @critical
+  Scenario: Healthz endpoint returns rich health info
+    Given the server is running
+    When I request the healthz endpoint
+    Then the status code should be 200
+    And the response should be JSON with fields "status, version, uptime_seconds, timestamp"
+    And the "status" field should equal "healthy"
--- a/features/info/info.feature
+++ b/features/info/info.feature
@@ -0,0 +1,38 @@
+# features/info/info.feature
+@info @critical
+Feature: Info Endpoint
+  The /api/info endpoint should return composite application information
+
+  @basic @critical
+  Scenario: GET /api/info returns all required fields
+    Given the server is running
+    When I request the info endpoint
+    Then the status code should be 200
+    And the response should be JSON
+    And the response should contain "version"
+    And the response should contain "commit_short"
+    And the response should contain "build_date"
+    And the response should contain "uptime_seconds"
+    And the response should contain "cache_enabled"
+    And the response should contain "healthz_status"
+    And the "healthz_status" field should equal "healthy"
+
+  @version @critical
+  Scenario: version field matches semantic version pattern
+    Given the server is running
+    When I request the info endpoint
+    Then the status code should be 200
+    And the "version" field should match /^\d+\.\d+\.\d+$/
+
+  @cache @skip @bdd-deferred
+  Scenario: /api/info is cached when cache is enabled
+    # Deferred: the BDD testsetup currently runs with cache disabled
+    # (see "Cache service disabled" in test logs). Cache HIT/MISS behavior
+    # is covered by unit tests on the cache service. Reopen this scenario
+    # if/when the BDD harness gains a cache-enabled mode (likely after
+    # ADR-0022 Phase 2).
+    Given the server is running with cache enabled
+    When I request the info endpoint
+    Then the response header "X-Cache" should be "MISS"
+    When I request the info endpoint again
+    Then the response header "X-Cache" should be "HIT"
--- a/features/info/info_test.go
+++ b/features/info/info_test.go
@@ -0,0 +1,16 @@
+package info
+
+import (
+	"testing"
+
+	"dance-lessons-coach/pkg/bdd/testsetup"
+)
+
+func TestInfoBDD(t *testing.T) {
+	config := testsetup.NewFeatureConfig("info", "progress", false)
+	suite := testsetup.CreateTestSuite(t, config, "dance-lessons-coach BDD Tests - Info Feature")
+
+	if suite.Run() != 0 {
+		t.Fatal("non-zero status returned, failed to run info BDD tests")
+	}
+}
--- a/frontend/.storybook/main.ts
+++ b/frontend/.storybook/main.ts
@@ -0,0 +1,15 @@
+import type { StorybookConfig } from '@storybook/vue3-vite'
+
+const config: StorybookConfig = {
+  stories: ['../components/**/*.stories.@(js|ts|mdx)'],
+  addons: ['@storybook/addon-essentials'],
+  framework: {
+    name: '@storybook/vue3-vite',
+    options: {},
+  },
+  docs: {
+    autodocs: 'tag',
+  },
+}
+
+export default config
--- a/frontend/.storybook/preview.ts
+++ b/frontend/.storybook/preview.ts
@@ -0,0 +1,15 @@
+import type { Preview } from '@storybook/vue3'
+
+const preview: Preview = {
+  parameters: {
+    actions: { argTypesRegex: '^on[A-Z].*' },
+    controls: {
+      matchers: {
+        color: /(background|color)$/i,
+        date: /Date$/i,
+      },
+    },
+  },
+}
+
+export default preview
--- a/frontend/app.vue
+++ b/frontend/app.vue
@@ -0,0 +1,5 @@
+<template>
+  <NuxtLayout>
+    <NuxtPage />
+  </NuxtLayout>
+</template>
--- a/frontend/components/AppFooter.vue
+++ b/frontend/components/AppFooter.vue
@@ -0,0 +1,13 @@
+<script setup lang="ts">
+import AppFooterView, { type AppInfo } from './AppFooterView.vue'
+
+// Wrapper: handles data fetching, delegates rendering to AppFooterView.
+// Separation of concerns (SRP) - same pattern as HealthDashboard / HealthDashboardView.
+// server: false → fetch client-side only. Avoids SSR fetching through the dev proxy
+// (which can fail in some local setups), and lets Playwright route mocks apply.
+const { data, pending, error } = useFetch<AppInfo>('/api/info', { server: false })
+</script>
+
+<template>
+  <AppFooterView :data="data" :pending="pending" :error="error" />
+</template>
--- a/frontend/components/AppFooterView.vue
+++ b/frontend/components/AppFooterView.vue
@@ -0,0 +1,45 @@
+<script setup lang="ts">
+import { humaniseUptime } from '~/utils/uptime'
+
+export interface AppInfo {
+  version: string
+  commit_short: string
+  build_date: string
+  uptime_seconds: number
+  cache_enabled: boolean
+  healthz_status: string
+}
+
+defineProps<{
+  data: AppInfo | null | undefined
+  pending: boolean
+  error: { message: string } | null | undefined
+}>()
+</script>
+
+<template>
+  <footer data-testid="app-footer">
+    <p v-if="pending" data-testid="app-footer-pending">v?</p>
+    <p v-else-if="error" data-testid="app-footer-error">v? · info unavailable</p>
+    <p v-else-if="data" data-testid="app-footer-info">
+      <span data-testid="app-footer-version">v{{ data.version }}</span>
+      <span> · commit </span>
+      <span data-testid="app-footer-commit">{{ data.commit_short }}</span>
+      <span> · uptime </span>
+      <span data-testid="app-footer-uptime">{{ humaniseUptime(data.uptime_seconds) }}</span>
+    </p>
+  </footer>
+</template>
+
+<style scoped>
+footer {
+  border-top: 1px solid #ccc;
+  padding: 0.5rem 1rem;
+  font-size: 0.85rem;
+  color: #555;
+  text-align: center;
+}
+footer p {
+  margin: 0;
+}
+</style>
--- a/frontend/components/HealthDashboard.stories.ts
+++ b/frontend/components/HealthDashboard.stories.ts
@@ -0,0 +1,26 @@
+import type { Meta, StoryObj } from '@storybook/vue3'
+import HealthDashboard from './HealthDashboard.vue'
+
+const meta: Meta<typeof HealthDashboard> = {
+  title: 'Components/HealthDashboard',
+  component: HealthDashboard,
+  tags: ['autodocs'],
+  parameters: {
+    docs: {
+      description: {
+        component:
+          'Smart wrapper that calls /api/healthz internally and delegates rendering to HealthDashboardView. ' +
+          'For state-by-state previews (Healthy, Loading, Error), see ' +
+          '[HealthDashboardView stories](?path=/docs/components-healthdashboardview--docs).',
+      },
+    },
+  },
+}
+export default meta
+
+type Story = StoryObj<typeof meta>
+
+// Default story - calls real /api/healthz (works in browser if dev proxy + backend are up)
+export const Default: Story = {
+  args: {},
+}
--- a/frontend/components/HealthDashboard.vue
+++ b/frontend/components/HealthDashboard.vue
@@ -0,0 +1,17 @@
+<script setup lang="ts">
+import HealthDashboardView, { type HealthInfo } from './HealthDashboardView.vue'
+
+// Wrapper: handles data fetching, delegates rendering to HealthDashboardView.
+// Separation of concerns (SRP):
+//   - HealthDashboard (this) = data layer (useFetch lifecycle)
+//   - HealthDashboardView    = presentation layer (testable in Storybook + e2e)
+//
+// server: false → fetch client-side only. Avoids SSR fetching through the dev
+// proxy (which can fail in some local setups), and lets Playwright route mocks
+// apply. Same fix that landed for AppFooter in PR #40.
+const { data, pending, error } = useFetch<HealthInfo>('/api/healthz', { server: false })
+</script>
+
+<template>
+  <HealthDashboardView :data="data" :pending="pending" :error="error" />
+</template>
--- a/frontend/components/HealthDashboardView.stories.ts
+++ b/frontend/components/HealthDashboardView.stories.ts
@@ -0,0 +1,79 @@
+import type { Meta, StoryObj } from '@storybook/vue3'
+import HealthDashboardView from './HealthDashboardView.vue'
+
+interface ViewArgs {
+  data: {
+    status: string
+    version: string
+    uptime_seconds: number
+    timestamp: string
+  } | null
+  pending: boolean
+  error: { message: string } | null
+}
+
+const meta = {
+  title: 'Components/HealthDashboardView',
+  component: HealthDashboardView,
+  tags: ['autodocs'],
+  argTypes: {
+    pending: { control: 'boolean' },
+  },
+  parameters: {
+    docs: {
+      description: {
+        component:
+          'Pure presentational component for the health dashboard. ' +
+          'Accepts `data`, `pending`, `error` as props so all 3 states can be ' +
+          'previewed in Storybook and asserted in unit tests. The data fetching ' +
+          'wrapper is `HealthDashboard.vue`.',
+      },
+    },
+  },
+} satisfies Meta<ViewArgs>
+
+export default meta
+
+type Story = StoryObj<typeof meta>
+
+export const Healthy: Story = {
+  args: {
+    data: {
+      status: 'healthy',
+      version: '1.4.0',
+      uptime_seconds: 3600,
+      timestamp: '2026-05-03T17:30:00.000Z',
+    },
+    pending: false,
+    error: null,
+  },
+}
+
+export const Loading: Story = {
+  args: {
+    data: null,
+    pending: true,
+    error: null,
+  },
+}
+
+export const ErrorState: Story = {
+  args: {
+    data: null,
+    pending: false,
+    error: { message: '[GET] "/api/healthz": 502 Bad Gateway (simulated)' },
+  },
+}
+
+export const HealthyHighUptime: Story = {
+  args: {
+    data: {
+      status: 'healthy',
+      version: '1.5.0-rc1',
+      uptime_seconds: 86400 * 7,
+      timestamp: new Date().toISOString(),
+    },
+    pending: false,
+    error: null,
+  },
+}
--- a/frontend/components/HealthDashboardView.vue
+++ b/frontend/components/HealthDashboardView.vue
@@ -0,0 +1,30 @@
+<script setup lang="ts">
+export interface HealthInfo {
+  status: string
+  version: string
+  uptime_seconds: number
+  timestamp: string
+}
+
+defineProps<{
+  data: HealthInfo | null | undefined
+  pending: boolean
+  error: { message: string } | null | undefined
+}>()
+</script>
+
+<template>
+  <section data-testid="health-dashboard">
+    <h2>Server Health</h2>
+    <p v-if="pending" data-testid="health-loading">Loading...</p>
+    <p v-else-if="error" data-testid="health-error">
+      Error loading health: {{ error.message }}
+    </p>
+    <ul v-else-if="data" data-testid="health-info">
+      <li><strong>Status:</strong> <span data-testid="health-status">{{ data.status }}</span></li>
+      <li><strong>Version:</strong> {{ data.version }}</li>
+      <li><strong>Uptime:</strong> {{ data.uptime_seconds }} seconds</li>
+      <li><strong>Last check:</strong> {{ data.timestamp }}</li>
+    </ul>
+  </section>
+</template>
--- a/frontend/docs/README.md
+++ b/frontend/docs/README.md
@@ -0,0 +1,4 @@
+# Frontend Docs
+
+- [E2E Test Reports](./e2e/README.md) - auto-generated by `npm run docs:gen`
+- Storybook (run locally: `npm run storybook` ; build: `npm run build-storybook` then open `storybook-static/index.html`)
--- a/frontend/docs/e2e/README.md
+++ b/frontend/docs/e2e/README.md
@@ -0,0 +1,7 @@
+# E2E Test Reports
+
+[<- Up to docs](../README.md)
+
+| Test | Status | Duration |
+|------|--------|----------|
+| [home page loads and shows server health info](./home-page-loads-and-shows-server-health-info.md) | PASSED | 168ms |
--- a/frontend/docs/e2e/home-page-loads-and-shows-server-health-info.md
+++ b/frontend/docs/e2e/home-page-loads-and-shows-server-health-info.md
@@ -0,0 +1,16 @@
+# home page loads and shows server health info
+
+[<- Back to index](./README.md) | [Top](../README.md)
+
+**File**: `tests/e2e/health.spec.ts`
+**Status**: PASSED
+**Duration**: 168ms
+
+## Screenshot
+
+![home page loads and shows server health info](../../tests/e2e/screenshots/home-page-loads-and-shows-server-health-info.png)
+
+## Test Details
+
+- Start Time: 2026-05-03T14:38:42.958Z
+- Spec File: health.spec.ts
--- a/frontend/layouts/default.vue
+++ b/frontend/layouts/default.vue
@@ -0,0 +1,17 @@
+<template>
+  <div class="layout-root">
+    <slot />
+    <AppFooter />
+  </div>
+</template>
+
+<style scoped>
+.layout-root {
+  min-height: 100vh;
+  display: flex;
+  flex-direction: column;
+}
+.layout-root > :first-child {
+  flex: 1;
+}
+</style>
--- a/frontend/nuxt.config.ts
+++ b/frontend/nuxt.config.ts
@@ -0,0 +1,11 @@
+export default defineNuxtConfig({
+  devtools: { enabled: true },
+  nitro: {
+    devProxy: {
+      '/api': {
+        target: 'http://localhost:8080',
+        changeOrigin: true,
+      },
+    },
+  },
+})
--- a/frontend/package-lock.json
+++ b/frontend/package-lock.json
--- a/frontend/package.json
+++ b/frontend/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "dance-lessons-coach-frontend",
+  "type": "module",
+  "scripts": {
+    "build": "nuxt build",
+    "dev": "nuxt dev",
+    "generate": "nuxt generate",
+    "preview": "nuxt preview",
+    "postinstall": "nuxt prepare",
+    "storybook": "storybook dev -p 6006",
+    "build-storybook": "storybook build",
+    "docs:gen": "playwright test && node scripts/generate-test-docs.mjs",
+    "docs:full": "npm run build-storybook && npm run docs:gen"
+  },
+  "devDependencies": {
+    "@playwright/test": "^1.59.1",
+    "@storybook/addon-essentials": "^8.0.0",
+    "@storybook/vue3": "^8.0.0",
+    "@storybook/vue3-vite": "^8.0.0",
+    "@types/node": "^25.6.0",
+    "nuxt": "^3.13.0",
+    "storybook": "^8.0.0",
+    "typescript": "^6.0.3"
+  },
+  "packageManager": "npm@11.5.2"
+}
--- a/frontend/pages/index.vue
+++ b/frontend/pages/index.vue
@@ -0,0 +1,6 @@
+<template>
+  <main>
+    <h1>dance-lessons-coach</h1>
+    <HealthDashboard />
+  </main>
+</template>
--- a/frontend/playwright.config.ts
+++ b/frontend/playwright.config.ts
@@ -0,0 +1,23 @@
+import { defineConfig } from '@playwright/test'
+import path from 'path'
+
+export default defineConfig({
+  testDir: './tests/e2e',
+  timeout: 30_000,
+  reporter: [
+    ['list'],
+    ['json', { outputFile: path.join(process.cwd(), 'test-results', 'results.json') }],
+  ],
+  use: {
+    baseURL: 'http://localhost:3000',
+    screenshot: 'on',
+    video: 'off',
+  },
+  outputDir: 'test-results/output',
+  webServer: {
+    command: 'npm run dev',
+    url: 'http://localhost:3000',
+    timeout: 60_000,
+    reuseExistingServer: !process.env.CI,
+  },
+})
--- a/frontend/scripts/generate-test-docs.mjs
+++ b/frontend/scripts/generate-test-docs.mjs
@@ -0,0 +1,120 @@
+#!/usr/bin/env node
+
+import fs from 'node:fs/promises'
+import path from 'node:path'
+import { fileURLToPath } from 'node:url'
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url))
+const frontendDir = path.resolve(__dirname, '..')
+
+const resultsPath = path.join(frontendDir, 'test-results', 'results.json')
+const docsDir = path.join(frontendDir, 'docs', 'e2e')
+const screenshotsDir = path.join(frontendDir, 'tests', 'e2e', 'screenshots')
+
+async function main() {
+  // Read results
+  const resultsText = await fs.readFile(resultsPath, 'utf8')
+  const results = JSON.parse(resultsText)
+
+  // Create output directories
+  await fs.mkdir(docsDir, { recursive: true })
+
+  // Extract tests from suites
+  const testDocs = []
+  for (const suite of results.suites || []) {
+    for (const spec of suite.specs || []) {
+      for (const test of spec.tests || []) {
+        for (const result of test.results || []) {
+          const testInfo = {
+            title: spec.title,
+            specFile: spec.file || suite.file,
+            status: result.status,
+            duration: result.duration,
+            startTime: result.startTime,
+            attachments: result.attachments || [],
+          }
+          testDocs.push(testInfo)
+        }
+      }
+    }
+  }
+
+  // Generate individual test markdown files
+  for (const test of testDocs) {
+    const slug = slugify(test.title)
+    const mdPath = path.join(docsDir, `${slug}.md`)
+    
+    // Use slug-based screenshot name (matches explicit screenshot in test)
+    let screenshotPath = `${slug}.png`
+
+    // Also try to find screenshot attachment and use its basename
+    if (test.attachments && test.attachments.length > 0) {
+      for (const attachment of test.attachments) {
+        if (attachment.contentType === 'image/png') {
+          const basename = path.basename(attachment.path)
+          // Prefer explicit screenshot name if it matches our pattern
+          if (basename !== 'test-finished-1.png' && basename !== 'test-finished-2.png') {
+            screenshotPath = basename
+            break
+          }
+        }
+      }
+    }
+
+    const absoluteScreenshotPath = path.join(screenshotsDir, screenshotPath)
+    const relativeScreenshotPath = path.relative(docsDir, absoluteScreenshotPath)
+
+    const mdContent = `# ${test.title}
+
+[<- Back to index](./README.md) | [Top](../README.md)
+
+**File**: \`tests/e2e/${test.specFile}\`
+**Status**: ${test.status.toUpperCase()}
+**Duration**: ${test.duration}ms
+
+## Screenshot
+
+![${test.title}](${relativeScreenshotPath})
+
+## Test Details
+
+- Start Time: ${test.startTime || 'N/A'}
+- Spec File: ${test.specFile}
+`
+
+    await fs.writeFile(mdPath, mdContent)
+    console.log(`Generated: ${path.relative(frontendDir, mdPath)}`)
+  }
+
+  // Generate index README
+  const indexContent = `# E2E Test Reports
+
+[<- Up to docs](../README.md)
+
+| Test | Status | Duration |
+|------|--------|----------|
+${testDocs.map(t => `| [${escapeMd(t.title)}](./${slugify(t.title)}.md) | ${t.status.toUpperCase()} | ${t.duration}ms |`).join('\n')}
+`
+
+  await fs.writeFile(path.join(docsDir, 'README.md'), indexContent)
+  console.log(`Generated: ${path.relative(frontendDir, path.join(docsDir, 'README.md'))}`)
+
+  console.log(`\nGenerated ${testDocs.length} test docs`)
+}
+
+function slugify(str) {
+  return str
+    .toLowerCase()
+    .replace(/[^\w\s-]/g, '')
+    .replace(/[\s_]+/g, '-')
+    .replace(/^-+|-+$/g, '')
+}
+
+function escapeMd(str) {
+  return str.replace(/[|\\\[\]\{\}]/g, '\\$&')
+}
+
+main().catch(err => {
+  console.error('Error:', err)
+  process.exit(1)
+})
--- a/frontend/shims-vue.d.ts
+++ b/frontend/shims-vue.d.ts
@@ -0,0 +1,6 @@
+declare module '*.vue' {
+  import type { DefineComponent } from 'vue'
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  const component: DefineComponent<any, any, any>
+  export default component
+}
--- a/frontend/tests/e2e/app-footer.spec.ts
+++ b/frontend/tests/e2e/app-footer.spec.ts
@@ -0,0 +1,67 @@
+import { test, expect } from '@playwright/test'
+
+// Both specs mock /api/info so they decouple from the dev-proxy plumbing.
+// The integration with the real backend is covered by the BDD scenario in
+// features/info/info.feature (server-side, no frontend proxy in the loop).
+
+test('home page footer shows version, commit and uptime', async ({ page }) => {
+  await page.route('**/api/info', (route) => {
+    route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({
+        version: '1.4.0',
+        commit_short: '4a3f1bb',
+        build_date: '2026-05-05T00:00:00Z',
+        uptime_seconds: 8042,
+        cache_enabled: true,
+        healthz_status: 'healthy',
+      }),
+    })
+  })
+  await page.goto('/')
+
+  // Footer is mounted globally via layouts/default.vue
+  await expect(page.getByTestId('app-footer')).toBeVisible()
+
+  // The PR #32 lesson: assert content, not just visibility.
+  // Without the regex check the test would PASS even if the footer rendered the
+  // pending placeholder ("v?") indefinitely.
+  await expect(page.getByTestId('app-footer-info')).toBeVisible()
+  const versionLocator = page.getByTestId('app-footer-version')
+  await expect(versionLocator).toBeVisible()
+  await expect(versionLocator).toHaveText(/^v\d+\.\d+\.\d+$/)
+
+  // Commit and uptime should be present and non-empty.
+  await expect(page.getByTestId('app-footer-commit')).not.toBeEmpty()
+  await expect(page.getByTestId('app-footer-uptime')).not.toBeEmpty()
+
+  await page.screenshot({
+    path: 'tests/e2e/screenshots/app-footer-shows-version-commit-uptime.png',
+    fullPage: true,
+  })
+})
+
+// Regression spec: documents the expected error UX so we don't ship a silent failure.
+// Routes /api/info to a 502 mock so the test is reproducible regardless of backend.
+test('home page footer surfaces info endpoint errors gracefully', async ({ page }) => {
+  await page.route('**/api/info', (route) => {
+    route.fulfill({
+      status: 502,
+      contentType: 'application/json',
+      body: JSON.stringify({ error: 'simulated_backend_down' }),
+    })
+  })
+  await page.goto('/')
+
+  // Footer must NOT crash the page
+  await expect(page.getByTestId('app-footer')).toBeVisible()
+  await expect(page.getByTestId('app-footer-error')).toBeVisible()
+  // The error placeholder should NOT contain a real version pattern
+  await expect(page.getByTestId('app-footer-info')).not.toBeVisible()
+
+  await page.screenshot({
+    path: 'tests/e2e/screenshots/app-footer-surfaces-info-endpoint-errors-gracefully.png',
+    fullPage: true,
+  })
+})
--- a/frontend/tests/e2e/health.spec.ts
+++ b/frontend/tests/e2e/health.spec.ts
@@ -0,0 +1,55 @@
+import { test, expect } from '@playwright/test'
+
+// Both specs mock /api/healthz so they decouple from the dev-proxy plumbing.
+// The integration with the real backend is covered by the BDD scenario in
+// features/health/health.feature (server-side, no frontend proxy in the loop).
+// Same approach as tests/e2e/app-footer.spec.ts (PR #40) - applied here to
+// close the debt left by that PR's out-of-scope follow-up note.
+
+test('home page loads and shows healthy server state', async ({ page }) => {
+  await page.route('**/api/healthz', (route) => {
+    route.fulfill({
+      status: 200,
+      contentType: 'application/json',
+      body: JSON.stringify({
+        status: 'healthy',
+        version: '1.4.0',
+        uptime_seconds: 8042,
+        timestamp: '2026-05-05T08:00:00Z',
+      }),
+    })
+  })
+  await page.goto('/')
+  await expect(page.getByTestId('health-dashboard')).toBeVisible()
+  const heading = page.getByRole('heading', { name: /dance-lessons-coach/i })
+  await expect(heading).toBeVisible()
+
+  // Assert the dashboard is in HEALTHY state, not an error state.
+  // The dashboard renders an "Error loading health: ..." paragraph when /api/healthz
+  // is unreachable (Go backend not running, proxy misconfigured, endpoint removed,
+  // etc.). Without these assertions the test would falsely PASS even when the
+  // dashboard shows the error UI - regression observed 2026-05-03 (Go backend
+  // not running locally → page renders the error, Playwright PASSES).
+  await expect(page.getByTestId('health-info')).toBeVisible()
+  await expect(page.getByTestId('health-status')).toHaveText('healthy')
+  await expect(page.getByText(/Error loading health/i)).not.toBeVisible()
+
+  await page.screenshot({ path: 'tests/e2e/screenshots/home-page-loads-and-shows-server-health-info.png', fullPage: true })
+})
+
+// Regression spec: documents the expected error UX so we don't ship a silent failure.
+// Routes /api/healthz to a 502 mock so the test is reproducible regardless of backend.
+test('home page surfaces health endpoint errors visibly', async ({ page }) => {
+  await page.route('**/api/healthz', (route) => {
+    route.fulfill({
+      status: 502,
+      contentType: 'application/json',
+      body: JSON.stringify({ error: 'simulated_backend_down' }),
+    })
+  })
+  await page.goto('/')
+  await expect(page.getByTestId('health-dashboard')).toBeVisible()
+  await expect(page.getByText(/Error loading health/i)).toBeVisible()
+  await expect(page.getByTestId('health-info')).not.toBeVisible()
+  await page.screenshot({ path: 'tests/e2e/screenshots/home-page-surfaces-health-endpoint-errors-visibly.png', fullPage: true })
+})
--- a/frontend/tests/e2e/screenshots/.gitkeep
+++ b/frontend/tests/e2e/screenshots/.gitkeep
--- a/frontend/tests/e2e/screenshots/app-footer-shows-version-commit-uptime.png
+++ b/frontend/tests/e2e/screenshots/app-footer-shows-version-commit-uptime.png
--- a/frontend/tests/e2e/screenshots/app-footer-surfaces-info-endpoint-errors-gracefully.png
+++ b/frontend/tests/e2e/screenshots/app-footer-surfaces-info-endpoint-errors-gracefully.png
--- a/frontend/tests/e2e/screenshots/home-page-loads-and-shows-server-health-info.png
+++ b/frontend/tests/e2e/screenshots/home-page-loads-and-shows-server-health-info.png
--- a/frontend/tests/e2e/screenshots/home-page-surfaces-health-endpoint-errors-visibly.png
+++ b/frontend/tests/e2e/screenshots/home-page-surfaces-health-endpoint-errors-visibly.png
--- a/frontend/tsconfig.json
+++ b/frontend/tsconfig.json
@@ -0,0 +1,6 @@
+{
+  "extends": "./.nuxt/tsconfig.json",
+  "compilerOptions": {
+    "strict": true
+  }
+}
--- a/frontend/utils/uptime.ts
+++ b/frontend/utils/uptime.ts
@@ -0,0 +1,16 @@
+// Convert a duration in seconds to a humanised string like "2h 13m" or "45m 12s".
+// Returns "?" for non-finite or negative input so the UI never renders NaN/empty.
+export function humaniseUptime(seconds: number | null | undefined): string {
+  if (seconds == null || !Number.isFinite(seconds) || seconds < 0) return '?'
+
+  const s = Math.floor(seconds)
+  const days = Math.floor(s / 86400)
+  const hours = Math.floor((s % 86400) / 3600)
+  const minutes = Math.floor((s % 3600) / 60)
+  const secs = s % 60
+
+  if (days > 0) return `${days}d ${hours}h`
+  if (hours > 0) return `${hours}h ${minutes}m`
+  if (minutes > 0) return `${minutes}m ${secs}s`
+  return `${secs}s`
+}
--- a/go.mod
+++ b/go.mod
@@ -4,12 +4,14 @@ go 1.26.1

 require (
 	github.com/cucumber/godog v0.15.1
+	github.com/fsnotify/fsnotify v1.9.0
 	github.com/go-chi/chi/v5 v5.2.5
 	github.com/go-playground/locales v0.14.1
 	github.com/go-playground/universal-translator v0.18.1
 	github.com/go-playground/validator/v10 v10.30.2
 	github.com/golang-jwt/jwt/v5 v5.3.1
 	github.com/lib/pq v1.12.3
+	github.com/patrickmn/go-cache v2.1.0+incompatible
 	github.com/rs/zerolog v1.35.0
 	github.com/spf13/cobra v1.8.0
 	github.com/spf13/viper v1.21.0
@@ -22,6 +24,7 @@ require (
 	go.opentelemetry.io/otel/sdk v1.43.0
 	go.opentelemetry.io/otel/trace v1.43.0
 	golang.org/x/crypto v0.49.0
+	golang.org/x/time v0.15.0
 	gorm.io/driver/postgres v1.6.0
 	gorm.io/driver/sqlite v1.6.0
 	gorm.io/gorm v1.31.1
@@ -35,7 +38,6 @@ require (
 	github.com/cucumber/messages/go/v21 v21.0.1 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
-	github.com/fsnotify/fsnotify v1.9.0 // indirect
 	github.com/gabriel-vasile/mimetype v1.4.13 // indirect
 	github.com/go-logr/logr v1.4.3 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
--- a/go.sum
+++ b/go.sum
@@ -118,6 +118,8 @@ github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D
 github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
 github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
 github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno=
+github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc=
+github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ=
 github.com/pelletier/go-toml/v2 v2.2.4 h1:mye9XuhQ6gvn5h28+VilKrrPoQVanw5PMw/TB0t5Ec4=
 github.com/pelletier/go-toml/v2 v2.2.4/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
@@ -206,6 +208,8 @@ golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9sn
 golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
 golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
 golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
+golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U=
+golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
 golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
 golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
--- a/pkg/bdd/steps/common_steps.go
+++ b/pkg/bdd/steps/common_steps.go
@@ -2,6 +2,7 @@ package steps

 import (
 	"fmt"
+	"regexp"
 	"strings"

 	"dance-lessons-coach/pkg/bdd/testserver"
@@ -63,3 +64,105 @@ func (s *CommonSteps) theStatusCodeShouldBe(expectedStatus int) error {
 	}
 	return nil
 }
+
+// JSON field validation
+func (s *CommonSteps) theResponseShouldBeJSONWithFields(fields string) error {
+	// Parse the fields comma-separated list
+	fieldList := strings.Split(fields, ", ")
+	for _, field := range fieldList {
+		field = strings.TrimSpace(field)
+		if !s.responseContainsJSONField(field) {
+			return fmt.Errorf("response does not contain field %q", field)
+		}
+	}
+	return nil
+}
+
+func (s *CommonSteps) responseContainsJSONField(field string) bool {
+	body := string(s.client.GetLastBody())
+	// Simple check - look for "field":" in the JSON
+	// This works for simple fields, may need enhancement for nested objects
+	searchString := `"` + field + `":`
+	return strings.Contains(body, searchString)
+}
+
+func (s *CommonSteps) theFieldShouldEqual(field, expectedValue string) error {
+	body := string(s.client.GetLastBody())
+	// Look for the field and extract its value
+	// Simple implementation: look for "field":"value" pattern
+	searchPattern := `"` + field + `":"` + expectedValue + `"`
+	if !strings.Contains(body, searchPattern) {
+		// Also try without quotes (for numbers)
+		searchPatternNum := `"` + field + `":` + expectedValue
+		if !strings.Contains(body, searchPatternNum) {
+			return fmt.Errorf("field %q does not equal %q in response: %s", field, expectedValue, body)
+		}
+	}
+	return nil
+}
+
+// Regex field matching
+func (s *CommonSteps) theFieldShouldMatch(field, pattern string) error {
+	body := string(s.client.GetLastBody())
+	// Extract the value of the field from JSON
+	// Look for "field":"value" and extract value
+	fieldPattern := `"` + field + `":"([^"]*)"`
+	re := regexp.MustCompile(fieldPattern)
+	matches := re.FindStringSubmatch(body)
+	if matches == nil {
+		// Try without quotes (for numbers)
+		fieldPatternNum := `"` + field + `":(\d+\.?\d*)`
+		reNum := regexp.MustCompile(fieldPatternNum)
+		matches = reNum.FindStringSubmatch(body)
+		if matches == nil {
+			return fmt.Errorf("field %q not found in response: %s", field, body)
+		}
+	}
+
+	// matches[1] contains the value
+	value := matches[1]
+
+	// Compile and match the pattern
+	regex, err := regexp.Compile(pattern)
+	if err != nil {
+		return fmt.Errorf("invalid regex pattern %q: %v", pattern, err)
+	}
+
+	if !regex.MatchString(value) {
+		return fmt.Errorf("field %q value %q does not match pattern %q", field, value, pattern)
+	}
+	return nil
+}
+
+// Response is JSON check
+func (s *CommonSteps) theResponseShouldBeJSON() error {
+	body := string(s.client.GetLastBody())
+	// Simple check for JSON structure
+	body = strings.TrimSpace(body)
+	if !strings.HasPrefix(body, "{") && !strings.HasPrefix(body, "[") {
+		return fmt.Errorf("response is not JSON: %s", body)
+	}
+	return nil
+}
+
+// Response contains field (simple string containment in body)
+func (s *CommonSteps) theResponseShouldContain(field string) error {
+	body := string(s.client.GetLastBody())
+	if !strings.Contains(body, `"`+field+`"`) {
+		return fmt.Errorf("response does not contain field %q: %s", field, body)
+	}
+	return nil
+}
+
+// Response header validation
+func (s *CommonSteps) theResponseHeader(header, expectedValue string) error {
+	resp := s.client.GetLastResponse()
+	if resp == nil {
+		return fmt.Errorf("no response captured for header check")
+	}
+	headerValue := resp.Header.Get(header)
+	if headerValue != expectedValue {
+		return fmt.Errorf("header %q expected %q, got %q", header, expectedValue, headerValue)
+	}
+	return nil
+}
--- a/pkg/bdd/steps/health_steps.go
+++ b/pkg/bdd/steps/health_steps.go
@@ -24,7 +24,23 @@ func (s *HealthSteps) iRequestTheHealthEndpoint() error {
 	return s.client.Request("GET", "/api/health", nil)
 }

+func (s *HealthSteps) iRequestTheHealthzEndpoint() error {
+	return s.client.Request("GET", "/api/healthz", nil)
+}
+
+func (s *HealthSteps) iRequestTheInfoEndpoint() error {
+	return s.client.Request("GET", "/api/info", nil)
+}
+
+func (s *HealthSteps) iRequestTheInfoEndpointAgain() error {
+	return s.client.Request("GET", "/api/info", nil)
+}
+
 func (s *HealthSteps) theServerIsRunning() error {
 	// Actually verify the server is running by checking the readiness endpoint
 	return s.client.Request("GET", "/api/ready", nil)
 }
+
+func (s *HealthSteps) theServerIsRunningWithCacheEnabled() error {
+	return s.client.Request("GET", "/api/ready", nil)
+}
--- a/pkg/bdd/steps/ratelimit_steps.go
+++ b/pkg/bdd/steps/ratelimit_steps.go
@@ -0,0 +1,94 @@
+package steps
+
+import (
+	"fmt"
+	"os"
+	"strings"
+
+	"dance-lessons-coach/pkg/bdd/testserver"
+)
+
+// RateLimitSteps holds rate limit-related step definitions
+type RateLimitSteps struct {
+	client      *testserver.Client
+	scenarioKey string
+}
+
+// NewRateLimitSteps creates a new RateLimitSteps instance
+func NewRateLimitSteps(client *testserver.Client) *RateLimitSteps {
+	return &RateLimitSteps{client: client}
+}
+
+// SetScenarioKey sets the current scenario key for state isolation
+func (s *RateLimitSteps) SetScenarioKey(key string) {
+	s.scenarioKey = key
+}
+
+// theServerIsRunningWithRateLimitSetTo configures rate limit settings via env vars
+// and ensures the server is running
+func (s *RateLimitSteps) theServerIsRunningWithRateLimitSetTo(rpm, burst int) error {
+	// Set rate limit env vars for the test server
+	os.Setenv("DLC_RATE_LIMIT_ENABLED", "true")
+	os.Setenv("DLC_RATE_LIMIT_REQUESTS_PER_MINUTE", fmt.Sprintf("%d", rpm))
+	os.Setenv("DLC_RATE_LIMIT_BURST_SIZE", fmt.Sprintf("%d", burst))
+
+	// Verify the server is running
+	return s.client.Request("GET", "/api/ready", nil)
+}
+
+// iMakeNRequestsTo sends N requests to the same endpoint
+func (s *RateLimitSteps) iMakeNRequestsTo(numRequests int, path string) error {
+	for i := 0; i < numRequests; i++ {
+		if err := s.client.Request("GET", path, nil); err != nil {
+			return fmt.Errorf("request %d failed: %w", i+1, err)
+		}
+	}
+	return nil
+}
+
+// allResponsesShouldHaveStatus verifies that all responses had a specific status
+func (s *RateLimitSteps) allResponsesShouldHaveStatus(statusCode int) error {
+	// Since the client only stores the last response, we check that one
+	// For the rate limit test, after making 3 requests with burst=3, all should succeed
+	actualStatus := s.client.GetLastStatusCode()
+	if actualStatus != statusCode {
+		return fmt.Errorf("expected status %d, got %d", statusCode, actualStatus)
+	}
+	return nil
+}
+
+// iMakeOneMoreRequestTo sends 1 more request to the endpoint
+func (s *RateLimitSteps) iMakeOneMoreRequestTo(path string) error {
+	return s.client.Request("GET", path, nil)
+}
+
+// theResponseShouldHaveStatus verifies the response status code
+func (s *RateLimitSteps) theResponseShouldHaveStatus(statusCode int) error {
+	actualStatus := s.client.GetLastStatusCode()
+	if actualStatus != statusCode {
+		return fmt.Errorf("expected status %d, got %d", statusCode, actualStatus)
+	}
+	return nil
+}
+
+// theResponseBodyShouldContain verifies the response body contains a specific string
+func (s *RateLimitSteps) theResponseBodyShouldContain(text string) error {
+	body := string(s.client.GetLastBody())
+	if !strings.Contains(body, text) {
+		return fmt.Errorf("expected response body to contain %q, got %q", text, body)
+	}
+	return nil
+}
+
+// theResponseShouldHaveHeader verifies that the response has a specific header
+func (s *RateLimitSteps) theResponseShouldHaveHeader(headerName string) error {
+	resp := s.client.GetLastResponse()
+	if resp == nil {
+		return fmt.Errorf("no response available")
+	}
+	headerValue := resp.Header.Get(headerName)
+	if headerValue == "" {
+		return fmt.Errorf("expected header %q to be set, but it was not found", headerName)
+	}
+	return nil
+}
--- a/pkg/bdd/steps/steps.go
+++ b/pkg/bdd/steps/steps.go
@@ -16,6 +16,7 @@ type StepContext struct {
 	commonSteps       *CommonSteps
 	jwtRetentionSteps *JWTRetentionSteps
 	configSteps       *ConfigSteps
+	rateLimitSteps    *RateLimitSteps
 }

 // NewStepContext creates a new step context
@@ -28,6 +29,7 @@ func NewStepContext(client *testserver.Client) *StepContext {
 		commonSteps:       NewCommonSteps(client),
 		jwtRetentionSteps: NewJWTRetentionSteps(client),
 		configSteps:       NewConfigSteps(client),
+		rateLimitSteps:    NewRateLimitSteps(client),
 	}
 }

@@ -62,6 +64,9 @@ func SetScenarioKeyForAllSteps(sc *StepContext, key string) {
 		if sc.commonSteps != nil {
 			sc.commonSteps.SetScenarioKey(key)
 		}
+		if sc.rateLimitSteps != nil {
+			sc.rateLimitSteps.SetScenarioKey(key)
+		}
 	}
 }

@@ -83,6 +88,10 @@ func InitializeAllSteps(ctx *godog.ScenarioContext, client *testserver.Client, s

 	// Health steps
 	ctx.Step(`^I request the health endpoint$`, sc.healthSteps.iRequestTheHealthEndpoint)
+	ctx.Step(`^I request the healthz endpoint$`, sc.healthSteps.iRequestTheHealthzEndpoint)
+	ctx.Step(`^I request the info endpoint$`, sc.healthSteps.iRequestTheInfoEndpoint)
+	ctx.Step(`^I request the info endpoint again$`, sc.healthSteps.iRequestTheInfoEndpointAgain)
+	ctx.Step(`^the server is running with cache enabled$`, sc.healthSteps.theServerIsRunningWithCacheEnabled)
 	ctx.Step(`^the server is running$`, sc.healthSteps.theServerIsRunning)

 	// Auth steps
@@ -293,8 +302,23 @@ func InitializeAllSteps(ctx *godog.ScenarioContext, client *testserver.Client, s
 	ctx.Step(`^the audit entry should contain the previous and new values$`, sc.configSteps.theAuditEntryShouldContainThePreviousAndNewValues)
 	ctx.Step(`^the audit entry should contain the timestamp of the change$`, sc.configSteps.theAuditEntryShouldContainTheTimestampOfTheChange)

+	// Rate limit steps
+	ctx.Step(`^the server is running with rate limit set to (\d+) requests per minute and burst (\d+)$`, sc.rateLimitSteps.theServerIsRunningWithRateLimitSetTo)
+	ctx.Step(`^I make (\d+) requests to "([^"]*)"$`, sc.rateLimitSteps.iMakeNRequestsTo)
+	ctx.Step(`^all responses should have status (\d+)$`, sc.rateLimitSteps.allResponsesShouldHaveStatus)
+	ctx.Step(`^I make 1 more request to "([^"]*)"$`, sc.rateLimitSteps.iMakeOneMoreRequestTo)
+	ctx.Step(`^the response should have status (\d+)$`, sc.rateLimitSteps.theResponseShouldHaveStatus)
+	ctx.Step(`^the response body should contain "([^"]*)"$`, sc.rateLimitSteps.theResponseBodyShouldContain)
+	ctx.Step(`^the response should have header "([^"]*)"$`, sc.rateLimitSteps.theResponseShouldHaveHeader)
+
 	// Common steps
 	ctx.Step(`^the response should be "{\\"([^"]*)":\\"([^"]*)"}"$`, sc.commonSteps.theResponseShouldBe)
 	ctx.Step(`^the response should contain error "([^"]*)"$`, sc.commonSteps.theResponseShouldContainError)
 	ctx.Step(`^the status code should be (\d+)$`, sc.commonSteps.theStatusCodeShouldBe)
+	ctx.Step(`^the response should be JSON with fields "([^"]*)"$`, sc.commonSteps.theResponseShouldBeJSONWithFields)
+	ctx.Step(`^the "([^"]*)" field should equal "([^"]*)"$`, sc.commonSteps.theFieldShouldEqual)
+	ctx.Step(`^the "([^"]*)" field should match /([^/]+)/$`, sc.commonSteps.theFieldShouldMatch)
+	ctx.Step(`^the response should be JSON$`, sc.commonSteps.theResponseShouldBeJSON)
+	ctx.Step(`^the response should contain "([^"]*)"$`, sc.commonSteps.theResponseShouldContain)
+	ctx.Step(`^the response header "([^"]*)" should be "([^"]*)"$`, sc.commonSteps.theResponseHeader)
 }
--- a/pkg/bdd/suite.go
+++ b/pkg/bdd/suite.go
@@ -115,6 +115,15 @@ func InitializeTestSuite(ctx *godog.TestSuiteContext) {
 				testserver.TraceStateJWTSecretOperation(feature, scenarioKey, "RESET", "ok")
 			}

+			// Flush cache after every scenario to prevent cache pollution
+			if flushErr := sharedServer.FlushCache(); flushErr != nil {
+				if isCleanupLoggingEnabled() {
+					log.Warn().Err(flushErr).Msg("CLEANUP: Failed to flush cache after scenario")
+				}
+			} else {
+				testserver.TraceStateCacheOperation(feature, scenarioKey, "FLUSH", "ok")
+			}
+
 			// Clean database after every scenario (only if schema isolation is disabled)
 			if !isSchemaIsolationEnabled() {
 				if cleanupErr := sharedServer.CleanupDatabase(); cleanupErr != nil {
--- a/pkg/bdd/testserver/server.go
+++ b/pkg/bdd/testserver/server.go
@@ -15,6 +15,7 @@ import (
 	"sync"
 	"time"

+	"dance-lessons-coach/pkg/cache"
 	"dance-lessons-coach/pkg/config"
 	"dance-lessons-coach/pkg/server"
 	"dance-lessons-coach/pkg/user"
@@ -47,10 +48,13 @@ type Server struct {
 	port               int
 	baseURL            string
 	db                 *sql.DB
-	authService        user.AuthService // Reference to auth service for cleanup
-	schemaMutex        sync.Mutex       // Protects schema operations
-	currentSchema      string           // Current schema being used
-	originalSearchPath string           // Original search_path to restore
+	authService        user.AuthService         // Reference to auth service for cleanup
+	cacheService       cache.Service            // Reference to cache service for cleanup
+	isolatedRepo       *user.PostgresRepository // Per-package isolated repo (BDD_SCHEMA_ISOLATION=true)
+	isolatedSchemaName string                   // Per-package schema name to drop on Stop()
+	schemaMutex        sync.Mutex               // Protects schema operations
+	currentSchema      string                   // Current schema being used
+	originalSearchPath string                   // Original search_path to restore
 }

 // getDatabaseHost returns the database host from environment variable or defaults to localhost
@@ -146,13 +150,62 @@ func (s *Server) Start() error {
 	// This is the ONLY place where we check env vars for v2 configuration
 	v2Enabled := s.shouldEnableV2()

-	// Create real server instance from pkg/server
+	// Create real server instance from pkg/server.
+	// When BDD_SCHEMA_ISOLATION=true, each test package (process) gets its own
+	// isolated PostgreSQL schema with its own connection pool + migrations.
+	// This makes `go test ./features/...` parallel-safe because each feature
+	// package runs in its own process and gets its own schema.
 	cfg := createTestConfig(s.port, v2Enabled)
-	realServer := server.NewServer(cfg, context.Background())
+	var realServer *server.Server
+	if isSchemaIsolationEnabled() {
+		feature := os.Getenv("FEATURE")
+		if feature == "" {
+			feature = "bdd"
+		}
+		schemaName := generateSchemaName(feature, "package_root")
+		log.Info().Str("schema", schemaName).Str("feature", feature).Msg("ISOLATION: Building per-package isolated repo")
+
+		// Connect a default repo briefly just to CREATE SCHEMA (uses cfg from env vars)
+		bootstrapRepo, err := user.NewPostgresRepository(cfg)
+		if err != nil {
+			return fmt.Errorf("ISOLATION bootstrap repo failed: %w", err)
+		}
+		// Drop + recreate to ensure clean slate per process
+		_ = bootstrapRepo.Exec(fmt.Sprintf("DROP SCHEMA IF EXISTS %s CASCADE", schemaName))
+		if err := bootstrapRepo.Exec(fmt.Sprintf("CREATE SCHEMA %s", schemaName)); err != nil {
+			bootstrapRepo.Close()
+			return fmt.Errorf("ISOLATION CREATE SCHEMA failed: %w", err)
+		}
+		bootstrapRepo.Close()
+
+		// Build the per-package isolated repo (runs migrations in the new schema)
+		dsn := user.BuildSchemaIsolatedDSN(cfg, schemaName)
+		isolatedRepo, err := user.NewPostgresRepositoryFromDSN(cfg, dsn)
+		if err != nil {
+			return fmt.Errorf("ISOLATION isolated repo failed: %w", err)
+		}
+		s.isolatedRepo = isolatedRepo
+		s.isolatedSchemaName = schemaName
+
+		// Build user service backed by the isolated repo
+		jwtConfig := user.JWTConfig{
+			Secret:         cfg.GetJWTSecret(),
+			ExpirationTime: time.Hour * 24,
+			Issuer:         "dance-lessons-coach",
+		}
+		isolatedUserService := user.NewUserService(isolatedRepo, jwtConfig, cfg.GetAdminMasterPassword())
+
+		realServer = server.NewServerWithUserRepo(cfg, context.Background(), isolatedRepo, isolatedUserService)
+	} else {
+		realServer = server.NewServer(cfg, context.Background())
+	}

 	// Store auth service for cleanup
 	s.authService = realServer.GetAuthService()

+	// Store cache service for cleanup
+	s.cacheService = realServer.GetCacheService()
+
 	// Initialize database connection for cleanup
 	if err := s.initDBConnection(); err != nil {
 		return fmt.Errorf("failed to initialize database connection: %w", err)
@@ -409,6 +462,23 @@ func (s *Server) ResetJWTSecrets() error {
 	return nil
 }

+// FlushCache clears all cached data to prevent cache pollution between scenarios
+// This prevents cached responses from affecting subsequent test scenarios
+func (s *Server) FlushCache() error {
+	if s.cacheService == nil {
+		if isCleanupLoggingEnabled() {
+			log.Info().Msg("CLEANUP: No cache service available, skipping cache flush")
+		}
+		return nil
+	}
+
+	s.cacheService.Flush()
+	if isCleanupLoggingEnabled() {
+		log.Info().Msg("CLEANUP: Cache flushed successfully")
+	}
+	return nil
+}
+
 // CleanupDatabase deletes all test data from all tables
 // This uses raw SQL to avoid dependency on repositories and handles foreign keys properly
 // Uses SET CONSTRAINTS ALL DEFERRED to temporarily disable foreign key checks
@@ -555,7 +625,7 @@ func (s *Server) SetupScenarioSchema(feature, scenario string) error {
 		return fmt.Errorf("failed to create schema %s: %w", schemaName, err)
 	}

-	// Set search path to use the new schema
+	// Set search path to use the new schema (testserver's own connection)
 	searchPathSQL := fmt.Sprintf("SET search_path = %s, %s", schemaName, s.originalSearchPath)
 	if _, err := s.db.Exec(searchPathSQL); err != nil {
 		return fmt.Errorf("failed to set search_path: %w", err)
@@ -617,6 +687,21 @@ func (s *Server) getCurrentSearchPath() (string, error) {
 }

 func (s *Server) Stop() error {
+	// Cleanup the per-package isolated schema + close its pool, if any.
+	// (BDD_SCHEMA_ISOLATION=true path - see Start().)
+	if s.isolatedRepo != nil {
+		if s.isolatedSchemaName != "" {
+			if err := s.isolatedRepo.Exec(fmt.Sprintf("DROP SCHEMA IF EXISTS %s CASCADE", s.isolatedSchemaName)); err != nil {
+				log.Warn().Err(err).Str("schema", s.isolatedSchemaName).Msg("ISOLATION: failed to drop schema on Stop")
+			}
+		}
+		if err := s.isolatedRepo.Close(); err != nil {
+			log.Warn().Err(err).Msg("ISOLATION: failed to close isolated repo")
+		}
+		s.isolatedRepo = nil
+		s.isolatedSchemaName = ""
+	}
+
 	if s.httpServer != nil {
 		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 		defer cancel()
@@ -676,6 +761,25 @@ func (s *Server) shouldEnableV2() bool {
 // createTestConfig creates a test configuration
 // Pass v2Enabled explicitly to avoid reading env vars deep in the stack
 func createTestConfig(port int, v2Enabled bool) *config.Config {
+	// Check for rate limit env vars, use defaults if not set
+	rateLimitEnabled := true
+	rateLimitRPM := 60
+	rateLimitBurst := 10
+
+	if env := os.Getenv("DLC_RATE_LIMIT_ENABLED"); env != "" {
+		rateLimitEnabled = strings.EqualFold(env, "true") || env == "1"
+	}
+	if env := os.Getenv("DLC_RATE_LIMIT_REQUESTS_PER_MINUTE"); env != "" {
+		if val, err := strconv.Atoi(env); err == nil {
+			rateLimitRPM = val
+		}
+	}
+	if env := os.Getenv("DLC_RATE_LIMIT_BURST_SIZE"); env != "" {
+		if val, err := strconv.Atoi(env); err == nil {
+			rateLimitBurst = val
+		}
+	}
+
 	return &config.Config{
 		Server: config.ServerConfig{
 			Host: "0.0.0.0",
@@ -702,5 +806,10 @@ func createTestConfig(port int, v2Enabled bool) *config.Config {
 		Logging: config.LoggingConfig{
 			Level: "debug",
 		},
+		RateLimit: config.RateLimitConfig{
+			Enabled:           rateLimitEnabled,
+			RequestsPerMinute: rateLimitRPM,
+			BurstSize:         rateLimitBurst,
+		},
 	}
 }
--- a/pkg/bdd/testserver/state_tracer.go
+++ b/pkg/bdd/testserver/state_tracer.go
@@ -31,6 +31,11 @@ func TraceStateJWTSecretOperation(feature, scenario, operation, details string)
 	writeTraceLine(feature, scenario, "JWT_"+operation, details)
 }

+// TraceStateCacheOperation logs a cache operation
+func TraceStateCacheOperation(feature, scenario, operation, details string) {
+	writeTraceLine(feature, scenario, "CACHE_"+operation, details)
+}
+
 // TraceStateSchemaIsolation logs a schema isolation operation
 func TraceStateSchemaIsolation(feature, scenario, operation, details string) {
 	writeTraceLine(feature, scenario, "SCHEMA_"+operation, details)
--- a/pkg/cache/cache.go
+++ b/pkg/cache/cache.go
@@ -0,0 +1,56 @@
+package cache
+
+import (
+	"time"
+
+	gocache "github.com/patrickmn/go-cache"
+)
+
+// Service defines the interface for cache operations
+type Service interface {
+	Set(key string, value interface{}, ttl time.Duration)
+	Get(key string) (interface{}, bool)
+	Delete(key string)
+	Flush()
+	ItemCount() int
+}
+
+// InMemoryService implements Service using go-cache library
+type InMemoryService struct {
+	cache *gocache.Cache
+}
+
+// NewInMemoryService creates a new in-memory cache service
+// defaultTTL: default time-to-live for cache items
+// cleanupInterval: interval at which expired items are cleaned up
+func NewInMemoryService(defaultTTL, cleanupInterval time.Duration) Service {
+	c := gocache.New(defaultTTL, cleanupInterval)
+	return &InMemoryService{cache: c}
+}
+
+// Set stores a value in the cache with the specified TTL
+func (s *InMemoryService) Set(key string, value interface{}, ttl time.Duration) {
+	s.cache.Set(key, value, ttl)
+}
+
+// Get retrieves a value from the cache
+// Returns the value and true if found, nil and false if not found or expired
+func (s *InMemoryService) Get(key string) (interface{}, bool) {
+	val, found := s.cache.Get(key)
+	return val, found
+}
+
+// Delete removes an item from the cache
+func (s *InMemoryService) Delete(key string) {
+	s.cache.Delete(key)
+}
+
+// Flush clears all items from the cache
+func (s *InMemoryService) Flush() {
+	s.cache.Flush()
+}
+
+// ItemCount returns the number of items currently in the cache
+func (s *InMemoryService) ItemCount() int {
+	return s.cache.ItemCount()
+}
--- a/pkg/cache/cache_test.go
+++ b/pkg/cache/cache_test.go
@@ -0,0 +1,135 @@
+package cache
+
+import (
+	"testing"
+	"time"
+)
+
+func TestInMemoryService_SetGet(t *testing.T) {
+	svc := NewInMemoryService(1*time.Hour, 1*time.Hour)
+
+	// Test Set and Get
+	svc.Set("key1", "value1", 1*time.Hour)
+	val, ok := svc.Get("key1")
+	if !ok {
+		t.Fatal("Expected to find key1 in cache")
+	}
+	if val != "value1" {
+		t.Fatalf("Expected 'value1', got '%v'", val)
+	}
+
+	// Test Get non-existent key
+	_, ok = svc.Get("nonexistent")
+	if ok {
+		t.Fatal("Expected not to find nonexistent key")
+	}
+}
+
+func TestInMemoryService_Delete(t *testing.T) {
+	svc := NewInMemoryService(1*time.Hour, 1*time.Hour)
+
+	svc.Set("key1", "value1", 1*time.Hour)
+	_, ok := svc.Get("key1")
+	if !ok {
+		t.Fatal("Expected to find key1 before delete")
+	}
+
+	svc.Delete("key1")
+	_, ok = svc.Get("key1")
+	if ok {
+		t.Fatal("Expected not to find key1 after delete")
+	}
+}
+
+func TestInMemoryService_Flush(t *testing.T) {
+	svc := NewInMemoryService(1*time.Hour, 1*time.Hour)
+
+	svc.Set("key1", "value1", 1*time.Hour)
+	svc.Set("key2", "value2", 1*time.Hour)
+
+	if svc.ItemCount() != 2 {
+		t.Fatalf("Expected 2 items, got %d", svc.ItemCount())
+	}
+
+	svc.Flush()
+
+	if svc.ItemCount() != 0 {
+		t.Fatalf("Expected 0 items after flush, got %d", svc.ItemCount())
+	}
+
+	_, ok := svc.Get("key1")
+	if ok {
+		t.Fatal("Expected key1 to be flushed")
+	}
+}
+
+func TestInMemoryService_ItemCount(t *testing.T) {
+	svc := NewInMemoryService(1*time.Hour, 1*time.Hour)
+
+	if svc.ItemCount() != 0 {
+		t.Fatalf("Expected 0 items initially, got %d", svc.ItemCount())
+	}
+
+	svc.Set("key1", "value1", 1*time.Hour)
+	if svc.ItemCount() != 1 {
+		t.Fatalf("Expected 1 item, got %d", svc.ItemCount())
+	}
+
+	svc.Set("key2", "value2", 1*time.Hour)
+	if svc.ItemCount() != 2 {
+		t.Fatalf("Expected 2 items, got %d", svc.ItemCount())
+	}
+
+	svc.Delete("key1")
+	if svc.ItemCount() != 1 {
+		t.Fatalf("Expected 1 item after delete, got %d", svc.ItemCount())
+	}
+}
+
+func TestInMemoryService_TTLExpiration(t *testing.T) {
+	// Use a very short TTL for testing
+	svc := NewInMemoryService(100*time.Millisecond, 50*time.Millisecond)
+
+	svc.Set("key1", "value1", 50*time.Millisecond)
+
+	// Should be present immediately
+	val, ok := svc.Get("key1")
+	if !ok {
+		t.Fatal("Expected to find key1 immediately after set")
+	}
+	if val != "value1" {
+		t.Fatalf("Expected 'value1', got '%v'", val)
+	}
+
+	// Wait for expiration
+	time.Sleep(100 * time.Millisecond)
+
+	// Should be expired now
+	_, ok = svc.Get("key1")
+	if ok {
+		t.Fatal("Expected key1 to be expired after TTL")
+	}
+}
+
+func TestInMemoryService_DifferentTypes(t *testing.T) {
+	svc := NewInMemoryService(1*time.Hour, 1*time.Hour)
+
+	// Test with different types
+	svc.Set("string", "hello", 1*time.Hour)
+	svc.Set("int", 42, 1*time.Hour)
+	svc.Set("slice", []string{"a", "b"}, 1*time.Hour)
+
+	if svc.ItemCount() != 3 {
+		t.Fatalf("Expected 3 items, got %d", svc.ItemCount())
+	}
+
+	val, ok := svc.Get("string")
+	if !ok || val != "hello" {
+		t.Fatal("String value mismatch")
+	}
+
+	val, ok = svc.Get("int")
+	if !ok || val != 42 {
+		t.Fatal("Int value mismatch")
+	}
+}
--- a/pkg/config/config.go
+++ b/pkg/config/config.go
@@ -1,11 +1,14 @@
 package config

 import (
+	"context"
 	"fmt"
 	"os"
 	"strings"
+	"sync"
 	"time"

+	"github.com/fsnotify/fsnotify"
 	"github.com/rs/zerolog"
 	"github.com/rs/zerolog/log"
 	"github.com/spf13/viper"
@@ -13,6 +16,13 @@ import (
 	"dance-lessons-coach/pkg/version"
 )

+// SamplerReconfigureFunc is the signature for callbacks invoked when
+// telemetry.sampler.type or telemetry.sampler.ratio change via hot-reload.
+// The callback receives the new sampler type and ratio values.
+// It must be safe to call concurrently — implementations should use their
+// own synchronisation if needed. Returns an error if the reconfigure fails.
+type SamplerReconfigureFunc func(ctx context.Context, samplerType string, samplerRatio float64) error
+
 // NewZerologWriter creates a zerolog writer based on configuration
 func NewZerologWriter() *os.File {
 	return os.Stderr
@@ -27,6 +37,31 @@ type Config struct {
 	API       APIConfig       `mapstructure:"api"`
 	Auth      AuthConfig      `mapstructure:"auth"`
 	Database  DatabaseConfig  `mapstructure:"database"`
+	RateLimit RateLimitConfig `mapstructure:"rate_limit"`
+	Cache     CacheConfig     `mapstructure:"cache"`
+
+	// viper is the underlying configuration source. Kept (unexported,
+	// mapstructure:"-") so hot-reload can re-unmarshal on file changes —
+	// see WatchAndApply (ADR-0023 selective hot-reload).
+	viper *viper.Viper `mapstructure:"-"`
+
+	// reloadMu serialises Unmarshal during hot-reload so a partial mutation
+	// can't be observed mid-flight by getter calls.
+	reloadMu sync.RWMutex `mapstructure:"-"`
+
+	// samplerReconfigureCallback is invoked when telemetry.sampler.type or
+	// telemetry.sampler.ratio change. nil means no callback registered.
+	samplerReconfigureCallback SamplerReconfigureFunc `mapstructure:"-"`
+
+	// prevSamplerType and prevSamplerRatio track the last-seen sampler values
+	// to detect changes during hot-reload (ADR-0023 Phase 3).
+	prevSamplerType  string  `mapstructure:"-"`
+	prevSamplerRatio float64 `mapstructure:"-"`
+
+	// watcherStopped indicates that the config watcher has been stopped via
+	// the context being cancelled. This prevents the OnConfigChange handler
+	// from processing events after cleanup.
+	watcherStopped bool `mapstructure:"-"`
 }

 // ServerConfig holds server-related configuration
@@ -97,6 +132,20 @@ type DatabaseConfig struct {
 	ConnMaxLifetime time.Duration `mapstructure:"conn_max_lifetime"`
 }

+// RateLimitConfig holds rate limiting configuration
+type RateLimitConfig struct {
+	Enabled           bool `mapstructure:"enabled"`
+	RequestsPerMinute int  `mapstructure:"requests_per_minute"`
+	BurstSize         int  `mapstructure:"burst_size"`
+}
+
+// CacheConfig holds cache configuration
+type CacheConfig struct {
+	Enabled                bool `mapstructure:"enabled"`
+	DefaultTTLSeconds      int  `mapstructure:"default_ttl_seconds"`
+	CleanupIntervalSeconds int  `mapstructure:"cleanup_interval_seconds"`
+}
+
 // VersionInfo holds application version information
 type VersionInfo struct {
 	Version   string `mapstructure:"-"` // Set via ldflags
@@ -118,6 +167,34 @@ type SamplerConfig struct {
 	Ratio float64 `mapstructure:"ratio"`
 }

+// peekJSONLogging determines whether JSON logging should be used before the full
+// config is loaded, solving the chicken-and-egg problem where the logger format
+// must be known before any log is emitted, yet the format is stored in the config.
+//
+// Resolution order (mirrors Viper's own priority):
+//  1. DLC_LOGGING_JSON env var — checked directly via os.Getenv (zero overhead)
+//  2. logging.json key in the config file — read with a minimal throwaway Viper
+//     instance so we don't parse the whole config twice unnecessarily
+func peekJSONLogging() bool {
+	// 1. Env var takes highest priority — check it first
+	if env := os.Getenv("DLC_LOGGING_JSON"); env != "" {
+		return strings.EqualFold(env, "true") || env == "1"
+	}
+
+	// 2. Try to read logging.json from the config file
+	preV := viper.New()
+	preV.SetDefault("logging.json", false)
+	if configFile := os.Getenv("DLC_CONFIG_FILE"); configFile != "" {
+		preV.SetConfigFile(configFile)
+	} else {
+		preV.SetConfigName("config")
+		preV.SetConfigType("yaml")
+		preV.AddConfigPath(".")
+	}
+	_ = preV.ReadInConfig() // ignore errors — defaults apply on failure
+	return preV.GetBool("logging.json")
+}
+
 // LoadConfig loads configuration from file, environment variables, and defaults
 // Configuration priority: file > environment variables > defaults
 // To specify a custom config file path, set DLC_CONFIG_FILE environment variable
@@ -129,9 +206,17 @@ func LoadConfig() (*Config, error) {

 	v := viper.New()

-	// Set up initial console logging for config loading messages
-	consoleWriter := zerolog.ConsoleWriter{Out: os.Stderr}
-	log.Logger = log.Output(consoleWriter)
+	// Configure the logger format before emitting any log output.
+	// peekJSONLogging reads the JSON setting early (env var + config file pre-read)
+	// so that every log line — including those produced during config loading — is
+	// already in the correct format.
+	jsonLogging := peekJSONLogging()
+	if jsonLogging {
+		log.Logger = log.Output(os.Stderr)
+	} else {
+		log.Logger = log.Output(zerolog.ConsoleWriter{Out: os.Stderr})
+	}
+	log.Info().Bool("json", jsonLogging).Msg("Logging configured")

 	// Set default values
 	v.SetDefault("server.host", "0.0.0.0")
@@ -153,6 +238,16 @@ func LoadConfig() (*Config, error) {
 	// API defaults
 	v.SetDefault("api.v2_enabled", false)

+	// Rate limit defaults
+	v.SetDefault("rate_limit.enabled", true)
+	v.SetDefault("rate_limit.requests_per_minute", 60)
+	v.SetDefault("rate_limit.burst_size", 10)
+
+	// Cache defaults
+	v.SetDefault("cache.enabled", true)
+	v.SetDefault("cache.default_ttl_seconds", 300)
+	v.SetDefault("cache.cleanup_interval_seconds", 600)
+
 	// Auth defaults
 	v.SetDefault("auth.jwt_secret", "default-secret-key-please-change-in-production")
 	v.SetDefault("auth.admin_master_password", "admin123")
@@ -212,6 +307,16 @@ func LoadConfig() (*Config, error) {
 	// API environment variables
 	v.BindEnv("api.v2_enabled", "DLC_API_V2_ENABLED")

+	// Rate limit environment variables
+	v.BindEnv("rate_limit.enabled", "DLC_RATE_LIMIT_ENABLED")
+	v.BindEnv("rate_limit.requests_per_minute", "DLC_RATE_LIMIT_REQUESTS_PER_MINUTE")
+	v.BindEnv("rate_limit.burst_size", "DLC_RATE_LIMIT_BURST_SIZE")
+
+	// Cache environment variables
+	v.BindEnv("cache.enabled", "DLC_CACHE_ENABLED")
+	v.BindEnv("cache.default_ttl_seconds", "DLC_CACHE_DEFAULT_TTL_SECONDS")
+	v.BindEnv("cache.cleanup_interval_seconds", "DLC_CACHE_CLEANUP_INTERVAL_SECONDS")
+
 	// Database environment variables
 	v.BindEnv("database.host", "DLC_DATABASE_HOST")
 	v.BindEnv("database.port", "DLC_DATABASE_PORT")
@@ -227,15 +332,17 @@ func LoadConfig() (*Config, error) {
 		return nil, fmt.Errorf("config unmarshal error: %w", err)
 	}

-	// Configure log output format (JSON or console) first
-	if config.Logging.JSON {
-		log.Logger = log.Output(os.Stderr)
-	} else {
-		consoleWriter := zerolog.ConsoleWriter{Out: os.Stderr}
-		log.Logger = log.Output(consoleWriter)
-	}
+	// Keep the viper instance for hot-reload (ADR-0023).
+	config.viper = v

-	// Setup logging based on configuration
+	// Initialize previous sampler values for hot-reload change detection
+	// (ADR-0023 Phase 3).
+	config.prevSamplerType = config.Telemetry.Sampler.Type
+	config.prevSamplerRatio = config.Telemetry.Sampler.Ratio
+
+	// Setup logging based on configuration (level, output file, time format).
+	// The JSON/console format was already applied at the top of LoadConfig via
+	// peekJSONLogging, so SetupLogging only needs to handle the remaining knobs.
 	config.SetupLogging()

 	log.Info().
@@ -297,6 +404,19 @@ func (c *Config) GetSamplerRatio() float64 {
 	return c.Telemetry.Sampler.Ratio
 }

+// SetSamplerReconfigureCallback registers a callback that is invoked when
+// telemetry.sampler.type or telemetry.sampler.ratio change via hot-reload.
+// The callback receives the new sampler type and ratio values.
+// Pass nil to unregister the callback.
+func (c *Config) SetSamplerReconfigureCallback(callback SamplerReconfigureFunc) {
+	c.reloadMu.Lock()
+	defer c.reloadMu.Unlock()
+	c.samplerReconfigureCallback = callback
+	// Initialize previous values so we can detect changes on first hot-reload
+	c.prevSamplerType = c.Telemetry.Sampler.Type
+	c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
+}
+
 // GetV2Enabled returns whether v2 API is enabled
 func (c *Config) GetV2Enabled() bool {
 	return c.API.V2Enabled
@@ -359,6 +479,48 @@ func (c *Config) GetLogOutput() string {
 	return c.Logging.Output
 }

+// GetRateLimitEnabled returns whether rate limiting is enabled
+func (c *Config) GetRateLimitEnabled() bool {
+	return c.RateLimit.Enabled
+}
+
+// GetRateLimitRequestsPerMinute returns the requests per minute limit
+func (c *Config) GetRateLimitRequestsPerMinute() int {
+	if c.RateLimit.RequestsPerMinute <= 0 {
+		return 60
+	}
+	return c.RateLimit.RequestsPerMinute
+}
+
+// GetRateLimitBurstSize returns the burst size for rate limiting
+func (c *Config) GetRateLimitBurstSize() int {
+	if c.RateLimit.BurstSize <= 0 {
+		return 10
+	}
+	return c.RateLimit.BurstSize
+}
+
+// GetCacheEnabled returns whether cache is enabled
+func (c *Config) GetCacheEnabled() bool {
+	return c.Cache.Enabled
+}
+
+// GetCacheDefaultTTLSeconds returns the default TTL in seconds for cache items
+func (c *Config) GetCacheDefaultTTLSeconds() int {
+	if c.Cache.DefaultTTLSeconds <= 0 {
+		return 300
+	}
+	return c.Cache.DefaultTTLSeconds
+}
+
+// GetCacheCleanupIntervalSeconds returns the cleanup interval in seconds for cache
+func (c *Config) GetCacheCleanupIntervalSeconds() int {
+	if c.Cache.CleanupIntervalSeconds <= 0 {
+		return 600
+	}
+	return c.Cache.CleanupIntervalSeconds
+}
+
 // GetDatabaseHost returns the database host
 func (c *Config) GetDatabaseHost() string {
 	if c.Database.Host == "" {
@@ -482,3 +644,97 @@ func (c *Config) setupLogOutput() {
 	log.Logger = log.Output(file)
 	log.Trace().Str("output", output).Msg("Logging to file")
 }
+
+// WatchAndApply starts watching the config file for changes and applies the
+// hot-reloadable subset on every change (ADR-0023 selective hot-reload).
+//
+// Phases shipped:
+//   - Phase 1: logging.level — re-applied via SetupLogging on every change.
+//   - Phase 2: auth.jwt.ttl — picked up automatically because the userService
+//     reads it via JWTConfig.GetTTL (a method value capturing this *Config).
+//     The reloaded TTL is used on the NEXT token generation; tokens issued
+//     before the change keep their original expiry.
+//   - Phase 3: telemetry.sampler.type + telemetry.sampler.ratio — triggers
+//     the callback set via SetSamplerReconfigureCallback if the values change.
+//
+// The other fields listed in ADR-0023 (api.v2_enabled) remain restart-only
+// until their handlers land in subsequent phases.
+//
+// Stops when ctx is cancelled. Safe to call once at server startup.
+// If the config file is absent (ConfigFileNotFoundError at load time), this
+// becomes a no-op and logs a single warning.
+func (c *Config) WatchAndApply(ctx context.Context) {
+	if c.viper == nil {
+		log.Warn().Msg("Config hot-reload disabled: no viper instance attached")
+		return
+	}
+	if c.viper.ConfigFileUsed() == "" {
+		log.Info().Msg("Config hot-reload disabled: no config file in use (env-only or defaults)")
+		return
+	}
+
+	c.viper.OnConfigChange(func(in fsnotify.Event) {
+		// Skip processing if watcher has been stopped
+		c.reloadMu.Lock()
+		if c.watcherStopped {
+			c.reloadMu.Unlock()
+			return
+		}
+		c.reloadMu.Unlock()
+
+		log.Info().Str("event", in.Op.String()).Str("file", in.Name).Msg("Config file changed, reloading hot-reloadable fields")
+		c.reloadMu.Lock()
+		defer c.reloadMu.Unlock()
+
+		if err := c.viper.Unmarshal(c); err != nil {
+			log.Error().Err(err).Msg("Hot-reload: failed to unmarshal new config, keeping previous values")
+			return
+		}
+
+		// Apply hot-reloadable fields. Order matters: logging first so the
+		// rest of the reload is logged at the right level.
+		c.SetupLogging()
+
+		// Check if sampler config changed and invoke callback if registered
+		samplerChanged := c.prevSamplerType != c.Telemetry.Sampler.Type ||
+			c.prevSamplerRatio != c.Telemetry.Sampler.Ratio
+		if samplerChanged && c.samplerReconfigureCallback != nil {
+			if err := c.samplerReconfigureCallback(context.Background(),
+				c.Telemetry.Sampler.Type,
+				c.Telemetry.Sampler.Ratio); err != nil {
+				log.Error().Err(err).Msg("Hot-reload: sampler reconfigure callback failed")
+			} else {
+				// Update previous values only after successful callback
+				c.prevSamplerType = c.Telemetry.Sampler.Type
+				c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
+				log.Info().
+					Str("sampler_type", c.prevSamplerType).
+					Float64("sampler_ratio", c.prevSamplerRatio).
+					Msg("Hot-reload applied: telemetry sampler reconfigured")
+			}
+		} else if samplerChanged {
+			// No callback registered, just update tracking values
+			c.prevSamplerType = c.Telemetry.Sampler.Type
+			c.prevSamplerRatio = c.Telemetry.Sampler.Ratio
+		}
+
+		log.Info().
+			Str("logging_level", c.GetLogLevel()).
+			Dur("jwt_ttl", c.GetJWTTTL()).
+			Msg("Hot-reload applied (logging.level + auth.jwt.ttl)")
+	})
+	c.viper.WatchConfig()
+
+	log.Info().Str("file", c.viper.ConfigFileUsed()).Msg("Config hot-reload watcher started (ADR-0023 Phase 1)")
+
+	// Stop the watcher on context cancel — we set a flag that the
+	// OnConfigChange handler checks, avoiding the race with viper's
+	// internal state that would occur if we called OnConfigChange again.
+	go func() {
+		<-ctx.Done()
+		c.reloadMu.Lock()
+		c.watcherStopped = true
+		c.reloadMu.Unlock()
+		log.Info().Msg("Config hot-reload watcher stopped")
+	}()
+}
--- a/pkg/config/config_hot_reload_test.go
+++ b/pkg/config/config_hot_reload_test.go
@@ -0,0 +1,351 @@
+package config
+
+import (
+	"context"
+	"errors"
+	"os"
+	"path/filepath"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/spf13/viper"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// loadFromFile is a helper that mimics LoadConfig() for a specific file path
+// without going through the env-prefix and singleton machinery — keeps the
+// test hermetic.
+func loadFromFile(t *testing.T, path string) *Config {
+	t.Helper()
+	v := viper.New()
+	v.SetConfigFile(path)
+	v.SetConfigType("yaml")
+	v.SetDefault("logging.level", "info")
+	v.SetDefault("auth.jwt.ttl", time.Hour)
+	require.NoError(t, v.ReadInConfig())
+
+	c := &Config{viper: v}
+	require.NoError(t, v.Unmarshal(c))
+	return c
+}
+
+// TestWatchAndApply_LoggingLevel proves the hot-reload pipe end-to-end:
+// write a new logging.level to the watched file, the OnConfigChange handler
+// re-unmarshals, and the in-memory Config reflects the new value.
+func TestWatchAndApply_LoggingLevel(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "config.yaml")
+	require.NoError(t, os.WriteFile(path, []byte("logging:\n  level: info\n"), 0644))
+
+	c := loadFromFile(t, path)
+	assert.Equal(t, "info", c.GetLogLevel())
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	c.WatchAndApply(ctx)
+
+	// Mutate the file. fsnotify needs a real write event; rewrite atomically.
+	require.NoError(t, os.WriteFile(path, []byte("logging:\n  level: debug\n"), 0644))
+
+	// Poll for up to 2s waiting for the in-memory level to flip.
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		c.reloadMu.RLock()
+		level := c.GetLogLevel()
+		c.reloadMu.RUnlock()
+		if level == "debug" {
+			return
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	c.reloadMu.RLock()
+	defer c.reloadMu.RUnlock()
+	t.Fatalf("logging level did not hot-reload to debug: still %q", c.GetLogLevel())
+}
+
+// TestWatchAndApply_NoFileNoOp confirms the watcher is a safe no-op when no
+// config file is in use (env-only / defaults) — important so production
+// containers without a mounted config.yaml don't crash.
+func TestWatchAndApply_NoFileNoOp(t *testing.T) {
+	c := &Config{viper: viper.New()}
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	c.WatchAndApply(ctx) // should return without panicking
+}
+
+// TestWatchAndApply_NilViperNoOp confirms the watcher tolerates a Config
+// constructed without the viper field (e.g. tests that build a Config{}
+// manually — same defensive code path as production but exercised explicitly).
+func TestWatchAndApply_NilViperNoOp(t *testing.T) {
+	c := &Config{}
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	c.WatchAndApply(ctx)
+}
+
+// TestWatchAndApply_JWTTTL proves Phase 2 of ADR-0023: the JWT TTL is
+// re-read on every token generation via the GetJWTTTL method value, so
+// after a config-file change the new TTL takes effect without restart.
+func TestWatchAndApply_JWTTTL(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "config.yaml")
+	require.NoError(t, os.WriteFile(path, []byte("auth:\n  jwt:\n    ttl: 1h\n"), 0644))
+
+	c := loadFromFile(t, path)
+	assert.Equal(t, time.Hour, c.GetJWTTTL())
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	c.WatchAndApply(ctx)
+
+	require.NoError(t, os.WriteFile(path, []byte("auth:\n  jwt:\n    ttl: 30m\n"), 0644))
+
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		c.reloadMu.RLock()
+		ttl := c.GetJWTTTL()
+		c.reloadMu.RUnlock()
+		if ttl == 30*time.Minute {
+			return
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+	c.reloadMu.RLock()
+	defer c.reloadMu.RUnlock()
+	t.Fatalf("auth.jwt.ttl did not hot-reload to 30m: still %s", c.GetJWTTTL())
+}
+
+// TestWatchAndApply_TelemetrySamplerType proves Phase 3 of ADR-0023:
+// when telemetry.sampler.type changes, the callback registered via
+// SetSamplerReconfigureCallback is invoked exactly once with the new value.
+func TestWatchAndApply_TelemetrySamplerType(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "config.yaml")
+	initial := []byte(`telemetry:
+  sampler:
+    type: parentbased_always_on
+    ratio: 1.0
+`)
+	changed := []byte(`telemetry:
+  sampler:
+    type: traceidratio
+    ratio: 1.0
+`)
+	require.NoError(t, os.WriteFile(path, initial, 0644))
+
+	c := loadFromFile(t, path)
+	assert.Equal(t, "parentbased_always_on", c.GetSamplerType())
+
+	// Setup callback tracker
+	var mu sync.Mutex
+	callbackCalled := false
+	var recordedType string
+	var recordedRatio float64
+	c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
+		mu.Lock()
+		defer mu.Unlock()
+		callbackCalled = true
+		recordedType = samplerType
+		recordedRatio = samplerRatio
+		return nil
+	})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	c.WatchAndApply(ctx)
+
+	// Mutate the file
+	require.NoError(t, os.WriteFile(path, changed, 0644))
+
+	// Poll for up to 2s waiting for callback
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		mu.Lock()
+		if callbackCalled {
+			mu.Unlock()
+			assert.Equal(t, "traceidratio", recordedType)
+			assert.Equal(t, 1.0, recordedRatio)
+			return
+		}
+		mu.Unlock()
+		time.Sleep(20 * time.Millisecond)
+	}
+	mu.Lock()
+	defer mu.Unlock()
+	t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
+}
+
+// TestWatchAndApply_TelemetrySamplerRatio proves Phase 3 of ADR-0023:
+// when telemetry.sampler.ratio changes, the callback registered via
+// SetSamplerReconfigureCallback is invoked exactly once with the new value.
+func TestWatchAndApply_TelemetrySamplerRatio(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "config.yaml")
+	initial := []byte(`telemetry:
+  sampler:
+    type: parentbased_always_on
+    ratio: 1.0
+`)
+	changed := []byte(`telemetry:
+  sampler:
+    type: parentbased_always_on
+    ratio: 0.5
+`)
+	require.NoError(t, os.WriteFile(path, initial, 0644))
+
+	c := loadFromFile(t, path)
+	assert.Equal(t, 1.0, c.GetSamplerRatio())
+
+	// Setup callback tracker
+	var mu sync.Mutex
+	callbackCalled := false
+	var recordedType string
+	var recordedRatio float64
+	c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
+		mu.Lock()
+		defer mu.Unlock()
+		callbackCalled = true
+		recordedType = samplerType
+		recordedRatio = samplerRatio
+		return nil
+	})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	c.WatchAndApply(ctx)
+
+	// Mutate the file
+	require.NoError(t, os.WriteFile(path, changed, 0644))
+
+	// Poll for up to 2s waiting for callback
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		mu.Lock()
+		if callbackCalled {
+			mu.Unlock()
+			assert.Equal(t, "parentbased_always_on", recordedType)
+			assert.Equal(t, 0.5, recordedRatio)
+			return
+		}
+		mu.Unlock()
+		time.Sleep(20 * time.Millisecond)
+	}
+	mu.Lock()
+	defer mu.Unlock()
+	t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
+}
+
+// TestWatchAndApply_SamplerCallbackNotCalledWhenNoChange proves that
+// the sampler callback is NOT invoked when the config file changes but
+// sampler type and ratio remain the same.
+func TestWatchAndApply_SamplerCallbackNotCalledWhenNoChange(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "config.yaml")
+	initial := []byte(`telemetry:
+  sampler:
+    type: parentbased_always_on
+    ratio: 1.0
+logging:
+  level: info
+`)
+	changed := []byte(`telemetry:
+  sampler:
+    type: parentbased_always_on
+    ratio: 1.0
+logging:
+  level: debug
+`)
+	require.NoError(t, os.WriteFile(path, initial, 0644))
+
+	c := loadFromFile(t, path)
+
+	// Setup callback tracker
+	var mu sync.Mutex
+	callbackCalled := false
+	c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
+		mu.Lock()
+		defer mu.Unlock()
+		callbackCalled = true
+		return nil
+	})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	c.WatchAndApply(ctx)
+
+	// Mutate the file (logging level changes, but sampler stays the same)
+	require.NoError(t, os.WriteFile(path, changed, 0644))
+
+	// Poll for up to 2s - callback should NOT be called
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		mu.Lock()
+		wasCalled := callbackCalled
+		mu.Unlock()
+		if wasCalled {
+			t.Fatalf("sampler reconfigure callback was invoked but sampler did not change")
+		}
+		time.Sleep(20 * time.Millisecond)
+	}
+}
+
+// TestWatchAndApply_SamplerCallbackErrorHandling proves that when the
+// sampler reconfigure callback returns an error, the previous sampler values
+// are NOT updated, allowing retry on next config change.
+func TestWatchAndApply_SamplerCallbackErrorHandling(t *testing.T) {
+	dir := t.TempDir()
+	path := filepath.Join(dir, "config.yaml")
+	initial := []byte(`telemetry:
+  sampler:
+    type: parentbased_always_on
+    ratio: 1.0
+`)
+	changed := []byte(`telemetry:
+  sampler:
+    type: traceidratio
+    ratio: 0.5
+`)
+	require.NoError(t, os.WriteFile(path, initial, 0644))
+
+	c := loadFromFile(t, path)
+
+	// Setup callback that returns an error
+	expectedErr := errors.New("reconfigure failed")
+	var mu sync.Mutex
+	callbackCalled := false
+	c.SetSamplerReconfigureCallback(func(ctx context.Context, samplerType string, samplerRatio float64) error {
+		mu.Lock()
+		defer mu.Unlock()
+		callbackCalled = true
+		return expectedErr
+	})
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+	c.WatchAndApply(ctx)
+
+	// Mutate the file
+	require.NoError(t, os.WriteFile(path, changed, 0644))
+
+	// Poll for up to 2s waiting for callback error
+	deadline := time.Now().Add(2 * time.Second)
+	for time.Now().Before(deadline) {
+		mu.Lock()
+		if callbackCalled {
+			mu.Unlock()
+			// Verify previous values were NOT updated (so retry can work)
+			c.reloadMu.RLock()
+			assert.Equal(t, "parentbased_always_on", c.prevSamplerType)
+			assert.Equal(t, 1.0, c.prevSamplerRatio)
+			c.reloadMu.RUnlock()
+			return
+		}
+		mu.Unlock()
+		time.Sleep(20 * time.Millisecond)
+	}
+	mu.Lock()
+	defer mu.Unlock()
+	t.Fatalf("sampler reconfigure callback was not invoked: callbackCalled=%v", callbackCalled)
+}
--- a/pkg/jwt/jwt.go
+++ b/pkg/jwt/jwt.go
@@ -24,13 +24,25 @@ type JWTSecret struct {
 	ExpiresAt *time.Time // Optional expiration time
 }

-// JWTSecretManager manages multiple JWT secrets for rotation
+// JWTSecretManager manages multiple JWT secrets for rotation.
+// Secrets can carry an optional expiration; the cleanup loop removes them
+// after expiry while always preserving the primary secret (ADR-0021).
 type JWTSecretManager interface {
 	AddSecret(secret string, isPrimary bool, expiresIn time.Duration)
 	RotateToSecret(newSecret string)
 	GetPrimarySecret() string
 	GetAllValidSecrets() []JWTSecret
 	GetSecretByIndex(index int) (string, bool)
+
+	// RemoveExpiredSecrets drops every non-primary secret whose ExpiresAt is
+	// non-nil and in the past. Returns the count of secrets removed.
+	// The primary secret is never removed regardless of expiration.
+	RemoveExpiredSecrets() int
+
+	// StartCleanupLoop spawns a goroutine that calls RemoveExpiredSecrets at
+	// the given interval. Stops when the context is cancelled. Safe to call
+	// once at startup; calling again replaces the previous loop's context.
+	StartCleanupLoop(ctx context.Context, interval time.Duration)
 }

 // JWTService defines interface for JWT operations
--- a/pkg/jwt/jwt_secret_manager.go
+++ b/pkg/jwt/jwt_secret_manager.go
@@ -1,16 +1,24 @@
 package jwt

 import (
+	"context"
+	"sync"
 	"time"
+
+	"github.com/rs/zerolog/log"
 )

-// jwtSecretManagerImpl implements the JWTSecretManager interface
+// jwtSecretManagerImpl implements the JWTSecretManager interface.
+// All operations are mutex-protected so the cleanup goroutine
+// (StartCleanupLoop) can run alongside Generate / Validate calls.
 type jwtSecretManagerImpl struct {
+	mu            sync.Mutex
 	secrets       []JWTSecret
 	primarySecret string
+	cleanupCancel context.CancelFunc
 }

-// NewJWTSecretManager creates a new JWT secret manager
+// NewJWTSecretManager creates a new JWT secret manager.
 func NewJWTSecretManager(initialSecret string) JWTSecretManager {
 	return &jwtSecretManagerImpl{
 		secrets: []JWTSecret{
@@ -24,58 +32,132 @@ func NewJWTSecretManager(initialSecret string) JWTSecretManager {
 	}
 }

-// AddSecret adds a new JWT secret
+// AddSecret adds a new JWT secret.
 func (m *jwtSecretManagerImpl) AddSecret(secret string, isPrimary bool, expiresIn time.Duration) {
-	expiresAt := time.Now().Add(expiresIn)
-	m.secrets = append(m.secrets, JWTSecret{
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.addSecretLocked(secret, isPrimary, expiresIn)
+}
+
+// addSecretLocked is the internal helper that assumes the mutex is held.
+func (m *jwtSecretManagerImpl) addSecretLocked(secret string, isPrimary bool, expiresIn time.Duration) {
+	entry := JWTSecret{
 		Secret:    secret,
 		IsPrimary: isPrimary,
 		CreatedAt: time.Now(),
-		ExpiresAt: &expiresAt,
-	})
+	}
+	if expiresIn > 0 {
+		expiresAt := time.Now().Add(expiresIn)
+		entry.ExpiresAt = &expiresAt
+	}
+	m.secrets = append(m.secrets, entry)

 	if isPrimary {
 		m.primarySecret = secret
 	}
 }

-// RotateToSecret rotates to a new primary secret
+// RotateToSecret rotates to a new primary secret.
 func (m *jwtSecretManagerImpl) RotateToSecret(newSecret string) {
-	// Mark existing primary as non-primary
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
 	for i, secret := range m.secrets {
 		if secret.IsPrimary {
 			m.secrets[i].IsPrimary = false
 			break
 		}
 	}
-
-	// Add new secret as primary
-	m.AddSecret(newSecret, true, 0) // No expiration for primary
+	m.addSecretLocked(newSecret, true, 0)
 }

-// GetPrimarySecret returns the current primary secret
+// GetPrimarySecret returns the current primary secret.
 func (m *jwtSecretManagerImpl) GetPrimarySecret() string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
 	return m.primarySecret
 }

-// GetAllValidSecrets returns all valid (non-expired) secrets
+// GetAllValidSecrets returns all valid (non-expired) secrets.
 func (m *jwtSecretManagerImpl) GetAllValidSecrets() []JWTSecret {
-	var validSecrets []JWTSecret
-	now := time.Now()
+	m.mu.Lock()
+	defer m.mu.Unlock()

+	now := time.Now()
+	valid := make([]JWTSecret, 0, len(m.secrets))
 	for _, secret := range m.secrets {
 		if secret.ExpiresAt == nil || secret.ExpiresAt.After(now) {
-			validSecrets = append(validSecrets, secret)
+			valid = append(valid, secret)
 		}
 	}
-
-	return validSecrets
+	return valid
 }

-// GetSecretByIndex returns a secret by index for testing
+// GetSecretByIndex returns a secret by index for testing.
 func (m *jwtSecretManagerImpl) GetSecretByIndex(index int) (string, bool) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
 	if index < 0 || index >= len(m.secrets) {
 		return "", false
 	}
 	return m.secrets[index].Secret, true
 }
+
+// RemoveExpiredSecrets drops every non-primary secret whose ExpiresAt is
+// non-nil and in the past. Returns the count of secrets removed.
+// The primary secret is never removed regardless of expiration (ADR-0021).
+func (m *jwtSecretManagerImpl) RemoveExpiredSecrets() int {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	now := time.Now()
+	kept := make([]JWTSecret, 0, len(m.secrets))
+	removed := 0
+	for _, secret := range m.secrets {
+		if !secret.IsPrimary && secret.ExpiresAt != nil && !secret.ExpiresAt.After(now) {
+			removed++
+			continue
+		}
+		kept = append(kept, secret)
+	}
+	m.secrets = kept
+	return removed
+}
+
+// StartCleanupLoop spawns a goroutine that calls RemoveExpiredSecrets at the
+// given interval. Stops when the parent context is cancelled. Calling again
+// cancels the previous loop's context and starts a fresh one.
+func (m *jwtSecretManagerImpl) StartCleanupLoop(ctx context.Context, interval time.Duration) {
+	m.mu.Lock()
+	if m.cleanupCancel != nil {
+		m.cleanupCancel()
+	}
+	loopCtx, cancel := context.WithCancel(ctx)
+	m.cleanupCancel = cancel
+	m.mu.Unlock()
+
+	if interval <= 0 {
+		log.Warn().Dur("interval", interval).Msg("JWT secret cleanup interval is non-positive, loop disabled")
+		return
+	}
+
+	go func() {
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+		log.Info().Dur("interval", interval).Msg("JWT secret cleanup loop started")
+		for {
+			select {
+			case <-loopCtx.Done():
+				log.Info().Msg("JWT secret cleanup loop stopped")
+				return
+			case <-ticker.C:
+				removed := m.RemoveExpiredSecrets()
+				if removed > 0 {
+					log.Info().Int("removed", removed).Msg("JWT secrets cleaned up")
+				} else {
+					log.Trace().Msg("JWT cleanup tick: no expired secrets")
+				}
+			}
+		}
+	}()
+}
--- a/pkg/middleware/ratelimit.go
+++ b/pkg/middleware/ratelimit.go
@@ -0,0 +1,153 @@
+package middleware
+
+import (
+	"encoding/json"
+	"fmt"
+	"net/http"
+	"strings"
+	"sync"
+	"time"
+
+	"golang.org/x/time/rate"
+)
+
+// RateLimitConfig holds the configuration for rate limiting
+type RateLimitConfig struct {
+	Enabled           bool
+	RequestsPerMinute int
+	BurstSize         int
+}
+
+// RateLimiter implements per-IP rate limiting using a token bucket algorithm
+type RateLimiter struct {
+	mu       sync.Mutex
+	visitors map[string]*visitor
+	rate     rate.Limit
+	burst    int
+	ttl      time.Duration
+	enabled  bool
+}
+
+type visitor struct {
+	limiter  *rate.Limiter
+	lastSeen time.Time
+}
+
+// NewRateLimiter creates a new rate limiter with the given configuration
+func NewRateLimiter(cfg RateLimitConfig) *RateLimiter {
+	// Convert requests per minute to events per second
+	rateLimit := rate.Limit(float64(cfg.RequestsPerMinute) / 60.0)
+	burst := cfg.BurstSize
+	if burst <= 0 {
+		burst = 1
+	}
+
+	return &RateLimiter{
+		mu:       sync.Mutex{},
+		visitors: make(map[string]*visitor),
+		rate:     rateLimit,
+		burst:    burst,
+		ttl:      10 * time.Minute,
+		enabled:  cfg.Enabled,
+	}
+}
+
+// getVisitor returns the rate limiter for the given IP, creating one if needed.
+// It performs TTL-based eviction of stale entries.
+func (rl *RateLimiter) getVisitor(ip string) *rate.Limiter {
+	if !rl.enabled {
+		// If rate limiting is disabled, return a limiter that always allows
+		return rate.NewLimiter(rate.Inf, 1)
+	}
+
+	now := time.Now()
+
+	rl.mu.Lock()
+	defer rl.mu.Unlock()
+
+	// Clean up old entries periodically (every 100 accesses to avoid lock contention)
+	if len(rl.visitors) > 0 && len(rl.visitors)%100 == 0 {
+		rl.cleanupOldVisitors(now)
+	}
+
+	v, exists := rl.visitors[ip]
+	if !exists || now.Sub(v.lastSeen) > rl.ttl {
+		// Create new limiter for this IP
+		limiter := rate.NewLimiter(rl.rate, rl.burst)
+		rl.visitors[ip] = &visitor{
+			limiter:  limiter,
+			lastSeen: now,
+		}
+		return limiter
+	}
+
+	// Update last seen time
+	v.lastSeen = now
+	return v.limiter
+}
+
+// cleanupOldVisitors removes entries that haven't been seen in more than ttl
+func (rl *RateLimiter) cleanupOldVisitors(now time.Time) {
+	for ip, v := range rl.visitors {
+		if now.Sub(v.lastSeen) > rl.ttl {
+			delete(rl.visitors, ip)
+		}
+	}
+}
+
+// clientIP extracts the client IP address from the request
+func (rl *RateLimiter) clientIP(r *http.Request) string {
+	// Try X-Forwarded-For header first
+	if xff := r.Header.Get("X-Forwarded-For"); xff != "" {
+		// X-Forwarded-For can contain multiple IPs: client, proxy1, proxy2, ...
+		// The leftmost is the original client
+		ips := strings.Split(xff, ",")
+		if len(ips) > 0 {
+			return strings.TrimSpace(ips[0])
+		}
+	}
+
+	// Try X-Real-IP header
+	if xri := r.Header.Get("X-Real-IP"); xri != "" {
+		return strings.TrimSpace(xri)
+	}
+
+	// Fall back to RemoteAddr (strip port if present)
+	addr := r.RemoteAddr
+	if colonIdx := strings.LastIndex(addr, ":"); colonIdx != -1 {
+		return addr[:colonIdx]
+	}
+	return addr
+}
+
+// Middleware returns the rate limiting middleware function
+func (rl *RateLimiter) Middleware(next http.Handler) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		ip := rl.clientIP(r)
+		limiter := rl.getVisitor(ip)
+
+		if !limiter.Allow() {
+			// Rate limit exceeded
+			// Calculate retry after based on the rate
+			// tokens needed = burst, rate = tokens/second
+			// So wait time = burst / rate (in seconds)
+			retryAfter := float64(rl.burst) / float64(rl.rate)
+			if retryAfter <= 0 {
+				retryAfter = 1
+			}
+
+			w.Header().Set("Content-Type", "application/json")
+			w.Header().Set("Retry-After", fmt.Sprintf("%.0f", retryAfter))
+			w.WriteHeader(http.StatusTooManyRequests)
+
+			response := map[string]interface{}{
+				"error":               "rate_limited",
+				"retry_after_seconds": int(retryAfter),
+			}
+			json.NewEncoder(w).Encode(response)
+			return
+		}
+
+		next.ServeHTTP(w, r)
+	})
+}
--- a/pkg/middleware/ratelimit_test.go
+++ b/pkg/middleware/ratelimit_test.go
@@ -0,0 +1,310 @@
+package middleware
+
+import (
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+	"time"
+)
+
+func TestRateLimiter_AllowsRequestsWithinBurst(t *testing.T) {
+	cfg := RateLimitConfig{
+		Enabled:           true,
+		RequestsPerMinute: 60,
+		BurstSize:         5,
+	}
+	rl := NewRateLimiter(cfg)
+
+	// Create a simple handler that returns 200 OK
+	handler := rl.Middleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+		w.Write([]byte("OK"))
+	}))
+
+	// Make 5 requests (equal to burst size) - all should succeed
+	for i := 0; i < 5; i++ {
+		req := httptest.NewRequest("GET", "/test", nil)
+		req.RemoteAddr = "192.168.1.1:12345"
+		rr := httptest.NewRecorder()
+
+		handler.ServeHTTP(rr, req)
+
+		if rr.Code != http.StatusOK {
+			t.Errorf("Request %d: expected status 200, got %d", i+1, rr.Code)
+		}
+	}
+}
+
+func TestRateLimiter_BlocksRequestsExceedingBurst(t *testing.T) {
+	cfg := RateLimitConfig{
+		Enabled:           true,
+		RequestsPerMinute: 60,
+		BurstSize:         3,
+	}
+	rl := NewRateLimiter(cfg)
+
+	handler := rl.Middleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+
+	// Make 4 requests (exceeding burst of 3) - 4th should be rate limited
+	for i := 0; i < 3; i++ {
+		req := httptest.NewRequest("GET", "/test", nil)
+		req.RemoteAddr = "192.168.1.2:12345"
+		rr := httptest.NewRecorder()
+		handler.ServeHTTP(rr, req)
+
+		if rr.Code != http.StatusOK {
+			t.Errorf("Request %d: expected status 200, got %d", i+1, rr.Code)
+		}
+	}
+
+	// 4th request should be rate limited
+	req := httptest.NewRequest("GET", "/test", nil)
+	req.RemoteAddr = "192.168.1.2:12345"
+	rr := httptest.NewRecorder()
+	handler.ServeHTTP(rr, req)
+
+	if rr.Code != http.StatusTooManyRequests {
+		t.Errorf("Request 4: expected status 429, got %d", rr.Code)
+	}
+
+	// Verify response body
+	var response map[string]interface{}
+	if err := json.NewDecoder(rr.Body).Decode(&response); err != nil {
+		t.Fatalf("Failed to decode response body: %v", err)
+	}
+
+	if response["error"] != "rate_limited" {
+		t.Errorf("Expected error 'rate_limited', got %v", response["error"])
+	}
+
+	if _, ok := response["retry_after_seconds"]; !ok {
+		t.Error("Expected retry_after_seconds in response")
+	}
+
+	// Verify Retry-After header
+	if retryAfter := rr.Header().Get("Retry-After"); retryAfter == "" {
+		t.Error("Expected Retry-After header to be set")
+	}
+}
+
+func TestRateLimiter_DifferentIPsIndependent(t *testing.T) {
+	cfg := RateLimitConfig{
+		Enabled:           true,
+		RequestsPerMinute: 60,
+		BurstSize:         2,
+	}
+	rl := NewRateLimiter(cfg)
+
+	handler := rl.Middleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+
+	// IP1 makes 2 requests (fills its burst)
+	for i := 0; i < 2; i++ {
+		req := httptest.NewRequest("GET", "/test", nil)
+		req.RemoteAddr = "10.0.0.1:12345"
+		rr := httptest.NewRecorder()
+		handler.ServeHTTP(rr, req)
+
+		if rr.Code != http.StatusOK {
+			t.Errorf("IP1 request %d: expected status 200, got %d", i+1, rr.Code)
+		}
+	}
+
+	// IP1's 3rd request should be rate limited
+	req := httptest.NewRequest("GET", "/test", nil)
+	req.RemoteAddr = "10.0.0.1:12345"
+	rr := httptest.NewRecorder()
+	handler.ServeHTTP(rr, req)
+
+	if rr.Code != http.StatusTooManyRequests {
+		t.Errorf("IP1 request 3: expected status 429, got %d", rr.Code)
+	}
+
+	// IP2 should still be able to make requests (independent rate limit)
+	req2 := httptest.NewRequest("GET", "/test", nil)
+	req2.RemoteAddr = "10.0.0.2:12345"
+	rr2 := httptest.NewRecorder()
+	handler.ServeHTTP(rr2, req2)
+
+	if rr2.Code != http.StatusOK {
+		t.Errorf("IP2 request 1: expected status 200, got %d", rr2.Code)
+	}
+}
+
+func TestRateLimiter_Disabled(t *testing.T) {
+	cfg := RateLimitConfig{
+		Enabled:           false,
+		RequestsPerMinute: 60,
+		BurstSize:         1,
+	}
+	rl := NewRateLimiter(cfg)
+
+	handler := rl.Middleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+
+	// Make many requests - all should succeed when disabled
+	for i := 0; i < 100; i++ {
+		req := httptest.NewRequest("GET", "/test", nil)
+		req.RemoteAddr = "192.168.1.100:12345"
+		rr := httptest.NewRecorder()
+		handler.ServeHTTP(rr, req)
+
+		if rr.Code != http.StatusOK {
+			t.Errorf("Request %d with disabled rate limiter: expected status 200, got %d", i+1, rr.Code)
+		}
+	}
+}
+
+func TestRateLimiter_TTLExpiration(t *testing.T) {
+	cfg := RateLimitConfig{
+		Enabled:           true,
+		RequestsPerMinute: 60,
+		BurstSize:         2,
+	}
+	rl := NewRateLimiter(cfg)
+
+	// Manually set a short TTL for testing
+	rl.ttl = 50 * time.Millisecond
+
+	handler := rl.Middleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+
+	// IP makes 2 requests (fills burst)
+	for i := 0; i < 2; i++ {
+		req := httptest.NewRequest("GET", "/test", nil)
+		req.RemoteAddr = "10.0.0.50:12345"
+		rr := httptest.NewRecorder()
+		handler.ServeHTTP(rr, req)
+
+		if rr.Code != http.StatusOK {
+			t.Errorf("Request %d: expected status 200, got %d", i+1, rr.Code)
+		}
+	}
+
+	// 3rd request should be rate limited
+	req := httptest.NewRequest("GET", "/test", nil)
+	req.RemoteAddr = "10.0.0.50:12345"
+	rr := httptest.NewRecorder()
+	handler.ServeHTTP(rr, req)
+
+	if rr.Code != http.StatusTooManyRequests {
+		t.Errorf("Request 3: expected status 429, got %d", rr.Code)
+	}
+
+	// Wait for TTL to expire
+	time.Sleep(60 * time.Millisecond)
+
+	// New request should succeed (new limiter created after TTL expiration)
+	req2 := httptest.NewRequest("GET", "/test", nil)
+	req2.RemoteAddr = "10.0.0.50:12345"
+	rr2 := httptest.NewRecorder()
+	handler.ServeHTTP(rr2, req2)
+
+	if rr2.Code != http.StatusOK {
+		t.Errorf("Request after TTL: expected status 200, got %d", rr2.Code)
+	}
+}
+
+func TestRateLimiter_ClientIPExtraction(t *testing.T) {
+	rl := NewRateLimiter(RateLimitConfig{Enabled: true, RequestsPerMinute: 60, BurstSize: 10})
+
+	tests := []struct {
+		name       string
+		header     map[string]string
+		remoteAddr string
+		expected   string
+	}{
+		{
+			name:       "X-Forwarded-For single IP",
+			header:     map[string]string{"X-Forwarded-For": "203.0.113.195"},
+			remoteAddr: "127.0.0.1:12345",
+			expected:   "203.0.113.195",
+		},
+		{
+			name:       "X-Forwarded-For multiple IPs",
+			header:     map[string]string{"X-Forwarded-For": "203.0.113.195, 70.41.3.18, 150.172.238.178"},
+			remoteAddr: "127.0.0.1:12345",
+			expected:   "203.0.113.195",
+		},
+		{
+			name:       "X-Real-IP",
+			header:     map[string]string{"X-Real-IP": "203.0.113.50"},
+			remoteAddr: "127.0.0.1:12345",
+			expected:   "203.0.113.50",
+		},
+		{
+			name:       "RemoteAddr with port",
+			header:     map[string]string{},
+			remoteAddr: "203.0.113.100:54321",
+			expected:   "203.0.113.100",
+		},
+		{
+			name:       "RemoteAddr without port",
+			header:     map[string]string{},
+			remoteAddr: "203.0.113.101",
+			expected:   "203.0.113.101",
+		},
+		{
+			name:       "X-Forwarded-For takes precedence over X-Real-IP",
+			header:     map[string]string{"X-Forwarded-For": "203.0.113.200", "X-Real-IP": "203.0.113.201"},
+			remoteAddr: "127.0.0.1:12345",
+			expected:   "203.0.113.200",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			req := httptest.NewRequest("GET", "/test", nil)
+			for k, v := range tt.header {
+				req.Header.Set(k, v)
+			}
+			req.RemoteAddr = tt.remoteAddr
+
+			ip := rl.clientIP(req)
+			if ip != tt.expected {
+				t.Errorf("clientIP() = %q, expected %q", ip, tt.expected)
+			}
+		})
+	}
+}
+
+func TestRateLimiter_ContentTypeHeader(t *testing.T) {
+	cfg := RateLimitConfig{
+		Enabled:           true,
+		RequestsPerMinute: 60,
+		BurstSize:         1,
+	}
+	rl := NewRateLimiter(cfg)
+
+	handler := rl.Middleware(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+
+	// Make 1 request to fill burst
+	req := httptest.NewRequest("GET", "/test", nil)
+	req.RemoteAddr = "192.168.1.200:12345"
+	rr := httptest.NewRecorder()
+	handler.ServeHTTP(rr, req)
+
+	// 2nd request should be rate limited
+	req2 := httptest.NewRequest("GET", "/test", nil)
+	req2.RemoteAddr = "192.168.1.200:12345"
+	rr2 := httptest.NewRecorder()
+	handler.ServeHTTP(rr2, req2)
+
+	if rr2.Code != http.StatusTooManyRequests {
+		t.Fatalf("Expected status 429, got %d", rr2.Code)
+	}
+
+	// Check Content-Type header is JSON
+	contentType := rr2.Header().Get("Content-Type")
+	if contentType != "application/json" {
+		t.Errorf("Expected Content-Type: application/json, got %q", contentType)
+	}
+}
--- a/pkg/server/healthz_test.go
+++ b/pkg/server/healthz_test.go
@@ -0,0 +1,43 @@
+package server
+
+import (
+	"context"
+	"encoding/json"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"dance-lessons-coach/pkg/config"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestHandleHealthz(t *testing.T) {
+	// Setup
+	cfg := &config.Config{}
+	s := NewServer(cfg, context.Background())
+
+	// Create request
+	req := httptest.NewRequest(http.MethodGet, "/api/healthz", nil)
+	w := httptest.NewRecorder()
+
+	// Call handler
+	s.handleHealthz(w, req)
+
+	// Check status code
+	assert.Equal(t, http.StatusOK, w.Code)
+
+	// Check content type
+	assert.Equal(t, "application/json", w.Header().Get("Content-Type"))
+
+	// Decode response
+	var resp HealthzResponse
+	err := json.NewDecoder(w.Body).Decode(&resp)
+	assert.NoError(t, err)
+
+	// Assert fields
+	assert.Equal(t, "healthy", resp.Status)
+	assert.NotEmpty(t, resp.Version)
+	assert.GreaterOrEqual(t, resp.UptimeSeconds, int64(0))
+	assert.NotZero(t, resp.Timestamp)
+}
--- a/pkg/server/server.go
+++ b/pkg/server/server.go
@@ -13,12 +13,14 @@ import (
 	"time"

 	"github.com/go-chi/chi/v5"
-	"github.com/go-chi/chi/v5/middleware"
+	chimiddleware "github.com/go-chi/chi/v5/middleware"
 	"github.com/rs/zerolog/log"
 	httpSwagger "github.com/swaggo/http-swagger"

+	"dance-lessons-coach/pkg/cache"
 	"dance-lessons-coach/pkg/config"
 	"dance-lessons-coach/pkg/greet"
+	"dance-lessons-coach/pkg/middleware"
 	"dance-lessons-coach/pkg/telemetry"
 	"dance-lessons-coach/pkg/user"
 	userapi "dance-lessons-coach/pkg/user/api"
@@ -33,6 +35,28 @@ import (
 //go:embed docs/swagger.json
 var swaggerJSON embed.FS

+// CancelableContext wraps a context.Context and exposes a Cancel() method so
+// that Server.Run() can cancel readiness during graceful shutdown via the type
+// assertion it already performs. Callers that don't need controlled cancellation
+// (tests, CLI) can pass a plain context.Background() — the assertion silently
+// fails and readiness is never explicitly cancelled, which is harmless.
+type CancelableContext struct {
+	context.Context
+	cancel context.CancelFunc
+}
+
+// NewCancelableContext creates a CancelableContext whose Cancel() method will
+// be invoked by Server.Run() at the start of graceful shutdown, before the
+// 1-second readiness propagation window. The returned CancelFunc is a no-op
+// after Cancel() has been called, so it is safe to defer in main.
+func NewCancelableContext(parent context.Context) (*CancelableContext, context.CancelFunc) {
+	ctx, cancel := context.WithCancel(parent)
+	return &CancelableContext{Context: ctx, cancel: cancel}, cancel
+}
+
+// Cancel satisfies the interface checked in Run() and cancels the context.
+func (c *CancelableContext) Cancel() { c.cancel() }
+
 type Server struct {
 	router         *chi.Mux
 	readyCtx       context.Context
@@ -42,10 +66,26 @@ type Server struct {
 	validator      *validation.Validator
 	userRepo       user.UserRepository
 	userService    user.UserService
+	cacheService   cache.Service
+	startedAt      time.Time
 }

 func NewServer(cfg *config.Config, readyCtx context.Context) *Server {
-	// Create validator instance
+	// Initialize default user repository and services (Postgres from cfg)
+	userRepo, userService, err := initializeUserServices(cfg)
+	if err != nil {
+		log.Warn().Err(err).Msg("Failed to initialize user services, user functionality will be disabled")
+	}
+	return NewServerWithUserRepo(cfg, readyCtx, userRepo, userService)
+}
+
+// NewServerWithUserRepo builds a Server with caller-provided userRepo + userService.
+// Used by BDD test infra to inject a per-scenario repository (e.g., one connected
+// to an isolated PostgreSQL schema). Pass nil for both to disable user functionality.
+//
+// The validator + cache services are still built from cfg internally; they don't
+// need per-scenario isolation today.
+func NewServerWithUserRepo(cfg *config.Config, readyCtx context.Context, userRepo user.UserRepository, userService user.UserService) *Server {
 	validator, err := validation.GetValidatorFromConfig(cfg)
 	if err != nil {
 		log.Error().Err(err).Msg("Failed to create validator, continuing without validation")
@@ -53,20 +93,27 @@ func NewServer(cfg *config.Config, readyCtx context.Context) *Server {
 		log.Trace().Msg("Validator created successfully")
 	}

-	// Initialize user repository and services
-	userRepo, userService, err := initializeUserServices(cfg)
-	if err != nil {
-		log.Warn().Err(err).Msg("Failed to initialize user services, user functionality will be disabled")
+	var cacheService cache.Service
+	if cfg.GetCacheEnabled() {
+		cacheService = cache.NewInMemoryService(
+			time.Duration(cfg.GetCacheDefaultTTLSeconds())*time.Second,
+			time.Duration(cfg.GetCacheCleanupIntervalSeconds())*time.Second,
+		)
+		log.Trace().Msg("Cache service initialized")
+	} else {
+		log.Trace().Msg("Cache service disabled")
 	}

 	s := &Server{
-		router:      chi.NewRouter(),
-		readyCtx:    readyCtx,
-		withOTEL:    cfg.GetTelemetryEnabled(),
-		config:      cfg,
-		validator:   validator,
-		userRepo:    userRepo,
-		userService: userService,
+		router:       chi.NewRouter(),
+		readyCtx:     readyCtx,
+		withOTEL:     cfg.GetTelemetryEnabled(),
+		config:       cfg,
+		validator:    validator,
+		userRepo:     userRepo,
+		userService:  userService,
+		cacheService: cacheService,
+		startedAt:    time.Now(),
 	}
 	s.setupRoutes()
 	return s
@@ -78,6 +125,12 @@ func (s *Server) GetAuthService() user.AuthService {
 	return s.userService
 }

+// GetCacheService returns the cache service for test cleanup
+// This allows test suites to flush cache between tests
+func (s *Server) GetCacheService() cache.Service {
+	return s.cacheService
+}
+
 // initializeUserServices initializes the user repository and unified user service
 func initializeUserServices(cfg *config.Config) (user.UserRepository, user.UserService, error) {
 	// Create user repository using PostgreSQL
@@ -86,10 +139,16 @@ func initializeUserServices(cfg *config.Config) (user.UserRepository, user.UserS
 		return nil, nil, fmt.Errorf("failed to create PostgreSQL user repository: %w", err)
 	}

-	// Create JWT config
+	// Create JWT config.
+	// GetTTL is a method value — it captures cfg, so when WatchAndApply
+	// re-unmarshals into the same Config struct on file changes, every
+	// subsequent token generation reads the new TTL (ADR-0023 Phase 2).
+	// ExpirationTime is kept as a static fallback for tests that build
+	// JWTConfig manually without a Config.
 	jwtConfig := user.JWTConfig{
 		Secret:         cfg.GetJWTSecret(),
-		ExpirationTime: time.Hour * 24, // 24 hours
+		ExpirationTime: 24 * time.Hour,
+		GetTTL:         cfg.GetJWTTTL,
 		Issuer:         "dance-lessons-coach",
 	}

@@ -101,7 +160,7 @@ func initializeUserServices(cfg *config.Config) (user.UserRepository, user.UserS

 func (s *Server) setupRoutes() {
 	// Use Zerolog middleware instead of Chi's default logger
-	s.router.Use(middleware.RequestLogger(&middleware.DefaultLogFormatter{
+	s.router.Use(chimiddleware.RequestLogger(&chimiddleware.DefaultLogFormatter{
 		Logger:  &log.Logger,
 		NoColor: false,
 	}))
@@ -115,12 +174,24 @@ func (s *Server) setupRoutes() {
 	// Version endpoint at root level
 	s.router.Get("/api/version", s.handleVersion)

+	// Kubernetes-style health endpoint at root level
+	s.router.Get("/api/healthz", s.handleHealthz)
+
+	// Info endpoint - composite aggregator
+	s.router.Get("/api/info", s.handleInfo)
+
 	// API routes
 	s.router.Route("/api/v1", func(r chi.Router) {
 		r.Use(s.getAllMiddlewares()...)
 		s.registerApiV1Routes(r)
 	})

+	// Admin routes
+	s.router.Route("/api/admin", func(r chi.Router) {
+		r.Use(s.getAllMiddlewares()...)
+		r.Post("/cache/flush", s.handleAdminCacheFlush)
+	})
+
 	// Register v2 routes if enabled
 	if s.config.GetV2Enabled() {
 		s.router.Route("/api/v2", func(r chi.Router) {
@@ -147,8 +218,12 @@ func (s *Server) setupRoutes() {
 }

 func (s *Server) registerApiV1Routes(r chi.Router) {
-	greetService := greet.NewService()
-	greetHandler := greet.NewApiV1GreetHandler(greetService)
+	// Create rate limit middleware
+	rateLimitMiddleware := middleware.NewRateLimiter(middleware.RateLimitConfig{
+		Enabled:           s.config.GetRateLimitEnabled(),
+		RequestsPerMinute: s.config.GetRateLimitRequestsPerMinute(),
+		BurstSize:         s.config.GetRateLimitBurstSize(),
+	})

 	// Create auth middleware if available
 	var authMiddleware *AuthMiddleware
@@ -157,11 +232,14 @@ func (s *Server) registerApiV1Routes(r chi.Router) {
 	}

 	r.Route("/greet", func(r chi.Router) {
+		// Add rate limiting middleware for greet endpoint
+		r.Use(rateLimitMiddleware.Middleware)
 		// Add optional authentication middleware
 		if authMiddleware != nil {
 			r.Use(authMiddleware.Middleware)
 		}
-		greetHandler.RegisterRoutes(r)
+		r.Get("/", s.handleGreetQuery)
+		r.Get("/{name}", s.handleGreetPath)
 	})

 	// Register user authentication routes
@@ -193,8 +271,8 @@ func (s *Server) registerApiV2Routes(r chi.Router) {
 // getAllMiddlewares returns all middleware including OpenTelemetry if enabled
 func (s *Server) getAllMiddlewares() []func(http.Handler) http.Handler {
 	middlewares := []func(http.Handler) http.Handler{
-		middleware.StripSlashes,
-		middleware.Recoverer,
+		chimiddleware.StripSlashes,
+		chimiddleware.Recoverer,
 	}

 	if s.withOTEL {
@@ -314,26 +392,283 @@ func (s *Server) handleVersion(w http.ResponseWriter, r *http.Request) {
 		format = "plain" // default format
 	}

+	// Check cache if enabled
+	cacheKey := "version:" + format
+	if s.cacheService != nil {
+		if cached, ok := s.cacheService.Get(cacheKey); ok {
+			log.Trace().Str("cache_key", cacheKey).Msg("Cache hit for version")
+			w.Header().Set("Content-Type", "text/plain")
+			if format == "json" {
+				w.Header().Set("Content-Type", "application/json")
+			}
+			w.Write([]byte(cached.(string)))
+			return
+		}
+	}
+
+	// Build response
+	var response string
 	switch format {
 	case "plain":
 		w.Header().Set("Content-Type", "text/plain")
-		w.Write([]byte(version.Short()))
+		response = version.Short()
 	case "full":
 		w.Header().Set("Content-Type", "text/plain")
-		w.Write([]byte(version.Full()))
+		response = version.Full()
 	case "json":
 		w.Header().Set("Content-Type", "application/json")
-		jsonResponse := fmt.Sprintf(`{
+		response = fmt.Sprintf(`{
 			"version": "%s",
 			"commit": "%s",
 			"built": "%s",
 			"go": "%s"
 		}`, version.Version, version.Commit, version.Date, version.GoVersion)
-		w.Write([]byte(jsonResponse))
 	default:
 		w.Header().Set("Content-Type", "text/plain")
-		w.Write([]byte(version.Short()))
+		response = version.Short()
 	}
+
+	// Cache the response for 60 seconds if cache is enabled
+	if s.cacheService != nil {
+		s.cacheService.Set(cacheKey, response, 60*time.Second)
+		log.Trace().Str("cache_key", cacheKey).Msg("Cached version response")
+	}
+
+	w.Write([]byte(response))
+}
+
+// HealthzResponse represents the Kubernetes-style health check response
+type HealthzResponse struct {
+	Status        string    `json:"status"`
+	Version       string    `json:"version"`
+	UptimeSeconds int64     `json:"uptime_seconds"`
+	Timestamp     time.Time `json:"timestamp"`
+}
+
+// InfoResponse represents the JSON response for /api/info
+type InfoResponse struct {
+	Version       string `json:"version"`
+	CommitShort   string `json:"commit_short"`
+	BuildDate     string `json:"build_date"`
+	UptimeSeconds int64  `json:"uptime_seconds"`
+	CacheEnabled  bool   `json:"cache_enabled"`
+	HealthzStatus string `json:"healthz_status"`
+}
+
+// handleHealthz godoc
+//
+//	@Summary		Kubernetes-style health check
+//	@Description	Returns rich health info for liveness/readiness probes
+//	@Tags			System/Health
+//	@Produce		json
+//	@Success		200	{object}	HealthzResponse
+//	@Router			/healthz [get]
+func (s *Server) handleHealthz(w http.ResponseWriter, r *http.Request) {
+	log.Trace().Msg("Healthz check requested")
+	resp := HealthzResponse{
+		Status:        "healthy",
+		Version:       version.Version,
+		UptimeSeconds: int64(time.Since(s.startedAt).Seconds()),
+		Timestamp:     time.Now().UTC(),
+	}
+	w.Header().Set("Content-Type", "application/json")
+	json.NewEncoder(w).Encode(resp)
+}
+
+// handleInfo godoc
+//
+//	@Summary		Get composite info
+//	@Description	Returns aggregated version, build, uptime, cache, and health info
+//	@Tags			System/Info
+//	@Produce		json
+//	@Success		200	{object}	InfoResponse
+//	@Router			/info [get]
+func (s *Server) handleInfo(w http.ResponseWriter, r *http.Request) {
+	log.Trace().Msg("Info endpoint requested")
+
+	// Build commit_short from version.Commit (first 8 chars if available)
+	commitShort := version.Commit
+	if len(commitShort) > 8 {
+		commitShort = commitShort[:8]
+	}
+
+	// Build response
+	resp := InfoResponse{
+		Version:       version.Version,
+		CommitShort:   commitShort,
+		BuildDate:     version.Date,
+		UptimeSeconds: int64(time.Since(s.startedAt).Seconds()),
+		CacheEnabled:  s.cacheService != nil,
+		HealthzStatus: "healthy",
+	}
+
+	// Cache key
+	cacheKey := "info:json"
+
+	// Check cache if enabled
+	if s.cacheService != nil {
+		if cached, ok := s.cacheService.Get(cacheKey); ok {
+			log.Trace().Str("cache_key", cacheKey).Msg("Cache hit for info")
+			w.Header().Set("Content-Type", "application/json")
+			w.Header().Set("X-Cache", "HIT")
+			w.Write([]byte(cached.(string)))
+			return
+		}
+	}
+
+	// Marshal response
+	data, err := json.Marshal(resp)
+	if err != nil {
+		http.Error(w, `{"error":"server_error"}`, http.StatusInternalServerError)
+		return
+	}
+
+	// Cache the response
+	if s.cacheService != nil {
+		s.cacheService.Set(cacheKey, string(data),
+			time.Duration(s.config.GetCacheDefaultTTLSeconds())*time.Second)
+		w.Header().Set("X-Cache", "MISS")
+		log.Trace().Str("cache_key", cacheKey).Msg("Cached info response")
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	w.Write(data)
+}
+
+// handleGreetQuery godoc
+//
+//	@Summary		Get greeting with cache
+//	@Description	Returns greeting for name from query param with caching
+//	@Tags			API/v1/Greeting
+//	@Accept			json
+//	@Produce		json
+//	@Param			name	query		string				false	"Name to greet"
+//	@Success		200		{object}	map[string]string	"Greeting message"
+//	@Failure		400		{object}	map[string]string	"Invalid request"
+//	@Router			/v1/greet [get]
+func (s *Server) handleGreetQuery(w http.ResponseWriter, r *http.Request) {
+	name := r.URL.Query().Get("name")
+	cacheKey := "greet:v1:" + name
+
+	// Check cache if enabled
+	if s.cacheService != nil {
+		if cached, ok := s.cacheService.Get(cacheKey); ok {
+			log.Trace().Str("cache_key", cacheKey).Msg("Cache hit for greet")
+			w.Header().Set("Content-Type", "application/json")
+			w.Header().Set("X-Cache", "HIT")
+			w.Write([]byte(cached.(string)))
+			return
+		}
+	}
+
+	// Compute response
+	greetService := greet.NewService()
+	message := greetService.Greet(r.Context(), name)
+	response, err := json.Marshal(map[string]string{"message": message})
+	if err != nil {
+		http.Error(w, `{"error":"server_error"}`, http.StatusInternalServerError)
+		return
+	}
+
+	// Cache the response for 60 seconds if cache is enabled
+	if s.cacheService != nil {
+		s.cacheService.Set(cacheKey, string(response), 60*time.Second)
+		w.Header().Set("X-Cache", "MISS")
+		log.Trace().Str("cache_key", cacheKey).Msg("Cached greet response")
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	w.Write(response)
+}
+
+// handleGreetPath godoc
+//
+//	@Summary		Get personalized greeting with cache
+//	@Description	Returns greeting for name from path param with caching
+//	@Tags			API/v1/Greeting
+//	@Accept			json
+//	@Produce		json
+//	@Param			name	path		string				true	"Name to greet"
+//	@Success		200		{object}	map[string]string	"Greeting message"
+//	@Failure		400		{object}	map[string]string	"Invalid request"
+//	@Router			/v1/greet/{name} [get]
+func (s *Server) handleGreetPath(w http.ResponseWriter, r *http.Request) {
+	name := chi.URLParam(r, "name")
+	cacheKey := "greet:v1:" + name
+
+	// Check cache if enabled
+	if s.cacheService != nil {
+		if cached, ok := s.cacheService.Get(cacheKey); ok {
+			log.Trace().Str("cache_key", cacheKey).Msg("Cache hit for greet")
+			w.Header().Set("Content-Type", "application/json")
+			w.Header().Set("X-Cache", "HIT")
+			w.Write([]byte(cached.(string)))
+			return
+		}
+	}
+
+	// Compute response
+	greetService := greet.NewService()
+	message := greetService.Greet(r.Context(), name)
+	response, err := json.Marshal(map[string]string{"message": message})
+	if err != nil {
+		http.Error(w, `{"error":"server_error"}`, http.StatusInternalServerError)
+		return
+	}
+
+	// Cache the response for 60 seconds if cache is enabled
+	if s.cacheService != nil {
+		s.cacheService.Set(cacheKey, string(response), 60*time.Second)
+		w.Header().Set("X-Cache", "MISS")
+		log.Trace().Str("cache_key", cacheKey).Msg("Cached greet response")
+	}
+
+	w.Header().Set("Content-Type", "application/json")
+	w.Write(response)
+}
+
+// handleAdminCacheFlush godoc
+//
+//	@Summary		Flush cache
+//	@Description	Flushes the entire cache, requires admin authentication
+//	@Tags			API/Admin
+//	@Accept			json
+//	@Produce		json
+//	@Param			X-Admin-Password	header		string					true	"Admin master password"
+//	@Success		200					{object}	map[string]interface{}	"Cache flushed successfully"
+//	@Failure		401					{object}	map[string]string		"Unauthorized"
+//	@Failure		503					{object}	map[string]string		"Cache disabled"
+//	@Router			/admin/cache/flush [post]
+func (s *Server) handleAdminCacheFlush(w http.ResponseWriter, r *http.Request) {
+	if s.cacheService == nil {
+		w.Header().Set("Content-Type", "application/json")
+		w.WriteHeader(http.StatusServiceUnavailable)
+		json.NewEncoder(w).Encode(map[string]string{"error": "cache_disabled"})
+		return
+	}
+
+	// Admin auth - check X-Admin-Password header
+	masterPassword := r.Header.Get("X-Admin-Password")
+	if masterPassword == "" {
+		http.Error(w, `{"error":"unauthorized","message":"Admin password required"}`, http.StatusUnauthorized)
+		return
+	}
+
+	_, err := s.userService.AdminAuthenticate(r.Context(), masterPassword)
+	if err != nil {
+		http.Error(w, `{"error":"unauthorized","message":"Invalid admin password"}`, http.StatusUnauthorized)
+		return
+	}
+
+	itemCount := s.cacheService.ItemCount()
+	s.cacheService.Flush()
+
+	w.Header().Set("Content-Type", "application/json")
+	json.NewEncoder(w).Encode(map[string]interface{}{
+		"flushed":       true,
+		"items_flushed": itemCount,
+		"timestamp":     time.Now().UTC().Format(time.RFC3339),
+	})
 }

 func (s *Server) Router() http.Handler {
@@ -372,6 +707,17 @@ func (s *Server) Run() error {
 	ongoingCtx, stopOngoingGracefully := context.WithCancel(context.Background())
 	defer stopOngoingGracefully()

+	// Start the JWT secret cleanup loop (ADR-0021). The loop runs until rootCtx
+	// is cancelled (graceful shutdown), removing non-primary secrets whose
+	// ExpiresAt is in the past.
+	if s.userService != nil {
+		s.userService.StartJWTSecretCleanupLoop(rootCtx, s.config.GetJWTSecretCleanupInterval())
+	}
+
+	// Start config hot-reload watcher (ADR-0023 Phase 1: logging.level only).
+	// Stops automatically on rootCtx cancellation.
+	s.config.WatchAndApply(rootCtx)
+
 	// Create HTTP server
 	log.Trace().Str("address", s.config.GetServerAddress()).Msg("Server running")

--- a/pkg/telemetry/telemetry.go
+++ b/pkg/telemetry/telemetry.go
@@ -74,6 +74,36 @@ func Shutdown(ctx context.Context, tp *sdktrace.TracerProvider) error {
 	return tp.Shutdown(ctx)
 }

+// ReconfigureTracerProvider rebuilds the global tracer provider with the
+// updated sampler settings (ADR-0023 Phase 3 hot-reload). The previous
+// provider is gracefully shut down so in-flight spans are flushed.
+//
+// No-op if oldTP is nil — telemetry was disabled at startup, hot-reloading
+// it on would require a different code path (out of scope for Phase 3).
+//
+// Returns the new TracerProvider so the caller can track it for the next
+// shutdown / reconfigure cycle. On error the old TP is left in place.
+func (s *Setup) ReconfigureTracerProvider(ctx context.Context, oldTP *sdktrace.TracerProvider) (*sdktrace.TracerProvider, error) {
+	if oldTP == nil {
+		return nil, nil
+	}
+
+	// Build the new provider first — if anything fails we keep the old TP active.
+	newTP, err := s.InitializeTracing(ctx)
+	if err != nil {
+		return nil, err
+	}
+
+	// InitializeTracing already swapped the global provider via otel.SetTracerProvider,
+	// so the new one is now active. Drain the old one so no spans are lost.
+	if shutdownErr := oldTP.Shutdown(ctx); shutdownErr != nil {
+		// Log via the standard logger — zerolog isn't imported in this package.
+		log.Printf("ReconfigureTracerProvider: old TP shutdown failed: %v (new TP is active)", shutdownErr)
+	}
+
+	return newTP, nil
+}
+
 // getSampler returns the appropriate sampler based on configuration
 func (s *Setup) getSampler() sdktrace.Sampler {
 	switch s.SamplerType {
--- a/pkg/telemetry/telemetry_test.go
+++ b/pkg/telemetry/telemetry_test.go
@@ -0,0 +1,93 @@
+package telemetry
+
+// All tests in this file mutate the OpenTelemetry global tracer provider via
+// otel.SetTracerProvider (called by InitializeTracing / ReconfigureTracerProvider).
+// They MUST NOT be parallelized — t.Parallel() would race on the global state.
+
+import (
+	"context"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+	"go.opentelemetry.io/otel"
+)
+
+// TestReconfigureTracerProvider_NilOldNoOp confirms that hot-reload is a
+// no-op when telemetry was never initialized at startup. Hot-reloading
+// telemetry-on requires a different code path that's out of scope for
+// ADR-0023 Phase 3.
+func TestReconfigureTracerProvider_NilOldNoOp(t *testing.T) {
+	s := &Setup{
+		ServiceName:  "test",
+		OTLPEndpoint: "localhost:4317",
+		Insecure:     true,
+		SamplerType:  "always_on",
+		SamplerRatio: 1.0,
+		Version:      "test",
+	}
+	newTP, err := s.ReconfigureTracerProvider(context.Background(), nil)
+	assert.NoError(t, err)
+	assert.Nil(t, newTP)
+}
+
+// TestReconfigureTracerProvider_SwapsGlobal confirms that after a successful
+// reconfigure, the global otel tracer provider points to the new one and the
+// old one was shut down (Shutdown returns nil even after a second Shutdown,
+// so we just verify no error path hit).
+func TestReconfigureTracerProvider_SwapsGlobal(t *testing.T) {
+	ctx := context.Background()
+	s := &Setup{
+		ServiceName:  "test",
+		OTLPEndpoint: "localhost:4317",
+		Insecure:     true,
+		SamplerType:  "always_on",
+		SamplerRatio: 1.0,
+		Version:      "test",
+	}
+
+	oldTP, err := s.InitializeTracing(ctx)
+	require.NoError(t, err)
+	require.NotNil(t, oldTP)
+	t.Cleanup(func() { _ = oldTP.Shutdown(ctx) }) // belt-and-braces, harmless if already shut down
+
+	// Mutate sampler before reconfigure
+	s.SamplerType = "traceidratio"
+	s.SamplerRatio = 0.25
+
+	newTP, err := s.ReconfigureTracerProvider(ctx, oldTP)
+	require.NoError(t, err)
+	require.NotNil(t, newTP)
+	t.Cleanup(func() { _ = newTP.Shutdown(ctx) })
+
+	// otel.GetTracerProvider returns a TracerProvider interface — pointer equality
+	// against newTP is the strongest assertion available without sdk-private state.
+	gotTP := otel.GetTracerProvider()
+	assert.Same(t, newTP, gotTP, "global tracer provider should be the new TP")
+}
+
+// TestReconfigureTracerProvider_OldShutdownErrorDoesNotFailReconfigure
+// confirms that even if shutting down the old TP fails, the new TP is still
+// returned and active. We simulate this by passing an already-shut-down
+// provider as oldTP — its second Shutdown is harmless on the SDK but
+// exercises the error-tolerance path.
+func TestReconfigureTracerProvider_OldShutdownErrorDoesNotFailReconfigure(t *testing.T) {
+	ctx := context.Background()
+	s := &Setup{
+		ServiceName:  "test",
+		OTLPEndpoint: "localhost:4317",
+		Insecure:     true,
+		SamplerType:  "always_on",
+		SamplerRatio: 1.0,
+		Version:      "test",
+	}
+
+	oldTP, err := s.InitializeTracing(ctx)
+	require.NoError(t, err)
+	_ = oldTP.Shutdown(ctx) // pre-shutdown: subsequent Shutdown is documented to return nil
+
+	newTP, err := s.ReconfigureTracerProvider(ctx, oldTP)
+	require.NoError(t, err)
+	require.NotNil(t, newTP)
+	t.Cleanup(func() { _ = newTP.Shutdown(ctx) })
+}
--- a/pkg/user/auth_service.go
+++ b/pkg/user/auth_service.go
@@ -11,13 +11,30 @@ import (
 	"golang.org/x/crypto/bcrypt"
 )

-// JWTConfig holds JWT configuration
+// JWTConfig holds JWT configuration.
+//
+// GetTTL, when non-nil, is called on every token generation to read the
+// current TTL — this enables ADR-0023 Phase 2 hot-reload of `auth.jwt.ttl`.
+// If nil, ExpirationTime is used as a static fallback.
 type JWTConfig struct {
 	Secret         string
 	ExpirationTime time.Duration
+	GetTTL         func() time.Duration
 	Issuer         string
 }

+// effectiveTTL returns the live TTL: GetTTL() when wired, else
+// ExpirationTime as a static fallback (used by tests that don't go
+// through the server-level wiring).
+func (c JWTConfig) effectiveTTL() time.Duration {
+	if c.GetTTL != nil {
+		if ttl := c.GetTTL(); ttl > 0 {
+			return ttl
+		}
+	}
+	return c.ExpirationTime
+}
+
 // userServiceImpl implements the unified UserService interface
 type userServiceImpl struct {
 	repo           UserRepository
@@ -69,7 +86,7 @@ func (s *userServiceImpl) GenerateJWT(ctx context.Context, user *User) (string,
 		"sub":   user.ID,
 		"name":  user.Username,
 		"admin": user.IsAdmin,
-		"exp":   time.Now().Add(s.jwtConfig.ExpirationTime).Unix(),
+		"exp":   time.Now().Add(s.jwtConfig.effectiveTTL()).Unix(),
 		"iat":   time.Now().Unix(),
 		"iss":   s.jwtConfig.Issuer,
 	}
@@ -218,6 +235,18 @@ func (s *userServiceImpl) ResetJWTSecrets() {
 	s.secretManager.Reset(s.jwtConfig.Secret)
 }

+// StartJWTSecretCleanupLoop delegates to the underlying secret manager to
+// start the periodic cleanup goroutine described in ADR-0021.
+func (s *userServiceImpl) StartJWTSecretCleanupLoop(ctx context.Context, interval time.Duration) {
+	s.secretManager.StartCleanupLoop(ctx, interval)
+}
+
+// RemoveExpiredJWTSecrets triggers an immediate cleanup pass via the
+// underlying secret manager. Returns the count of removed expired secrets.
+func (s *userServiceImpl) RemoveExpiredJWTSecrets() int {
+	return s.secretManager.RemoveExpiredSecrets()
+}
+
 // UserExists checks if a user exists by username
 func (s *userServiceImpl) UserExists(ctx context.Context, username string) (bool, error) {
 	return s.repo.UserExists(ctx, username)
--- a/pkg/user/jwt_manager.go
+++ b/pkg/user/jwt_manager.go
@@ -1,7 +1,11 @@
 package user

 import (
+	"context"
+	"sync"
 	"time"
+
+	"github.com/rs/zerolog/log"
 )

 // JWTSecret represents a JWT secret with metadata
@@ -12,10 +16,16 @@ type JWTSecret struct {
 	ExpiresAt *time.Time // Optional expiration time
 }

-// JWTSecretManager manages multiple JWT secrets for rotation
+// JWTSecretManager manages multiple JWT secrets for rotation.
+// All operations are mutex-protected so the cleanup goroutine
+// (StartCleanupLoop) can run alongside Generate / Validate calls.
+// ADR-0021 implements automatic removal of expired secrets while
+// always preserving the primary secret.
 type JWTSecretManager struct {
+	mu            sync.Mutex
 	secrets       []JWTSecret
 	primarySecret string
+	cleanupCancel context.CancelFunc
 }

 // NewJWTSecretManager creates a new JWT secret manager
@@ -34,12 +44,19 @@ func NewJWTSecretManager(initialSecret string) *JWTSecretManager {

 // AddSecret adds a new JWT secret
 func (m *JWTSecretManager) AddSecret(secret string, isPrimary bool, expiresIn time.Duration) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	m.addSecretLocked(secret, isPrimary, expiresIn)
+}
+
+// addSecretLocked is the internal helper that assumes the mutex is held.
+func (m *JWTSecretManager) addSecretLocked(secret string, isPrimary bool, expiresIn time.Duration) {
 	var expiresAt *time.Time
 	if expiresIn > 0 {
 		expirationTime := time.Now().Add(expiresIn)
 		expiresAt = &expirationTime
 	}
-	// If expiresIn is 0 or negative, expiresAt remains nil (no expiration)
+	// expiresIn <= 0 means no expiration

 	m.secrets = append(m.secrets, JWTSecret{
 		Secret:    secret,
@@ -55,48 +72,60 @@ func (m *JWTSecretManager) AddSecret(secret string, isPrimary bool, expiresIn ti

 // RotateToSecret rotates to a new primary secret
 func (m *JWTSecretManager) RotateToSecret(newSecret string) {
-	// Mark existing primary as non-primary
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
 	for i, secret := range m.secrets {
 		if secret.IsPrimary {
 			m.secrets[i].IsPrimary = false
 			break
 		}
 	}
-
-	// Add new secret as primary
-	m.AddSecret(newSecret, true, 0) // No expiration for primary
+	m.addSecretLocked(newSecret, true, 0)
 }

 // GetPrimarySecret returns the current primary secret
 func (m *JWTSecretManager) GetPrimarySecret() string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
 	return m.primarySecret
 }

 // GetAllValidSecrets returns all valid (non-expired) secrets
 func (m *JWTSecretManager) GetAllValidSecrets() []JWTSecret {
-	var validSecrets []JWTSecret
-	now := time.Now()
+	m.mu.Lock()
+	defer m.mu.Unlock()

+	now := time.Now()
+	valid := make([]JWTSecret, 0, len(m.secrets))
 	for _, secret := range m.secrets {
 		if secret.ExpiresAt == nil || secret.ExpiresAt.After(now) {
-			validSecrets = append(validSecrets, secret)
+			valid = append(valid, secret)
 		}
 	}
-
-	return validSecrets
+	return valid
 }

 // GetSecretByIndex returns a secret by index for testing
 func (m *JWTSecretManager) GetSecretByIndex(index int) (string, bool) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
 	if index < 0 || index >= len(m.secrets) {
 		return "", false
 	}
 	return m.secrets[index].Secret, true
 }

-// Reset resets the secret manager to its initial state with only the primary secret
-// This is useful for test cleanup to ensure tests don't interfere with each other
+// Reset resets the secret manager to its initial state with only the primary
+// secret. Used for test cleanup so tests don't interfere with each other.
 func (m *JWTSecretManager) Reset(initialSecret string) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	if m.cleanupCancel != nil {
+		m.cleanupCancel()
+		m.cleanupCancel = nil
+	}
 	m.secrets = []JWTSecret{
 		{
 			Secret:    initialSecret,
@@ -106,3 +135,64 @@ func (m *JWTSecretManager) Reset(initialSecret string) {
 	}
 	m.primarySecret = initialSecret
 }
+
+// RemoveExpiredSecrets drops every non-primary secret whose ExpiresAt is
+// non-nil and in the past. Returns the count of secrets removed.
+// The primary secret is never removed regardless of expiration (ADR-0021).
+func (m *JWTSecretManager) RemoveExpiredSecrets() int {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	now := time.Now()
+	kept := make([]JWTSecret, 0, len(m.secrets))
+	removed := 0
+	for _, secret := range m.secrets {
+		if !secret.IsPrimary && secret.ExpiresAt != nil && !secret.ExpiresAt.After(now) {
+			removed++
+			continue
+		}
+		kept = append(kept, secret)
+	}
+	m.secrets = kept
+	return removed
+}
+
+// StartCleanupLoop spawns a goroutine that calls RemoveExpiredSecrets at the
+// given interval. Stops when the parent context is cancelled. Calling again
+// cancels the previous loop's context and starts a fresh one.
+// If interval <= 0, the loop is disabled (cleanup must be triggered manually
+// via RemoveExpiredSecrets).
+func (m *JWTSecretManager) StartCleanupLoop(ctx context.Context, interval time.Duration) {
+	m.mu.Lock()
+	if m.cleanupCancel != nil {
+		m.cleanupCancel()
+	}
+	loopCtx, cancel := context.WithCancel(ctx)
+	m.cleanupCancel = cancel
+	m.mu.Unlock()
+
+	if interval <= 0 {
+		log.Warn().Dur("interval", interval).Msg("JWT secret cleanup interval is non-positive, loop disabled")
+		return
+	}
+
+	go func() {
+		ticker := time.NewTicker(interval)
+		defer ticker.Stop()
+		log.Info().Dur("interval", interval).Msg("JWT secret cleanup loop started")
+		for {
+			select {
+			case <-loopCtx.Done():
+				log.Info().Msg("JWT secret cleanup loop stopped")
+				return
+			case <-ticker.C:
+				removed := m.RemoveExpiredSecrets()
+				if removed > 0 {
+					log.Info().Int("removed", removed).Msg("JWT secrets cleaned up")
+				} else {
+					log.Trace().Msg("JWT cleanup tick: no expired secrets")
+				}
+			}
+		}
+	}()
+}
--- a/pkg/user/jwt_manager_test.go
+++ b/pkg/user/jwt_manager_test.go
@@ -1,6 +1,7 @@
 package user

 import (
+	"context"
 	"testing"
 	"time"

@@ -84,3 +85,73 @@ func TestJWTSecretExpiration(t *testing.T) {
 	}
 	assert.True(t, foundExpiring)
 }
+
+// TestRemoveExpiredSecrets_ExpiredNonPrimaryRemoved confirms that
+// RemoveExpiredSecrets drops a non-primary secret whose ExpiresAt is in the past.
+func TestRemoveExpiredSecrets_ExpiredNonPrimaryRemoved(t *testing.T) {
+	manager := NewJWTSecretManager("primary")
+
+	// Add a secret that expired 1 hour ago by setting expiresIn to a small
+	// positive duration then mutating after via AddSecret + manipulation.
+	// Simpler: add with a 1ns lifetime and sleep 2ns equivalent (tiny TTL).
+	manager.AddSecret("about-to-expire", false, 1*time.Nanosecond)
+	time.Sleep(5 * time.Millisecond)
+
+	removed := manager.RemoveExpiredSecrets()
+	assert.Equal(t, 1, removed, "one expired secret should be removed")
+
+	secrets := manager.GetAllValidSecrets()
+	assert.Len(t, secrets, 1, "only primary should remain")
+	assert.Equal(t, "primary", secrets[0].Secret)
+	assert.True(t, secrets[0].IsPrimary)
+}
+
+// TestRemoveExpiredSecrets_PrimaryNeverRemoved confirms the primary secret
+// is preserved even if (somehow) marked expired - ADR-0021 invariant.
+func TestRemoveExpiredSecrets_PrimaryNeverRemoved(t *testing.T) {
+	manager := NewJWTSecretManager("primary")
+
+	// Add a non-primary that doesn't expire
+	manager.AddSecret("kept", false, 0)
+
+	// Simulate an "expired primary" by manipulating internals via Reset then
+	// re-creating - here we rely on the public contract: primary has no
+	// ExpiresAt by default. Confirm cleanup leaves it.
+	removed := manager.RemoveExpiredSecrets()
+	assert.Equal(t, 0, removed)
+
+	assert.Equal(t, "primary", manager.GetPrimarySecret())
+}
+
+// TestRemoveExpiredSecrets_NonExpiredKept confirms a future-expiring secret
+// stays after cleanup.
+func TestRemoveExpiredSecrets_NonExpiredKept(t *testing.T) {
+	manager := NewJWTSecretManager("primary")
+	manager.AddSecret("future", false, 1*time.Hour)
+
+	removed := manager.RemoveExpiredSecrets()
+	assert.Equal(t, 0, removed)
+	assert.Len(t, manager.GetAllValidSecrets(), 2)
+}
+
+// TestStartCleanupLoop_FiresAndStops confirms the goroutine actually calls
+// RemoveExpiredSecrets on each tick and stops cleanly when the context is
+// cancelled. Uses a short interval to keep the test fast.
+func TestStartCleanupLoop_FiresAndStops(t *testing.T) {
+	manager := NewJWTSecretManager("primary")
+	manager.AddSecret("dies", false, 5*time.Millisecond)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	manager.StartCleanupLoop(ctx, 10*time.Millisecond)
+
+	// Wait long enough for at least one tick + the secret's TTL
+	time.Sleep(50 * time.Millisecond)
+
+	cancel() // stop the loop
+
+	secrets := manager.GetAllValidSecrets()
+	assert.Len(t, secrets, 1, "expired secret should have been removed by the loop")
+	assert.Equal(t, "primary", secrets[0].Secret)
+}
--- a/pkg/user/postgres_repository.go
+++ b/pkg/user/postgres_repository.go
@@ -125,6 +125,75 @@ func NewPostgresRepository(cfg *config.Config) (*PostgresRepository, error) {
 	return repo, nil
 }

+// NewPostgresRepositoryFromDSN creates a PostgresRepository connected via the given DSN
+// and runs AutoMigrate against it. Used by BDD test infra to create a per-scenario
+// repository pointing at an isolated schema (the DSN typically includes search_path=<schema>).
+//
+// Pass the same cfg used elsewhere (it is required by methods that read pool settings),
+// but the DSN passed here OVERRIDES the host/port/dbname/etc that cfg would have built.
+func NewPostgresRepositoryFromDSN(cfg *config.Config, dsn string) (*PostgresRepository, error) {
+	repo := &PostgresRepository{
+		config:     cfg,
+		spanPrefix: "user.repo.",
+	}
+
+	gormLogger := logger.New(
+		log.New(os.Stderr, "\n", log.LstdFlags),
+		logger.Config{
+			SlowThreshold:             time.Second,
+			LogLevel:                  logger.Warn,
+			IgnoreRecordNotFoundError: true,
+			Colorful:                  false,
+		},
+	)
+
+	db, err := gorm.Open(postgres.Open(dsn), &gorm.Config{Logger: gormLogger})
+	if err != nil {
+		return nil, fmt.Errorf("failed to connect to PostgreSQL with custom DSN: %w", err)
+	}
+
+	sqlDB, err := db.DB()
+	if err != nil {
+		return nil, fmt.Errorf("failed to get sql.DB from gorm: %w", err)
+	}
+	sqlDB.SetMaxOpenConns(cfg.GetDatabaseMaxOpenConns())
+	sqlDB.SetMaxIdleConns(cfg.GetDatabaseMaxIdleConns())
+	sqlDB.SetConnMaxLifetime(cfg.GetDatabaseConnMaxLifetime())
+
+	if err := db.AutoMigrate(&User{}); err != nil {
+		return nil, fmt.Errorf("failed to auto-migrate via custom DSN: %w", err)
+	}
+
+	repo.db = db
+	return repo, nil
+}
+
+// BuildSchemaIsolatedDSN returns a Postgres DSN that targets the given schema via
+// the search_path connection parameter. Use with NewPostgresRepositoryFromDSN to
+// get a repository whose connection only sees the per-scenario schema.
+func BuildSchemaIsolatedDSN(cfg *config.Config, schemaName string) string {
+	return fmt.Sprintf(
+		"host=%s port=%d user=%s password=%s dbname=%s sslmode=%s search_path=%s",
+		cfg.GetDatabaseHost(),
+		cfg.GetDatabasePort(),
+		cfg.GetDatabaseUser(),
+		cfg.GetDatabasePassword(),
+		cfg.GetDatabaseName(),
+		cfg.GetDatabaseSSLMode(),
+		schemaName,
+	)
+}
+
+// Exec runs a raw SQL statement against the repository's connection.
+// Used by BDD test infra for schema lifecycle (CREATE SCHEMA / DROP SCHEMA).
+// Avoid in production code paths -- prefer the typed Repository methods.
+func (r *PostgresRepository) Exec(sql string) error {
+	if r.db == nil {
+		return fmt.Errorf("Exec called on PostgresRepository with nil db")
+	}
+	return r.db.Exec(sql).Error
+}
+
 // initializeDatabase sets up the PostgreSQL database connection and runs migrations
 func (r *PostgresRepository) initializeDatabase() error {
 	// Configure GORM logger based on config
--- a/pkg/user/postgres_repository_isolated_test.go
+++ b/pkg/user/postgres_repository_isolated_test.go
@@ -0,0 +1,118 @@
+package user
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"testing"
+
+	"dance-lessons-coach/pkg/config"
+
+	_ "github.com/lib/pq"
+)
+
+// TestNewPostgresRepositoryFromDSN_SchemaIsolation verifies that the factory
+// + BuildSchemaIsolatedDSN combo produces a repository whose AutoMigrate
+// creates the users table inside a per-scenario schema (NOT public).
+//
+// This is the foundation block for parallel-safe BDD tests (T12).
+// Wiring it into the BDD testserver's SetupScenarioSchema is a follow-up.
+//
+// Skipped if Postgres is not available (no env vars / connection refused).
+func TestNewPostgresRepositoryFromDSN_SchemaIsolation(t *testing.T) {
+	host := os.Getenv("DLC_DATABASE_HOST")
+	if host == "" {
+		t.Skip("DLC_DATABASE_HOST not set, skipping integration test")
+	}
+
+	cfg := &config.Config{}
+	cfg.Database.Host = host
+	cfg.Database.Port = parsePortOrDefault(os.Getenv("DLC_DATABASE_PORT"), 5432)
+	cfg.Database.User = envOr("DLC_DATABASE_USER", "postgres")
+	cfg.Database.Password = envOr("DLC_DATABASE_PASSWORD", "postgres")
+	cfg.Database.Name = envOr("DLC_DATABASE_NAME", "dance_lessons_coach_bdd_test")
+	cfg.Database.SSLMode = envOr("DLC_DATABASE_SSL_MODE", "disable")
+
+	schemaName := "test_isolated_dsn_factory"
+
+	// Open default repo (public schema) just to manage the schema lifecycle
+	defaultRepo, err := NewPostgresRepository(cfg)
+	if err != nil {
+		t.Skipf("Postgres unavailable: %v", err)
+	}
+	defer defaultRepo.Close()
+
+	// Drop schema if it exists from a previous run
+	if err := defaultRepo.db.Exec(fmt.Sprintf("DROP SCHEMA IF EXISTS %s CASCADE", schemaName)).Error; err != nil {
+		t.Fatalf("DROP SCHEMA setup failed: %v", err)
+	}
+	defer func() {
+		_ = defaultRepo.db.Exec(fmt.Sprintf("DROP SCHEMA IF EXISTS %s CASCADE", schemaName)).Error
+	}()
+
+	// CREATE SCHEMA
+	if err := defaultRepo.db.Exec(fmt.Sprintf("CREATE SCHEMA %s", schemaName)).Error; err != nil {
+		t.Fatalf("CREATE SCHEMA failed: %v", err)
+	}
+
+	// Now use the factory to open a repo whose connection has search_path=schemaName
+	dsn := BuildSchemaIsolatedDSN(cfg, schemaName)
+	isolatedRepo, err := NewPostgresRepositoryFromDSN(cfg, dsn)
+	if err != nil {
+		t.Fatalf("NewPostgresRepositoryFromDSN failed: %v", err)
+	}
+	defer isolatedRepo.Close()
+
+	// Verify the users table now exists in our schema (not just in public)
+	var count int64
+	q := fmt.Sprintf("SELECT count(*) FROM information_schema.tables WHERE table_schema='%s' AND table_name='users'", schemaName)
+	if err := isolatedRepo.db.Raw(q).Scan(&count).Error; err != nil {
+		t.Fatalf("information_schema query failed: %v", err)
+	}
+	if count != 1 {
+		t.Fatalf("expected users table in schema %s after AutoMigrate, count=%d", schemaName, count)
+	}
+
+	// Verify a CreateUser via the isolated repo writes into the new schema, NOT public
+	u := &User{Username: "isolated_factory_user", PasswordHash: "x"}
+	if err := isolatedRepo.CreateUser(context.Background(), u); err != nil {
+		t.Fatalf("CreateUser via isolated repo failed: %v", err)
+	}
+
+	var publicCount int64
+	if err := defaultRepo.db.Raw(fmt.Sprintf("SELECT count(*) FROM public.users WHERE username='%s'", u.Username)).Scan(&publicCount).Error; err != nil {
+		t.Fatalf("query public.users failed: %v", err)
+	}
+	if publicCount != 0 {
+		t.Fatalf("isolation leak: expected 0 rows in public.users for username=%s, got %d", u.Username, publicCount)
+	}
+
+	var schemaCount int64
+	if err := isolatedRepo.db.Raw(fmt.Sprintf("SELECT count(*) FROM %s.users WHERE username='%s'", schemaName, u.Username)).Scan(&schemaCount).Error; err != nil {
+		t.Fatalf("query schema users failed: %v", err)
+	}
+	if schemaCount != 1 {
+		t.Fatalf("expected 1 row in %s.users, got %d", schemaName, schemaCount)
+	}
+}
+
+// envOr returns the env var value or the fallback if empty.
+func envOr(key, fallback string) string {
+	if v := os.Getenv(key); v != "" {
+		return v
+	}
+	return fallback
+}
+
+// parsePortOrDefault parses a port string or returns the fallback.
+func parsePortOrDefault(s string, fallback int) int {
+	if s == "" {
+		return fallback
+	}
+	var n int
+	_, err := fmt.Sscanf(s, "%d", &n)
+	if err != nil || n <= 0 {
+		return fallback
+	}
+	return n
+}
--- a/pkg/user/user.go
+++ b/pkg/user/user.go
@@ -43,6 +43,15 @@ type AuthService interface {
 	RotateJWTSecret(newSecret string)
 	GetJWTSecretByIndex(index int) (string, bool)
 	ResetJWTSecrets() // Reset JWT secrets to initial state for test cleanup
+	// StartJWTSecretCleanupLoop starts a goroutine that periodically calls
+	// RemoveExpiredJWTSecrets at the given interval, stopping when ctx is
+	// cancelled. Implements the cleanup half of ADR-0021. interval <= 0
+	// disables the loop.
+	StartJWTSecretCleanupLoop(ctx context.Context, interval time.Duration)
+	// RemoveExpiredJWTSecrets triggers an immediate cleanup pass and returns
+	// the count of removed non-primary expired secrets. Useful for tests
+	// driving cleanup synchronously.
+	RemoveExpiredJWTSecrets() int
 }

 // UserManager defines interface for user management operations
--- a/scripts/run-bdd-tests.sh
+++ b/scripts/run-bdd-tests.sh
@@ -133,7 +133,7 @@ run_tests_with_tags() {
    set +e
    
    # Default tag filter: exclude flaky, todo, and skip scenarios
-    DEFAULT_TAGS="~@flaky && ~@todo && ~@skip"
+    DEFAULT_TAGS="~@flaky && ~@todo && ~@skip && ~@v2"
    
    if [ -n "$tags" ]; then
        # Use godog directly for tag filtering with exclusion
@@ -144,7 +144,21 @@ run_tests_with_tags() {
        # Note: -tags flag in go test is for Go build tags, NOT Godog feature tags
        # We use GODOG_TAGS env var which is read by the test framework
        echo "🚀 Running: GODOG_TAGS=\"${DEFAULT_TAGS}\" go test ./features/..."
-        GODOG_TAGS="$DEFAULT_TAGS" go test ./features/... -v -cover -coverpkg=./... -coverprofile=coverage.out 2>&1 | tee /tmp/bdd_test_output.txt && test_output=$(cat /tmp/bdd_test_output.txt) && rm -f /tmp/bdd_test_output.txt || test_output=$(cat /tmp/bdd_test_output.txt 2>/dev/null || echo "")
+        # When BDD_SCHEMA_ISOLATION=true (T12 architecture):
+        #   each test PACKAGE gets its own isolated PostgreSQL schema with its own
+        #   connection pool + migrations (cf. pkg/bdd/testserver/server.go Start()).
+        #   Packages then run in parallel safely. ~2.85x speedup observed locally.
+        # When unset:
+        #   fall back to -p 1 (sequential). Uses public schema with TRUNCATE-style
+        #   cleanup between scenarios.
+        if [ "${BDD_SCHEMA_ISOLATION:-}" = "true" ]; then
+          PARALLEL_FLAG=""
+          echo "🔀 BDD_SCHEMA_ISOLATION=true → feature packages run in parallel"
+        else
+          PARALLEL_FLAG="-p 1"
+          echo "🐌 BDD_SCHEMA_ISOLATION not set → feature packages run sequentially (-p 1)"
+        fi
+        GODOG_TAGS="$DEFAULT_TAGS" go test ./features/... -v $PARALLEL_FLAG -cover -coverpkg=./... -coverprofile=coverage.out 2>&1 | tee /tmp/bdd_test_output.txt && test_output=$(cat /tmp/bdd_test_output.txt) && rm -f /tmp/bdd_test_output.txt || test_output=$(cat /tmp/bdd_test_output.txt 2>/dev/null || echo "")
        test_exit_code=${PIPESTATUS[0]}
    fi
    
--- a/scripts/start-server.sh
+++ b/scripts/start-server.sh
@@ -4,7 +4,8 @@
 # This script starts the server in the background and provides control functions

 # Configuration
-PROJECT_DIR="/Users/gabrielradureau/Work/Vibe/dance-lessons-coach"
+SCRIPTS_DIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
+PROJECT_DIR=$(dirname "$SCRIPTS_DIR")
 SERVER_CMD="go run ./cmd/server"
 LOG_FILE="server.log"
 PID_FILE="server.pid"
--- a/scripts/test-graceful-shutdown.sh
+++ b/scripts/test-graceful-shutdown.sh
@@ -7,7 +7,8 @@
 set -e

 # Configuration
-PROJECT_DIR="/Users/gabrielradureau/Work/Vibe/dance-lessons-coach"
+SCRIPTS_DIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
+PROJECT_DIR=$(dirname "$SCRIPTS_DIR")
 SERVER_CMD="./scripts/start-server.sh"
 LOG_FILE="server.log"
 PID_FILE="server.pid"
@@ -59,11 +60,40 @@ echo "Response: $GREET_NAME_RESPONSE"
 echo ""
 echo "Stopping server gracefully..."

-# Test readiness during shutdown (in background)
-(curl -s http://localhost:8080/api/ready > /dev/null 2>&1 &)
+# Send SIGTERM once and probe /api/ready during the 1-second propagation window
+# the server holds open (pkg/server/server.go: time.Sleep(1s) after readiness
+# cancel). Previously the curl fired *before* the signal — it always saw "ready".
+# We also avoid calling "$SERVER_CMD stop" afterwards because that would send a
+# second SIGTERM: after signal.NotifyContext is done, the default handler kicks in
+# and the process terminates with a non-JSON "signal: terminated" on stderr.
+SERVER_PID=$(cat "$PID_FILE" 2>/dev/null || echo "")
+if [[ -z "$SERVER_PID" ]]; then
+    echo -e "\033[0;31m❌ FAIL: PID file not found\033[0m"
+    exit 1
+fi

-$SERVER_CMD stop
-sleep 3
+kill -TERM "$SERVER_PID"
+# Brief yield so the signal handler runs and CancelableContext.Cancel() fires
+sleep 0.2
+READY_DURING_SHUTDOWN=$(curl -s -w "\n[HTTP %{http_code}]" http://localhost:8080/api/ready 2>&1 || echo "[connection refused]")
+echo "Readiness during shutdown: $READY_DURING_SHUTDOWN"
+
+# Wait for the process to exit cleanly (up to 30s) without sending another signal
+echo "Waiting for server to exit..."
+for i in {1..30}; do
+    if ! ps -p "$SERVER_PID" > /dev/null 2>&1; then
+        echo "Server stopped successfully"
+        rm -f "$PID_FILE"
+        break
+    fi
+    sleep 1
+done
+if ps -p "$SERVER_PID" > /dev/null 2>&1; then
+    echo -e "\033[0;31m❌ FAIL: Server did not stop within 30s\033[0m"
+    kill -9 "$SERVER_PID" 2>/dev/null || true
+    exit 1
+fi
+sleep 0.5

 echo ""
 echo "Analyzing server logs..."
@@ -201,6 +231,12 @@ fi
 echo ""
 echo -e "\033[0;32m🎉 GRACEFUL SHUTDOWN TEST PASSED!\033[0m"
 echo "All required logs are present and in correct order."
+
+echo ""
+echo "📋 Full server log:"
+echo "==============================="
+cat "$LOG_FILE" | jq -r '"[\(.level | ascii_upcase)] \(.time | tostring) — \(.message)"'
+echo "==============================="
 echo ""

 # Clean up
--- a/scripts/test-opentelemetry.sh
+++ b/scripts/test-opentelemetry.sh
@@ -9,7 +9,8 @@ echo -e "\033[1;34m=== dance-lessons-coach OpenTelemetry Test ===\033[0m"
 echo ""

 # Configuration
-PROJECT_DIR="/Users/gabrielradureau/Work/Vibe/dance-lessons-coach"
+SCRIPTS_DIR=$(dirname "$(realpath "${BASH_SOURCE[0]}")")
+PROJECT_DIR=$(dirname "$SCRIPTS_DIR")
 SERVER_CMD="./scripts/start-server.sh"
 LOG_FILE="server.log"
 PID_FILE="server.pid"