From 1ae28cb944efe0ded4e5ab02ef6449d10e301316 Mon Sep 17 00:00:00 2001 From: Gabriel Radureau Date: Wed, 6 May 2026 12:55:18 +0200 Subject: [PATCH] docs(longhorn): document 2026-04-13 power-cut recovery + add data-recovery tooling Captures the post-mortem of the April 13 power-cut: incident timeline, retrospective, and architecture/role diagrams. Adds an ADR explaining why Longhorn cannot re-associate orphaned replica directories after a nuclear reinstall (engine-id naming), plus block-device recovery runbooks and the `playbooks/recover/longhorn_data.yml` automation that wires `merge-longhorn-layers.py` to rebuild PVCs from raw `volume-head-*.img` chains. Also extends the k3s_pvc backup to capture Longhorn `volumes`/`settings` CRDs (needed for the fast-path restore) and rewrites the restore script with a fallback dir + English messages. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../adr/20260414-longhorn-pvc-recovery.md | 550 ++++++++ .../incidents/2026-04-13-power-cut/README.md | 420 +++++++ .../diagrams/architecture.mmd | 209 ++++ .../diagrams/component-roles.mmd | 200 +++ .../diagrams/incident-flow.mmd | 131 ++ .../incidents/2026-04-13-power-cut/log.md | 1103 +++++++++++++++++ .../pvc-recovery-2026-04-14.md | 416 +++++++ .../2026-04-13-power-cut/recover_longhorn.yml | 70 ++ .../retrospective-recovery-analysis.md | 153 +++ .../tools/merge-longhorn-layers.py | 107 ++ .../factory/docs/incidents/README.md | 312 +++++ .../docs/runbooks/cluster-recovery-agent.md | 244 ++++ .../longhorn-block-device-recovery.md | 360 ++++++ .../factory/playbooks/backup/k3s_pvc.yml | 38 +- .../factory/playbooks/recover/longhorn.yml | 536 ++++++++ .../playbooks/recover/longhorn_data.yml | 914 ++++++++++++++ .../recover/longhorn_data_vars.example.yml | 84 ++ .../recover/longhorn_data_vars_clickhouse.yml | 17 + .../recover/longhorn_data_vars_erp_vault.yml | 38 + .../recover/longhorn_data_vars_remaining.yml | 47 + 20 files changed, 5939 insertions(+), 10 deletions(-) create mode 
100644 ansible/arcodange/factory/docs/adr/20260414-longhorn-pvc-recovery.md create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/README.md create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/architecture.mmd create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/component-roles.mmd create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/incident-flow.mmd create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/log.md create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/pvc-recovery-2026-04-14.md create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/recover_longhorn.yml create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/retrospective-recovery-analysis.md create mode 100644 ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py create mode 100644 ansible/arcodange/factory/docs/incidents/README.md create mode 100644 ansible/arcodange/factory/docs/runbooks/cluster-recovery-agent.md create mode 100644 ansible/arcodange/factory/docs/runbooks/longhorn-block-device-recovery.md create mode 100644 ansible/arcodange/factory/playbooks/recover/longhorn.yml create mode 100644 ansible/arcodange/factory/playbooks/recover/longhorn_data.yml create mode 100644 ansible/arcodange/factory/playbooks/recover/longhorn_data_vars.example.yml create mode 100644 ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_clickhouse.yml create mode 100644 ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_erp_vault.yml create mode 100644 ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_remaining.yml diff --git a/ansible/arcodange/factory/docs/adr/20260414-longhorn-pvc-recovery.md b/ansible/arcodange/factory/docs/adr/20260414-longhorn-pvc-recovery.md new file mode 100644 index 
0000000..9e2e0a5 --- /dev/null +++ b/ansible/arcodange/factory/docs/adr/20260414-longhorn-pvc-recovery.md @@ -0,0 +1,550 @@ +# ADR 20260414: Longhorn PVC Recovery When Reinstalled + +--- + +## šŸ“‹ **Executive Summary** + +After the April 13, 2026 power cut incident and subsequent cluster recovery, we discovered a **critical gap** in Longhorn volume restoration. While the **raw replica data files** (`volume-head-*.img`) remain intact on disk across all nodes, Longhorn cannot automatically **re-associate** them with new Volume CRDs due to its internal engine ID naming scheme. This document explains the problem and provides three recovery approaches. + +--- +--- + +## šŸ” **The Root Problem** + +### **What Happened** + +1. **Power cut** → Longhorn CSI driver lost connection +2. **Force-deletion of Longhorn pods** → Webhook circular dependency +3. **Nuclear cleanup** → All Longhorn CRDs (Volume, Engine, Replica) were deleted +4. **Reinstallation** → New Volume CRDs created with new engine IDs + +### **Directory Structure Issue** + +Longhorn stores replica data in directories named by **volume name + engine ID**: +``` +/mnt/arcodange/longhorn/replicas/ +ā”œā”€ā”€ pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-cd16e459/ # ← OLD (orphaned) +│ ā”œā”€ā”€ volume-head-002.img # ← Actual Traefik data (128Mi) +│ ā”œā”€ā”€ volume-head-002.img.meta +│ └── volume-snap-*.img +│ +ā”œā”€ā”€ pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4/ # ← NEW (empty) +│ ā”œā”€ā”€ volume-head-002.img # ← Empty 128Mi +│ └── volume-head-002.img.meta +└── ... +``` + +**The Problem:** When you recreate a Volume CRD, Longhorn generates a **new engine ID** (e.g., `8c7d8ab4`), creating a **new empty directory** instead of adopting the existing one (`cd16e459`). 
+ +### **Why This Matters** + +| Component | Persistence | Recovery Path | +|-----------|-------------|---------------| +| **Replica `.img` files** | āœ… **Survives** on disk | Manual intervention required | +| **Volume CRD** | āŒ **Deleted** | Must recreate | +| **Engine/Replica CRDs** | āŒ **Deleted** | Auto-recreated by Longhorn | +| **Engine ID** | āŒ **Changes** | **Cannot be recovered without backup** | + +**Without the original Volume CRD backup, Longhorn cannot match orphaned replica directories to new Volume CRDs.** + +--- +--- + +## šŸŽÆ **Recovery Methods Comparison** + +| Method | Complexity | Data Safety | Downtime | Best For | +|--------|------------|-------------|----------|----------| +| **[A: Manual `dd` Copy](#method-a-manual-dd-copy)** | ⭐⭐⭐⭐ | āœ…āœ…āœ…āœ… | Medium | Critical data, no app backup | +| **[B: Directory Rename](#method-b-directory-rename)** | ⭐⭐⭐ | āœ…āœ… | Low | Small volumes, no Rebuilding replicas | +| **[C: Fresh Volume + App Restore](#method-c-fresh-volume--app-restore)** | ⭐⭐ | āœ…āœ…āœ…āœ…āœ… | Low | Non-critical data, app backups exist | +| **[D: Block-Device Injection (Automated)](#method-d-block-device-injection-automated)** | ⭐⭐⭐ | āœ…āœ…āœ…āœ… | Medium | **Recommended — any volume, no dir swap needed** | +| **[E: Longhorn Google Storage Restore](#method-e-longhorn-google-storage-restore)** | ⭐⭐ | āœ…āœ…āœ…āœ…āœ… | Low | Volumes with Longhorn backup configured | + +**Method B was proven risky** (2026-04-13 recovery): Longhorn reconciliation finds `Dirty: true` +metadata + a clean empty pi1 replica → silently rebuilds from the empty source, destroying data. +Use Method D for any volume larger than ~128Mi or with Rebuilding replicas. + +--- +--- + +## šŸ› ļø **Method A: Manual `dd` Copy** + +### **Concept** +Manually copy the data from the orphaned `.img` file to the new replica directory that Longhorn created for the new Volume CRD. 
+ +### **Prerequisites** +- Root access to all nodes +- Volume CRD already recreated (with new engine ID) +- Longhorn has created new empty replica directories +- `dd` and `qemu-img` tools available + +### **Steps** + +```bash +# 1. Identify source (old data) and destination (new empty) +SOURCE_NODE=pi2 +SOURCE_DIR=/mnt/arcodange/longhorn/replicas/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-cd16e459 +SOURCE_IMG=$(ssh $SOURCE_NODE "ls $SOURCE_DIR/volume-head-*.img | head -1") + +DEST_DIRS=( + pi1:/mnt/arcodange/longhorn/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4 + pi2:/mnt/arcodange/longhorn/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4 + pi3:/mnt/arcodange/longhorn/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4 +) + +# 2. Copy data to each node +for DEST in "${DEST_DIRS[@]}"; do + NODE=${DEST%%:*} + DEST_PATH=${DEST#*:} + ssh $NODE "sudo mkdir -p $DEST_PATH && sudo dd if=$SOURCE_IMG of=$DEST_PATH/volume-head-002.img bs=4M" +done + +# 3. Restart Longhorn engine pods to pick up new data +kubectl delete pod -n longhorn-system -l longhorn.io/component=engine + +# 4. 
Verify data is accessible +kubectl get volume -n longhorn-system pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 +# Should show: state=attached, robustness=healthy +``` + +### **Pros** +- āœ… Guaranteed data recovery +- āœ… Works for any volume size +- āœ… Preserves all snapshots and metadata + +### **Cons** +- āš ļø Requires manual intervention on each node +- āš ļø Must know source and destination paths +- āš ļø Risk of data corruption if `dd` fails mid-copy +- āš ļø Volume must be in detached state during copy + +### **Risk Mitigation** +- Verify checksums after copy: `sha256sum /path/to/image.img` +- Copy to one node at a time, verify between each +- Use `pv` for progress: `pv $SOURCE_IMG | ssh $NODE "sudo dd of=$DEST_PATH/volume-head-002.img bs=4M"` + +--- +--- + +## šŸ·ļø **Method B: Directory Rename** + +### **Concept** +Rename the orphaned replica directory to match the **engine ID** that Longhorn expects for the new Volume CRD. + +### **Prerequisites** +- Volume CRD already recreated +- Longhorn has created engine CRDs (check: `kubectl get engines -n longhorn-system`) +- Must act quickly before Longhorn initializes new empty replicas + +### **Steps** + +```bash +# 1. Find the new engine ID for the volume +ENGINE=$(kubectl get engines -n longhorn-system -l longhorn.io/volume=pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 -o jsonpath='{.items[0].metadata.name}') +# Example: pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-e-0 +ENGINE_ID=e-${ENGINE##*-} # Extract suffix: e-0 +# But the directory uses a different format... + +# 2. Check actual directory names +kubectl get replicas -n longhorn-system | grep pvc-cc8a +# Output: pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-r-8c7d8ab4 + +# 3. 
Rename on the node where orphaned data exists +NEW_DIR_SUFFIX=$(kubectl get replicas -n longhorn-system pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-r-8c7d8ab4 -o jsonpath="{.metadata.labels['longhorn\.io/last-attached-node']}") +ssh $NEW_DIR_SUFFIX "sudo mv /mnt/arcodange/longhorn/replicas/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-cd16e459 \ + /mnt/arcodange/longhorn/replicas/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4" + +# 4. Restart the replica pod +kubectl delete pod -n longhorn-system $(kubectl get pods -n longhorn-system -o jsonpath="{.items[?(@.metadata.labels['longhorn\.io/replica']=='pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90')].metadata.name}") +``` + +### **Pros** +- āœ… Fastest method +- āœ… No data copying required +- āœ… Preserves all existing data and snapshots + +### **Cons** +- āš ļø **High risk of mismatch** - wrong directory rename = data loss +- āš ļø Must identify correct engine ID for each node +- āš ļø Replica directories exist on multiple nodes - must rename on ALL +- āš ļø Longhorn may have already initialized new empty replicas + +### **Critical Warning** +**Each volume has replicas on ALL nodes.** You must: +1. Identify which node has which orphaned directory +2. Rename each to match the corresponding new engine's expected path +3. Ensure consistency across all nodes + +**Example for pvc-cc8a:** +```bash +# Orphaned dirs: +# pi2: pvc-cc8a...-cd16e459 +# pi3: pvc-cc8a...-011b54b3 + +# New engine paths (from kubectl get replicas): +# pi1: pvc-cc8a...-r-8c7d8ab4 +# pi2: pvc-cc8a...-r-32aa3e1e +# pi3: pvc-cc8a...-r-3e84c460 + +# Must rename EACH orphaned dir to match new engine on SAME node +``` + +--- +--- + +## šŸ†• **Method C: Fresh Volume + App Restore** *(Recommended for Traefik)* + +### **Concept** +1. Let Longhorn create a **new empty volume** for the PVC +2. 
Restore the **application data** (Traefik's `acme.json`) from application-level backups + +### **Prerequisites** +- Application-level backup exists (e.g., Traefik config, certificates) +- Data is non-critical or easily restorable +- Storage requirements are small (128Mi for Traefik) + +### **Steps** + +```bash +# 1. Delete the problematic Volume CRD (if any) +kubectl delete volume -n longhorn-system pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 --ignore-not-found + +# 2. Delete the PVC +kubectl delete pvc -n kube-system traefik + +# 3. Let StorageClass provision a fresh volume +kubectl apply -f - </dev/null || echo 'No backup found'" + +# Check ArgoCD apps (if Traefik was deployed via GitOps) +kubectl get app -n argocd | grep traefik +``` + +### **Pros** +- āœ… **Simplest and safest** method +- āœ… No risk of Longhorn directory mismatches +- āœ… Works even without Longhorn CRD backups +- āœ… Verifiable - you can confirm data was restored +- āœ… Clean state - no orphaned directories + +### **Cons** +- āš ļø Requires application-level backups +- āš ļø TLS certificates may have expired (need to re-issue) + +--- +--- + +## šŸ† **Recommendation: Method C for Traefik** + +### **Why Method C is Best for This Case** + +| Factor | Assessment | +|--------|------------| +| **Volume Size** | 128Mi (small) | +| **Data Criticality** | TLS certs can be re-generated | +| **Backup Availability** | Likely exists in ArgoCD/Git | +| **Complexity** | Low | +| **Risk** | Minimal | +| **Time Required** | ~5 minutes | + +### **Data Loss Assessment for Traefik** + +The **worst case** (no Traefik backup): +- TLS certificates will be **re-issued** automatically by cert-manager + Let's Encrypt +- No permanent data loss - certificates are ephemeral +- Client impact: Brief TLS warning during re-issuance (~1-2 minutes) + +**Verdict:** 🟢 **Method C is the safest and most practical approach.** + +--- + +## šŸ”§ **Prevention: What We Must Fix** + +### **1. 
Update Backup Playbook** (`playbooks/backup/k3s_pvc.yml`) āœ… Done 2026-04-16 + +`backup_cmd` now captures: +1. All PersistentVolumes (PV) +2. All PersistentVolumeClaims (PVC) +3. **All Longhorn Volumes** (critical — enables fast restore via `kubectl apply` instead of block-device injection) +4. All Longhorn Settings (backup target configuration) + +### **2. Test Backups Regularly** + +```bash +# Monthly test: Restore a non-critical volume +# Pick a test volume, delete it, restore from backup +kubectl delete volume -n longhorn-system +kubectl apply -f +kubectl get volume -n longhorn-system -w +``` + +### **3. Validate Backup Files** + +```bash +# Check backup contains Longhorn resources +grep "longhorn.io/v1beta2" /path/to/backup-*.volumes +grep "kind: Volume" /path/to/backup-*.volumes +``` + +### **4. Document Recovery Procedure** + +- [ ] Create `docs/admin/longhorn-recovery.md` with these steps +- [ ] Add to team runbook +- [ ] Include in incident response training + +--- + +## šŸ“Š **Test Scenario: Battle Testing PVC Recovery** + +### **Test Setup** + +```bash +# 1. Create a test namespace +kubectl create ns longhorn-test + +# 2. Create a test PVC +kubectl apply -f - < /data/testfile.txt && echo 'more data' >> /data/testfile.txt && tail -f /dev/null"] + volumeMounts: + - name: data + mountPath: /data + volumes: + - name: data + persistentVolumeClaim: + claimName: test-longhorn-recovery +EOF + +# 4. Write and verify data +kubectl exec -n longhorn-test test-writer -- cat /data/testfile.txt +# Should show: "test data for recovery\nmore data" + +# 5. Backup everything +kubectl get -A pv,pvc -o yaml > /tmp/test-backup-pv-pvc.yaml +kubectl get -A volumes.longhorn.io -o yaml >> /tmp/test-backup-pv-pvc.yaml +echo '---' >> /tmp/test-backup-pv-pvc.yaml +kubectl get -A settings.longhorn.io -o yaml >> /tmp/test-backup-pv-pvc.yaml +``` + +### **Test Execution: Simulate Disaster** + +```bash +# 6. 
Simulate disaster - delete everything +kubectl delete pvc -n longhorn-test test-longhorn-recovery +kubectl delete pod -n longhorn-test test-writer +kubectl delete volume -n longhorn-system pvc-$(kubectl get pvc -n longhorn-test test-longhorn-recovery -o jsonpath='{.spec.volumeName}') + +# 7. Restore from backup +kubectl apply -f /tmp/test-backup-pv-pvc.yaml + +# 8. Verify recovery +kubectl get pvc -n longhorn-test test-longhorn-recovery +kubectl get volumes -n longhorn-system | grep test-longhorn-recovery + +# 9. Deploy test reader pod +kubectl apply -f - < B[Kubelet Crashes] + A --> C[Docker Daemon Crashes] + B --> D[Longhorn Manager Pods Crash] + B --> E[CSI Driver Registration Lost] + C --> F[Overlay2 Filesystem Corrupt] + D --> G[Driver-Deployer Init Container Waits] + E --> H[CSI Socket Disappears] + G --> I[CSI Driver Not Deployed] + H --> J[CSI Pods Cannot Start] + I --> J + J --> K[PVC Mounts Fail] + K --> L[Application Pods Crash] + F --> M[Docker Containers Fail to Start] + M --> N[CoreDNS Crashes] + M --> O[Service Load Balancers Crash] + N --> P[DNS Resolution Fails] + O --> P + P --> L + K --> L +``` + +### Why Data Is Safe + +The Longhorn volume data is stored in replicas across all three nodes at `/mnt/arcodange/longhorn/replicas/`. Checking the Longhorn volumes shows: + +``` +All 12 volumes: state="attached", robustness="healthy" +``` + +This confirms that: +1. Volume metadata is intact in etcd +2. Replica data is intact on disk +3. Once CSI driver is restored, volumes will be accessible again +4. **No permanent data loss has occurred** + +## Recovery Actions Taken + +### Attempt 1: HelmChart Manifest Touch (15:24:50 - 15:25:50) +**Action:** Touched `/var/lib/rancher/k3s/server/manifests/longhorn-install.yaml` on pi1 + +**Command:** +```bash +ssh pi@pi1 "sudo touch /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml" +``` + +**Outcome:** Only triggered reconcile for 1 pod (longhorn-manager-w85v6). CSI driver still not registered. 
+ +**Decision:** Insufficient. Need more aggressive approach. + +### Attempt 2: Force Delete All Longhorn Pods (15:32:15 - Present) +**Action:** Force deleted all 24 pods in longhorn-system namespace + +**Command:** +```bash +kubectl delete pods -n longhorn-system --all --force --grace-period=0 +``` + +**Outcome:** +- HelmChart controller detected changes and recreated all pods +- **Success**: 23/25 pods now in Running state (15:34:30) +- **Blocking**: `longhorn-driver-deployer` stuck in Init:0/1 +- **Blocking**: All `longhorn-csi-plugin` pods in Error +- **Investigation**: driver-deployer's `wait-longhorn-manager` init container waiting for manager readiness + +### Current Investigation (15:34:30) +**Focus:** Why driver-deployer is stuck in Init state + +The `longhorn-driver-deployer` pod has an init container that waits for Longhorn manager to be ready before deploying the CSI driver. Despite 3 manager pods running, the wait condition is not being met. + +**Hypotheses:** +1. Manager pods are not fully healthy (readiness probes failing) +2. Network connectivity between driver-deployer and managers +3. RBAC or service account permissions issue +4. 
Configuration mismatch in HelmChart values + +## Current Status (2026-04-15) + +### Longhorn System +- **All Longhorn pods**: Running āœ… (reinstalled 2026-04-13) +- **CSI driver**: Registered āœ… + +### Volume Recovery Status + +| PVC | Namespace | Size | Status | +|-----|-----------|------|--------| +| `traefik` (kube-system) | kube-system | 128Mi | āœ… Recovered (2026-04-14) | +| `url-shortener-data` | url-shortener | 128Mi | āœ… Recovered (2026-04-14) | +| `clickhouse-storage-clickhouse-0` | tools | 16Gi | āœ… Recovered (2026-04-14) | +| `prometheus-server` | tools | 8Gi | ā³ In progress (2026-04-15) | +| `storage-prometheus-alertmanager-0` | tools | 2Gi | ā³ In progress (2026-04-15) | +| `redis-storage-redis-0` | tools | 1Gi | ā³ In progress (2026-04-15) | +| `backups-rwx` | longhorn-system | 50Gi | ā³ In progress (2026-04-15) | +| `data-hashicorp-vault-0` | tools | 10Gi | šŸ”“ Deferred — manual recovery | +| `audit-hashicorp-vault-0` | tools | 10Gi | šŸ”“ Deferred — manual recovery | +| `erp` | erp | 50Gi | šŸ”“ Deferred — manual recovery | + +## Next Steps + +### Immediate +1. Confirm prometheus, alertmanager, redis, backups-rwx fully recovered via `longhorn_data.yml` +2. Verify monitoring stack (Grafana dashboards, alert routing) is functional + +### Short-term +3. Manual recovery of Vault (`data-hashicorp-vault-0`, `audit-hashicorp-vault-0`) — see Vault runbook +4. Manual recovery of ERP (`erp`) — coordinate with application owner +5. Update backup playbook to include Longhorn Volume CRDs (see ADR 20260414-longhorn-pvc-recovery) +6. 
Prepare Longhorn Google Storage restore playbook for `backups-rwx` alternative recovery path + +### Long-term +- Implement UPS for the Raspberry Pi cluster +- Add Longhorn volume health monitoring to Grafana +- Regular backup restore drills + +## Architecture Context + +```mermaid +%%{init: { 'theme': 'forest' }}%% +flowchart TB + subgraph K3s Control Plane + A[pi1: Control Plane] -->|runs| B[kubelet] + B --> C[k3s server] + C --> D[HelmChart Controller] + end + + subgraph Storage Layer + E[Longhorn HelmChart] --> F[Longhorn Manager Pods] + F --> G[Driver Deployer] + G --> H[CSI Driver Registration] + H --> I[CSI Socket: /var/lib/kubelet/plugins/driver.longhorn.io/csi.sock] + F --> J[Longhorn Volumes] + J --> K[Replicas on all 3 nodes] + end + + subgraph CSI Components + H --> L[csi-attacher Pods] + H --> M[csi-provisioner Pods] + H --> N[csi-resizer Pods] + H --> O[csi-snapshotter Pods] + H --> P[csi-plugin DaemonSet] + end + + subgraph Data Path + I --> Q[/mnt/arcodange/longhorn/] + Q --> R[replicas/] + end + + subgraph Docker Storage + S[Docker Daemon] --> T[/mnt/arcodange/docker/] + T --> U[overlay2/] + end + + L -->|mounts volumes| V[Application Pods] + M -->|creates volumes| J + P -->|node-level ops| I + + classDef critical fill:#c00,color:#fff,stroke:#000 + classDef healthy fill:#0a0,color:#000,stroke:#000 + classDef degraded fill:#ff0,color:#000,stroke:#000 + + class H,L,M,N,O,P critical + class F,G,E degraded + class I,J,Q,R,U healthy +``` + +## Component Details + +### Longhorn Manager +- **Role**: Primary controller for Longhorn, manages volumes, replicas, snapshots +- **Image**: `longhornio/longhorn-manager:v1.9.1` +- **Ports**: 9500 (manager), 9501 (webhook health), 9502 (metrics) +- **Data Path**: `/mnt/arcodange/longhorn` (configured in HelmChart values) +- **Health Check**: `https://:9501/v1/healthz` + +### Longhorn Driver Deployer +- **Role**: Deploys the CSI driver to each node +- **Image**: `longhornio/longhorn-manager:v1.9.1` +- **Init 
Container**: `wait-longhorn-manager` - waits for manager to be ready +- **Blocker**: Currently stuck in init, preventing CSI driver deployment + +### CSI Driver +- **Role**: Implements the CSI (Container Storage Interface) specification for Longhorn +- **Socket**: `/var/lib/kubelet/plugins/driver.longhorn.io/csi.sock` +- **Registration**: Must be registered with kubelet via CSINode +- **Images**: + - `longhornio/csi-attacher:v4.9.0-20250709` + - `longhornio/csi-provisioner:v5.3.0-20250709` + - `longhornio/csi-resizer:v1.14.0-20250709` + - `longhornio/csi-snapshotter:v8.3.0-20250709` + - `longhornio/csi-node-driver-registrar:v2.14.0-20250709` + +### CSI Node Driver Registrar +- **Role**: Registers the CSI driver with kubelet +- **Image**: `longhornio/csi-node-driver-registrar:v2.14.0-20250709` +- **Mechanism**: Creates a `CSINode` resource and registers via kubelet plugin registry + +## Action Items + +### Immediate (resolved) +- [x] Investigate and resolve driver-deployer init container blocker +- [x] Restore CSI driver registration +- [x] Fix Docker overlay2 corruption / daemon.json on all nodes +- [x] Fix DNS (CoreDNS + Pi-hole dnsmasq config) +- [x] Longhorn reinstalled and healthy +- [x] Traefik ingress controller functional +- [x] Fix backup script (empty backup.volumes bug) + +### Short-term (resolved) +- [x] url-shortener data recovered +- [x] Clickhouse data recovered +- [x] Develop automated block-device recovery playbook (`playbooks/recover/longhorn_data.yml`) +- [x] Backup restore procedure documented and tested + +### Medium-term (in progress) +- [ ] prometheus, alertmanager, redis, backups-rwx recovered (playbook running 2026-04-15) +- [ ] Vault manual recovery +- [ ] ERP manual recovery +- [ ] Update backup playbook to include Longhorn Volume CRDs +- [ ] Prepare Longhorn Google Storage restore playbook + +### Long-term +- [ ] Implement UPS for Raspberry Pi cluster +- [ ] Add Longhorn volume health monitoring to Grafana +- [ ] Add CSI socket health 
check to monitoring +- [ ] Regular backup restore drills (monthly) + +## Lessons Learned + +### What Went Well +- Quick identification of root cause (CSI driver registration) +- Longhorn volume data remained intact (good replica design) +- Ability to force-pod-delete triggered partial recovery +- K3s HelmChart approach allows easy manifest-based recovery + +### What Could Be Improved +- Need better CSI driver health monitoring and alerting +- Longhorn driver-deployer init container timeout may be too short +- Docker overlay2 on external storage needs better corruption recovery +- Backup script has bugs that prevent reliable backups +- No UPS protection for power cuts + +### Technical Debt Identified +- Backup script formatting bug (extra newlines create invalid YAML) +- No automated Longhorn health checks +- Manual intervention required for CSI driver recovery + +## Related Files + +- **Ansible Playbook**: `playbooks/system/k3s_config.yml` (Longhorn HelmChart creation) +- **HelmChart Manifest**: `/var/lib/rancher/k3s/server/manifests/longhorn-install.yaml` on pi1 +- **Backup Scripts**: `/opt/k3s_volumes/backup.sh` and `/opt/k3s_volumes/restore.sh` on pi1 +- **Inventory**: `inventory/hosts.yml` (required for all playbooks) + +## Commands Reference + +### Check Longhorn Status +```bash +kubectl get pods -n longhorn-system +kubectl get volumes -n longhorn-system +kubectl get replicas -n longhorn-system +kubectl get settings -n longhorn-system +``` + +### Force Longhorn Recovery (k3s-specific) +```bash +# Method 1: Touch manifest (soft reconcile) +sudo touch /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml + +# Method 2: Delete all pods (force recreate) +kubectl delete pods -n longhorn-system --all --force --grace-period=0 + +# Method 3: Delete specific pod +kubectl delete pod -n longhorn-system longhorn-driver-deployer-* +``` + +### Check CSI Driver Registration +```bash +kubectl get csidriver +kubectl get csinodes +kubectl describe csidriver 
driver.longhorn.io +``` + +### **Check Longhorn StorageClass** +```bash +kubectl describe cm -n longhorn-system longhorn-storageclass +``` diff --git a/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/architecture.mmd b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/architecture.mmd new file mode 100644 index 0000000..15d7f01 --- /dev/null +++ b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/architecture.mmd @@ -0,0 +1,209 @@ +%%{init: { 'theme': 'forest', 'themeVariables': { + 'primaryColor': '#1e293b', + 'primaryTextColor': '#f8fafc', + 'lineColor': '#334155', + 'secondaryColor': '#475569', + 'tertiaryColor': '#94a3b8', + 'edgeLabelBackground':'#fff', + 'edgeLabelColor': '#1e293b' +}}}%% + +flowchart TD + subgraph Cluster["K3s Cluster (v1.34.3+k3s1)"] + direction TB + + subgraph Nodes["Physical Nodes"] + pi1["pi1: 192.168.1.201\nControl Plane"] + pi2["pi2: 192.168.1.202\nWorker"] + pi3["pi3: 192.168.1.203\nWorker"] + end + + subgraph K3sComponents["K3s Control Plane Components"] + kubelet1["kubelet"] + kubelet2["kubelet"] + kubelet3["kubelet"] + k3s_server["k3s server"] + helm_controller["HelmChart Controller"] + end + + pi1 --> kubelet1 + pi2 --> kubelet2 + pi3 --> kubelet3 + pi1 --> k3s_server + k3s_server --> helm_controller + end + + subgraph LonghornStorage["Longhorn Storage System"] + direction TB + + subgraph HelmChart["HelmChart Installation"] + manifest[("longhorn-install.yaml")] + end + + subgraph Manager["Longhorn Manager layer"] + lh_manager1["longhorn-manager-r6sd2\n2/2 Running\npi2"] + lh_manager2["longhorn-manager-sjc56\n1/2 Running\npi3"] + lh_manager3["longhorn-manager-t9b45\n1/2 Running\npi1"] + webhook["Webhook Leader: pi2"] + end + + subgraph DriverDeployer["CSI Driver Deployer"] + deployer["longhorn-driver-deployer\n0/1 Init:0/1\npi3"] + wait_container["wait-longhorn-manager\nwaiting..."] + end + + subgraph CSIDriver["CSI Driver Components"] + 
csi_socket[("/var/lib/kubelet/plugins/driver.longhorn.io/csi.sock")] + csi_registrar["CSI Node Driver Registrar"] + end + + subgraph CSIContainers["CSI Containers (Sidecars)"] + attacher1["csi-attacher-54ld9\n1/1 Running\npi2"] + attacher2["csi-attacher-dqq9v\n1/1 Running\npi3"] + attacher3["csi-attacher-k5jmx\n0/1 Error\npi1"] + provisioner1["csi-provisioner-9z79d\n0/1 Error\npi2"] + provisioner2["csi-provisioner-zjwdr\n1/1 Running\npi1"] + provisioner3["csi-provisioner-zk5kp\n1/1 Running\npi3"] + resizer1["csi-resizer-8mrld\n1/1 Running\npi3"] + resizer2["csi-resizer-ddhl2\n0/1 Error\npi1"] + resizer3["csi-resizer-qv5n9\n0/1 Error\npi2"] + snapshotter1["csi-snapshotter-9rzf4\n1/1 Running\npi3"] + snapshotter2["csi-snapshotter-bqdtd\n0/1 Error\npi2"] + snapshotter3["csi-snapshotter-jv6pj\n1/1 Running\npi1"] + end + + subgraph CSIPlugin["CSI Plugin DaemonSet"] + plugin1["longhorn-csi-plugin-f44jp\n0/3 Error\npi3"] + plugin2["longhorn-csi-plugin-q2sgh\n1/3 Error\npi1"] + plugin3["longhorn-csi-plugin-vzld8\n2/3 Error\npi2"] + end + + subgraph DataLayer["Longhorn Data Layer"] + engine1["engine-image-ei-8ktd9\n1/1 Running\npi1"] + engine2["engine-image-ei-dcjq8\n1/1 Running\npi3"] + engine3["engine-image-ei-m76jf\n1/1 Running\npi2"] + + volumes[("12 Longhorn Volumes")] + replicas[("/mnt/arcodange/longhorn/replicas/")] + end + + subgraph UIAndTools["UI & Backup"] + ui1["longhorn-ui-8gb4s\n0/1 CrashLoop\npi1"] + ui2["longhorn-ui-hmxz6\n0/1 CrashLoop\npi3"] + share_mgr1["share-manager-...70b4\n0/1 Error\npi1"] + share_mgr2["share-manager-...7ffa\n0/1 Error\npi3"] + nfs["rwx-nfs-4cn9h\n0/1 ContainerCreating\npi3"] + end + + manifest --> lh_manager1 & lh_manager2 & lh_manager3 + helm_controller --> manifest + + lh_manager1 & lh_manager2 & lh_manager3 --> webhook + + deployer --> wait_container + wait_container -.->|waits for| lh_manager1 & lh_manager2 & lh_manager3 + deployer --> csi_registrar + csi_registrar --> csi_socket + + csi_socket --> kubelet1 + csi_socket --> 
kubelet2 + csi_socket --> kubelet3 + + attacher1 & attacher2 & attacher3 --> csi_socket + provisioner1 & provisioner2 & provisioner3 --> csi_socket + resizer1 & resizer2 & resizer3 --> csi_socket + snapshotter1 & snapshotter2 & snapshotter3 --> csi_socket + + plugin1 & plugin2 & plugin3 --> csi_socket + + lh_manager1 & lh_manager2 & lh_manager3 --> volumes + volumes --> replicas + + replicas --> pi1_disk[("pi1: /mnt/arcodange/longhorn")] + replicas --> pi2_disk[("pi2: /mnt/arcodange/longhorn")] + replicas --> pi3_disk[("pi3: /mnt/arcodange/longhorn")] + + share_mgr1 & share_mgr2 --> nfs + nfs --> backup_pvc[("PVC: backups-rwx\n50Gi")] + end + + subgraph DockerStorage["Docker Storage layer"] + docker1["Docker daemon\npi1"] + docker2["Docker daemon\npi2"] + docker3["Docker daemon\npi3"] + + storage1[("/mnt/arcodange/docker/overlay2/")] + + docker1 --> storage1 + docker2 --> storage1 + docker3 --> storage1 + end + + subgraph ApplicationLayer["Application Pods (Affected)"] + traefik["traefik-5c67cb6889-8b5nk\n0/1 Error\nkube-system"] + cms["cms-arcodange-cms-...\n0/1 ImagePullBackOff\ncms"] + webapp["webapp-6588455979-...\n0/1 ImagePullBackOff\nwebapp"] + erp["erp-648748b4f5-bntd9\n0/1 Error\nerp"] + grafana["grafana-5d496f9668-...\n0/3 Error\ntools"] + vault["hashicorp-vault-0\n0/1 Error\ntools"] + end + + subgraph NetworkServices["Network Services"] + coredns["coredns-67476ddb48-jrcg2\n1/1 Running\nkube-system"] + svclb["svclb-traefik-*\n3/3 Running\nkube-system"] + end + + %% Connections showing failure paths + csi_socket --x traefik + csi_socket --x cms + csi_socket --x webapp + csi_socket --x erp + csi_socket --x grafana + csi_socket --x vault + + docker1 --x coredns + docker1 --x svclb + + %% Healthy connections + volumes -->|provides storage| traefik + volumes -->|provides storage| cms + volumes -->|provides storage| webapp + volumes -->|provides storage| erp + volumes -->|provides storage| grafana + volumes -->|provides 
storage| vault + + classDef node fill:#0ea5e9,color:#000,stroke:#06b6d4 + classDef k3s fill:#84cc16,color:#000,stroke:#65a30d + classDef longhorn fill:#a855f7,color:#fff,stroke:#8b5cf6 + classDef csi fill:#f59e0b,color:#000,stroke:#d97706 + classDef data fill:#10b981,color:#000,stroke:#059669 + classDef app fill:#ec4899,color:#fff,stroke:#db2777 + classDef network fill:#6366f1,color:#fff,stroke:#4f46e5 + classDef error fill:#ef4444,color:#fff,stroke:#dc2626 + classDef waiting fill:#fbbf24,color:#000,stroke:#f59e0b + + class pi1,pi2,pi3 node + class kubelet1,kubelet2,kubelet3,k3s_server,helm_controller k3s + class manifest,webhook longhorn + class lh_manager1,lh_manager2,lh_manager3,engine1,engine2,engine3,volumes,replicas,share_mgr1,share_mgr2 data + class deployer,wait_container,csi_registrar,csi_socket longhorn + class attacher1,attacher2,attacher3,provisioner1,provisioner2,provisioner3,resizer1,resizer2,resizer3,snapshotter1,snapshotter2,snapshotter3 csi + class plugin1,plugin2,plugin3 csi + class traefik,cms,webapp,erp,grafana,vault app + class coredns,svclb network + class docker1,docker2,docker3 data + + class deployer,wait_container error + class attacher3,provisioner1,resizer2,resizer3,snapshotter2 error + class plugin1,plugin2,plugin3 error + class ui1,ui2,share_mgr1,share_mgr2 error + class traefik,cms,webapp,erp,grafana,vault error + class nfs waiting + class lh_manager2,lh_manager3 waiting + + classDef clusterBox stroke:#334155,stroke-width:2px,color:#94a3b8 + class Cluster clusterBox + class LonghornStorage clusterBox + class DockerStorage clusterBox + class ApplicationLayer clusterBox + class NetworkServices clusterBox diff --git a/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/component-roles.mmd b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/component-roles.mmd new file mode 100644 index 0000000..19e517f --- /dev/null +++ 
b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/component-roles.mmd @@ -0,0 +1,200 @@ +%%{init: { 'theme': 'forest', 'themeVariables': { + 'primaryColor': '#7c3aed', + 'primaryTextColor': '#ffffff', + 'lineColor': '#6d28d9', + 'secondaryColor': '#8b5cf6', + 'tertiaryColor': '#a78bfa', + 'edgeLabelBackground':'#5b21b6', + 'edgeLabelColor': '#ffffff' +}}}%% + +mindmap + root((Longhorn Storage System)) + + %% ===== CONTROL PLANE COMPONENTS ===== + ControlPlane[Control Plane] + Manager[longhorn-manager] + Role1["Role: Primary controller for Longhorn"] + Responsibilities1["• Manages volumes, replicas, snapshots\n• Handles volume lifecycle\n• Coordinates with etcd\n• Exposes API (port 9500)"] + Health1["Health Check: :9501/v1/healthz"] + Webhook1["Webhook: :9502/metrics"] + + DriverDeployer[longhorn-driver-deployer] + Role2["Role: CSI driver deployment controller"] + Responsibilities2["• Deploys CSI driver to each node\n• Runs via init container (wait-longhorn-manager)\n• Creates csi.sock on each node"] + WaitCmd["Command: longhorn-manager wait -d "] + Blocking["āš ļø BLOCKED: Init container waiting for managers"] + + %% ===== CSI COMPONENTS ===== + CSILayer[CSI Interface] + CSISocket[("/var/lib/kubelet/plugins/driver.longhorn.io/csi.sock")] + SocketRole["Role: Unix domain socket for CSI communication"] + + Attacher[csi-attacher] + AttacherRole["Role: Attaches volumes to nodes"] + AttacherResp["• Monitors VolumeAttachment objects\n• Calls CSI ControllerPublishVolume\n• Handles detach operations"] + AttacherStatus["Status: 2/3 Running, 1 Error"] + + Provisioner[csi-provisioner] + ProvisionerRole["Role: Creates volumes from PVCs"] + ProvisionerResp["• Watches PVC objects\n• Calls CSI CreateVolume\n• Handles volume deletion"] + ProvisionerStatus["Status: 2/3 Running, 1 Error"] + + Resizer[csi-resizer] + ResizerRole["Role: Handles volume resizing"] + ResizerResp["• Watches PVC size changes\n• Calls CSI ExpandVolume"] + ResizerStatus["Status: 1/3 
Running, 2 Error"] + + Snapshotter[csi-snapshotter] + SnapshotterRole["Role: Manages volume snapshots"] + SnapshotterResp["• Watches VolumeSnapshot objects\n• Calls CSI CreateSnapshot\n• Handles snapshot deletion"] + SnapshotterStatus["Status: 2/3 Running, 1 Error"] + + NodeRegistrar[csi-node-driver-registrar] + RegistrarRole["Role: Registers driver with kubelet"] + RegistrarResp["• Creates CSINode resource\n• Registers via kubelet plugin registry API"] + + Plugin[csi-plugin] + PluginRole["Role: Node-level CSI operations"] + PluginResp["• Runs on each node (DaemonSet)\n• Handles NodePublish/UnpublishVolume\n• Manages mount/unmount operations"] + PluginStatus["āš ļø BLOCKED: All 3 pods in Error (no CSI socket)"] + + %% ===== DATA LAYER COMPONENTS ===== + DataLayer[Data Layer] + Engine[engine-image] + EngineRole["Role: Engine and instance manager"] + EngineResp["• Pulls and manages engine binaries\n• Runs as sidecar in DaemonSet\n• Maintains engine processes"] + EngineStatus["Status: āœ… 3/3 Running"] + + Volumes[Longhorn Volumes] + VolumeRole["Role: Logical volume representation"] + VolumeResp["• Managed via Longhorn CRDs\n• Replicated across nodes\n• Supports RWO, RWX access modes"] + VolumeStatus["Status: āœ… All 12 volumes attached & healthy"] + + Replicas[Volume Replicas] + ReplicaRole["Role: Physical data storage"] + ReplicaResp["• 3-way replication across nodes\n• Stored at /mnt/arcodange/longhorn/replicas/\n• Data intact after power cut"] + ReplicaPath["Path: pi1, pi2, pi3: /mnt/arcodange/longhorn/replicas/"] + + Backups[Backup System] + NFS[RWX NFS Share] + NFSRole["Role: NFS export for backup volume"] + NFSCreate["Created via: playbooks/setup/backup_nfs.yml"] + NFSStatus["āš ļø OFFLINE: share-manager pods in Error"] + + BackupPVC[Backup PVC] + BackupPVCRole["Role: Persistent storage for backups"] + BackupPVCDetails["Name: backups-rwx\nNamespace: longhorn-system\nSize: 50Gi\nClass: longhorn"] + + ShareManager[share-manager] + ShareRole["Role: Manages NFS 
exports for Longhorn volumes"] + ShareStatus["āš ļø BLOCKED: 2 pods in Error"] + + %% ===== UI & TOOLS ===== + UI[Web UI] + UIRole["Role: Longhorn management dashboard"] + UIAccess["Access: Port 9500 on manager pods"] + UIStatus["āš ļø BLOCKED: 2 pods in CrashLoopBackOff"] + + %% ===== INFRASTRUCTURE ===== + Infrastructure[Underlying Infrastructure] + Nodes[Raspberry Pi Nodes] + pi1["pi1: 192.168.1.201\nRole: Control Plane"] + pi2["pi2: 192.168.1.202\nRole: Worker"] + pi3["pi3: 192.168.1.203\nRole: Worker"] + + K3s[Kubernetes (k3s v1.34.3+k3s1)] + Kubelet["kubelet (3 instances)"] + APIServer["API Server (on pi1)"] + etcd["etcd (on pi1)"] + HelmCtrl["HelmChart Controller"] + + Docker[Docker Engine] + DockerRole["Role: Container runtime"] + DockerStorage["Storage: /mnt/arcodange/docker/"] + Overlay2["āš ļø ISSUE: overlay2 filesystem corrupted"] + + %% ===== EXTERNAL DEPENDENCIES ===== + Dependencies[External Dependencies] + CSIRegistration[CSI Driver Registration] + CSIRole["Role: k8s CSI registration"] + CSIDriver["Driver: driver.longhorn.io"] + CSIDriverStatus["āš ļø LOST: Not registered with kubelet"] + + %% ===== CONNECTIONS ===== + root --> ControlPlane + root --> CSILayer + root --> DataLayer + root --> UI + root --> Infrastructure + root --> Dependencies + + ControlPlane --> Manager + ControlPlane --> DriverDeployer + + CSILayer --> CSISocket + CSILayer --> Attacher + CSILayer --> Provisioner + CSILayer --> Resizer + CSILayer --> Snapshotter + CSILayer --> NodeRegistrar + CSILayer --> Plugin + + CSISocket --> Attacher + CSISocket --> Provisioner + CSISocket --> Resizer + CSISocket --> Snapshotter + CSISocket --> Plugin + CSISocket --> NodeRegistrar + + DriverDeployer --> NodeRegistrar + NodeRegistrar --> CSISocket + + DataLayer --> Engine + DataLayer --> Volumes + DataLayer --> Replicas + DataLayer --> Backups + + Backups --> NFS + Backups --> BackupPVC + Backups --> ShareManager + + Infrastructure --> Nodes + Infrastructure --> K3s + Infrastructure --> 
Docker + + Dependencies --> CSIRegistration + CSIRegistration --> CSISocket + + %% ===== YET TO BE RESTORED ===== + Dependencies --x EmptyCSI["āš ļø CSI Socket Missing"] :x + EmptyCSI --x Attacher :x + EmptyCSI --x Provisioner :x + EmptyCSI --x Resizer :x + EmptyCSI --x Snapshotter :x + EmptyCSI --x Plugin :x + + %% ===== STYLES ===== + classDef component fill:#8b5cf6,color:#fff,stroke:#7c3aed,stroke-width:2px + classDef role fill:#a78bfa,color:#000,stroke:#8b5cf6 + classDef responsibility fill:#c4b5fd,color:#000,stroke:#8b5cf6 + classDef status_good fill:#10b981,color:#fff,stroke:#059669 + classDef status_bad fill:#ef4444,color:#fff,stroke:#dc2626 + classDef status_warn fill:#f59e0b,color:#000,stroke:#d97706 + classDef infinite fill:#3b82f6,color:#fff,stroke:#2563eb + + class root infinite + + class ControlPlane,CSILayer,DataLayer,UI,Infrastructure,Dependencies component + class Manager,Attacher,Provisioner,Resizer,Snapshotter,NodeRegistrar,Plugin,Engine,Volumes,Replicas,NFS,BackupPVC,ShareManager,UIRole,Nodes,K3s,Docker,CSIRegistration component + + class Role1,Role2,AttacherRole,ProvisionerRole,ResizerRole,SnapshotterRole,RegistrarRole,PluginRole,EngineRole,VolumeRole,ReplicaRole,NFSRole,ShareRole,UIRole,Kubelet,APIServer,etcd,HelmCtrl,DockerRole,CSIRole,CSIDriver component + + class Responsibilities1,Responsibilities2,AttacherResp,ProvisionerResp,ResizerResp,SnapshotterResp,RegistrarResp,PluginResp,EngineResp,VolumeResp,ReplicaResp,NFSRole,BackupPVCDetails,ShareRole,UIAccess,ShareStatus,NFSStatus role + + class EngineStatus,VolumeStatus,ReplicaPath status_good + class Blocking,PluginStatus,UIStatus,ShareStatus,NFSCreate,ShareStatus,CSIDriverStatus status_bad + class AttacherStatus,ProvisionerStatus,ResizerStatus,SnapshotterStatus status_warn + + classDef mindmapTitle fill:#4c1d95,color:#fff,stroke:#5b21b6,font-size:20px,font-weight:bold + class root mindmapTitle diff --git 
a/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/incident-flow.mmd b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/incident-flow.mmd new file mode 100644 index 0000000..f75cd6d --- /dev/null +++ b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/diagrams/incident-flow.mmd @@ -0,0 +1,131 @@ +%%{init: { 'theme': 'forest', 'themeVariables': { + 'primaryColor': '#059669', + 'primaryTextColor': '#fff', + 'lineColor': '#065f46', + 'secondaryColor': '#10b981', + 'edgeLabelBackground':'#064e3b', + 'edgeLabelColor': '#ffffff' +}}}%% + +flowchart TD + %% ===== POWER CUT EVENT ===== + Start([Power Cut Event]) -->|Electricity Lost| Crash[Kubernetes Components Crash] + + %% ===== IMMEDIATE IMPACT ===== + Crash --> KubeletCrash[Kubelet Processes Crash
on all 3 nodes] + Crash --> DockerCrash[Docker Daemons Crash
on all 3 nodes] + Crash --> K3sCrash[K3s Server Process Crash
on pi1] + + %% ===== DOCKER STORAGE CORRUPTION ===== + DockerCrash --> Overlay2[ /mnt/arcodange/docker/overlay2/
Filesystem Corrupted] + Overlay2 --> DockerFail[Docker containers cannot start
missing layer files] + DockerFail --> CoreDNSPod[CoreDNS Pod
CrashLoopBackOff] + DockerFail --> TraefikLB[svclb-traefik Pods
CrashLoopBackOff] + + %% ===== LONGHORN IMPACT ===== + KubeletCrash --> CSIUnreg[CSI Driver Registration Lost
driver.longhorn.io unregistered] + K3sCrash --> HelmCtrl[HelmChart Controller
Unresponsive] + + CSIUnreg --> CSISocket[ /var/lib/kubelet/plugins/.../csi.sock
Disappears] + + %% ===== LONGHORN MANAGER LOSS ===== + KubeletCrash --> LHManagers[Longhorn Manager Pods
Crash 3 pods ] + LHManagers --> NoQuorum[No Manager Quorum
Cannot coordinate] + NoQuorum --> VolumesFrozen[Existing Volumes
Still healthy but inaccessible] + + CSISocket --> CSISidecars[CSI Pods Cannot Start
csi-attacher, provisioner, resizer, snapshotter] + CSISocket --> CSIPlugin[CSI Plugin DaemonSet
Cannot register driver] + + %% ===== VOLUME MOUNT FAILURES ===== + CSISidecars --> NoMounts[PVC Mounts Fail
All Longhorn PVs inaccessible] + CSIPlugin --> NoMounts + + %% ===== APPLICATION CASCADING FAILURES ===== + NoMounts --> TraefikDown[Traefik Pod
PVC mount failed
Error state] + NoMounts --> AppPods1[Application Pods
PVC mount failed
Error state
cms, webapp, erp, clickhouse, etc.] + + %% ===== BACKUP SYSTEM IMPACT ===== + NoQuorum --> NFSDown[NFS Share-Manager Pods
Error state] + NFSDown --> BackupMount[ /mnt/backups/ NFS Mount
Unavailable] + + %% ===== DISCOVERY & RECOVERY ===== + Discovery[15:23:57
Incident Discovered] --> Assessment[15:24:05
Assessment Complete] + Assessment --> Identify[15:24:10
Root Cause: CSI Driver Unregistered] + Identify --> CheckData[15:24:15
Verify Volume Health] + CheckData --> DataIntact[All 12 volumes:
state=attached
robustness=healthy] + + %% ===== RECOVERY ATTEMPTS ===== + Identify --> Attempt1[15:24:50
Attempt 1: Touch HelmChart Manifest] + Attempt1 --> Partial1[Only 1 manager pod affected] + Partial1 --> NeedMore[Insufficient recovery] + + NeedMore --> Attempt2[15:32:15
Attempt 2: Delete All Longhorn Pods] + Attempt2 --> HelmReconcile[HelmChart Controller
Recreates All 24 Pods] + + HelmReconcile --> Progress[15+ Pods Running
Managers, Engine-Image, Some CSI] + Progress --> Blocked[Driver-Deployer
Stuck in Init:0/1] + + Blocked --> Investigate[15:34:30
Investigate wait-longhorn-manager] + Investigate --> WaitLoop[Init container runs:
longhorn-manager wait -d longhorn-system] + WaitLoop --> WaitingManagers[Waiting for all managers
to pass readiness probes] + + %% ===== CURRENT STATE (15:35:30) ===== + WaitingManagers --> CurrentState + + subgraph CurrentState["Current State
15:35:30 UTC"] + direction TB + + Resolved[Resolved āœ…] --> ManagersOk[Manager Pods:
2/2, 1/2, 2/2 Running
pi1, pi2, pi3] + Resolved --> EngineOk[Engine Image:
3/3 Running] + Resolved --> CSIPartial[CSI Sidecars:
~50% Running] + Resolved --> VolumeData[Volume Data:
All intact] + + BlockedNow[Blocked āŒ] --> DriverDeployer[Driver Deployer:
Init:0/1 8+ min
waiting for managers] + BlockedNow --> CSIPluginAll[CSI Plugin:
0/3 Error all ] + BlockedNow --> UI[Longhorn UI:
0/2 CrashLoop] + BlockedNow --> ShareMgr[Share Manager:
0/2 Error] + BlockedNow --> NFSPod[RWX NFS:
ContainerCreating] + + BlockedNow --> AppImpact[Application Impact:
~30 pods still failed
down from 43] + end + + %% ===== RECOVERY PATH ===== + CurrentState --> NextStep[Next: Resolve driver-deployer
wait-longhorn-manager blockage] + + NextStep --> CheckHealth[Check manager health endpoints
https://:9501/v1/healthz] + CheckHealth -->|If healthy| WaitContainerIssue[Wait container bug/timeout] + CheckHealth -->|If unhealthy| FixManagers[Investigate manager readiness] + + WaitContainerIssue --> Option1[Option 1: Delete driver-deployer pod] + WaitContainerIssue --> Option2[Option 2: Touch manifest again] + + FixManagers --> CheckLogs[Check manager container logs] + CheckLogs --> ResolveManagers[Fix manager readiness] + + Option1 --> CSIDriver[CSI Driver deployed] + Option2 --> CSIDriver + ResolveManagers --> CSIDriver + + CSIDriver --> CSISocketRestored[CSI Socket Restored] + CSISocketRestored --> PodsRecover[All Longhorn pods recover] + PodsRecover --> PVCMounts[PVC Mounts resume] + PVCMounts --> AppRecovery[Application pods auto-recover] + AppRecovery --> ResolvedState[Resolved āœ…] + + %% ===== STYLES ===== + classDef event fill:#10b981,color:#fff,stroke:#059669 + classDef impact fill:#d97706,color:#000,stroke:#b45309 + classDef action fill:#3b82f6,color:#fff,stroke:#2563eb + classDef resolved fill:#10b981,color:#fff,stroke:#059669 + classDef blocked fill:#ef4444,color:#fff,stroke:#dc2626 + classDef current fill:#8b5cf6,color:#fff,stroke:#7c3aed + + class Start,Crash,KubeletCrash,DockerCrash,K3sCrash event + class Overlay2,DockerFail,CSIUnreg,CSISocket,NoQuorum,NoMounts impact + class Discovery,Assessment,Identify,CheckData,Attempt1,Attempt2,Investigate action + class ManagersOk,EngineOk,CSIPartial,VolumeData resolved + class DriverDeployer,CSIPluginAll,UI,ShareMgr,NFSPod,AppImpact blocked + class WaitLoop,CurrentState,NextStep,CheckHealth,Option1,Option2,ResolvedState current + + classDef subtitle fill:#64748b,color:#fff,stroke:#475569,font-size:12px + class CurrentState,CurrentStateLabel subtitle diff --git a/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/log.md b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/log.md new file mode 100644 index 0000000..20f64ec --- /dev/null +++ 
b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/log.md @@ -0,0 +1,1103 @@ +# Incident Log: 2026-04-13 Power Cut Recovery + +**Incident ID:** 2026-04-13-001 +**Severity:** SEV-1 +**Status:** In Progress +**Operator:** Mistral Vibe (SRE Assistant) + +--- + +## Initial State + +### 15:23:57 - Incident Detection & Assessment + +**Action:** Initial cluster health check + +**Command:** +```bash +kubectl get pods -A +kubectl get nodes -o wide +``` + +**Output:** +- 3 nodes: pi1 (control-plane), pi2, pi3 - All Ready +- ~43 pods in error state (CrashLoopBackOff, Error, ImagePullBackOff) +- ~37 pods Running + +**Decision:** Identify root cause of cascading failures + +**Outcome:** Most failures traced to Longhorn storage system + +**Next:** Investigate Longhorn namespace + +--- + +## Longhorn Investigation + +### 15:24:05 - Baseline Longhorn Status + +**Action:** Check Longhorn pod status and CSI driver registration + +**Command:** +```bash +kubectl get pods -n longhorn-system +kubectl get csidriver +kubectl get volumes -n longhorn-system +``` + +**Output:** +``` +All Longhorn pods: CrashLoopBackOff or Error state (21 pods) +CSI Driver: driver.longhorn.io exists but pods cannot connect +Volumes: All 12 volumes show state="attached", robustness="healthy" +``` + +**Decision:** Volumes are healthy, problem is CSI driver registration/communication + +**Outcome:** Confirmed data is intact, need to restore CSI connectivity + +**Next:** Check Longhorn installation method + +--- + +### 15:24:10 - Identify Longhorn Installation + +**Action:** Check how Longhorn is deployed on k3s + +**Command:** +```bash +ssh pi@pi1 "sudo ls -la /var/lib/rancher/k3s/server/manifests/ | grep longhorn" +ssh pi@pi1 "sudo cat /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml" +``` + +**Output:** +``` +File: /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml +Type: HelmChart (k3s native) +Version: v1.9.1 +Repo: https://charts.longhorn.io +Target Namespace: 
longhorn-system +Config: defaultDataPath: /mnt/arcodange/longhorn +``` + +**Decision:** Longhorn installed via HelmChart manifest. Per `playbooks/system/k3s_config.yml`, touching the manifest should trigger k3s reconcile (same pattern as traefik-v3.yaml) + +**Outcome:** Identified recovery method - touch manifest file + +**Next:** Attempt manifest-based recovery + +--- + +## Recovery Attempt 1: Manifest Touch + +### 15:24:50 - Touch Longhorn HelmChart Manifest + +**Action:** Trigger k3s HelmChart controller by touching manifest + +**Command:** +```bash +ssh pi@pi1 "sudo touch /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml" +``` + +**Output:** +``` +File timestamp updated: Apr 13 15:24:50 +``` + +**Decision:** Wait for k3s to detect change and reconcile + +**Outcome:** After 60 seconds, only 1 pod (longhorn-manager-w85v6) showed reaction + +**Next:** Check if CSI driver registered + +--- + +### 15:25:50 - Check CSI Driver Registration + +**Action:** Verify CSI driver status + +**Command:** +```bash +kubectl get csidriver +kubectl logs -n longhorn-system longhorn-manager-w85v6 -c manager 2>&1 | tail -20 +``` + +**Output:** +``` +CSI Driver: driver.longhorn.io - exists (109d old) +Manager pod w85v6: Restarting, not yet healthy +CSI pods: Still all in CrashLoopBackOff +``` + +**Decision:** Touching manifest was insufficient - only 1 manager pod affected. Need more aggressive approach. 
+ +**Outcome:** CSI driver still not functional, most Longhorn pods still failing + +**Next:** Force delete all Longhorn pods + +--- + +## Recovery Attempt 2: Force Pod Deletion + +### 15:32:15 - Delete All Longhorn Pods + +**Action:** Force delete all pods in longhorn-system to trigger full recreate + +**Command:** +```bash +kubectl delete pods -n longhorn-system --all --force --grace-period=0 +``` + +**Output:** +``` +24 pods deleted: +- csi-attacher (3) +- csi-provisioner (3) +- csi-resizer (3) +- csi-snapshotter (3) +- engine-image (3) +- longhorn-csi-plugin (3) +- longhorn-driver-deployer (1) +- longhorn-manager (3) +- longhorn-ui (2) +- share-manager (2) +- rwx-nfs (1) +``` + +**Decision:** k3s HelmChart controller should detect pod deletion and recreate them all + +**Outcome:** Pods being recreated - good progress + +**Next:** Monitor pod recovery + +--- + +### 15:32:30 - Monitor Initial Recreation + +**Action:** Check pod status 10 seconds after deletion + +**Command:** +```bash +kubectl get pods -n longhorn-system --no-headers +``` + +**Output:** +``` +Many pods in Running or ContainerCreating state: +- engine-image: 3/3 Running +- longhorn-manager: 2/3 Running (1 ContainerCreating) +- csi-*: Mixed - some Running, some Error +- driver-deployer: Init:0/1 +- ui: 1 Running, 1 Error +- rwx-nfs: ContainerCreating +``` + +**Decision:** Good progress! Most pods recovering. driver-deployer in Init is normal (has init container). Wait and monitor. 
+ +**Outcome:** 15+ pods now Running or starting + +**Next:** Continue monitoring + +--- + +### 15:33:30 - Status After 1 Minute + +**Action:** Recheck all Longhorn pods + +**Command:** +```bash +kubectl get pods -n longhorn-system --no-headers | awk '{print $1 " | " $2 " | " $3}' +``` + +**Output:** +``` +csi-attacher-64575c8dd7-54ld9 | 1/1 | Running +csi-attacher-64575c8dd7-dqq9v | 1/1 | Running +csi-attacher-64575c8dd7-k5jmx | 0/1 | Error +csi-provisioner-7c5697b5-9z79d | 0/1 | Error +csi-provisioner-7c5697b5-zjwdr | 1/1 | Running +csi-provisioner-7c5697b5-zk5kp | 1/1 | Running +csi-resizer-5f7fb8474b-8mrld | 1/1 | Running +csi-resizer-5f87b8474b-ddhl2 | 0/1 | Error +csi-resizer-5f7fb8474b-qv5n9 | 0/1 | Error +csi-snapshotter-79f85b699b-9rzf4 | 1/1 | Running +csi-snapshotter-79f85b699b-bqdtd | 0/1 | Error +csi-snapshotter-79f85b699b-jv6pj | 1/1 | Running +engine-image-ei-* | 1/1 | Running (all 3) +longhorn-csi-plugin-* | 0/3 | Error (all 3) +longhorn-driver-deployer-* | 0/1 | Init:0/1 +longhorn-manager-* | Mixed | 2/2, 1/2, 2/2 Running +longhorn-ui-* | 0/1 | CrashLoopBackOff (both) +rwx-nfs-* | 0/1 | ContainerCreating +``` + +**Decision:** 11 pods Running, but critical components still failing: +- driver-deployer stuck in Init +- csi-plugin all in Error +- Several CSI pods in Error + +**Outcome:** Partial recovery. 
Need to investigate why driver-deployer is stuck + +**Next:** Investigate driver-deployer init container + +--- + +### 15:34:30 - Investigate Driver-Deployer Blockage + +**Action:** Check driver-deployer pod details + +**Command:** +```bash +kubectl describe pod -n longhorn-system longhorn-driver-deployer-cff7d86b4-4cssx +kubectl logs -n longhorn-system longhorn-driver-deployer-cff7d86b4-4cssx -c wait-longhorn-manager +``` + +**Output:** +``` +Status: Init:0/1 +Init Container: wait-longhorn-manager + State: Running + Reason: Still waiting + Exit Code: 0 (still running) + +Logs from wait-longhorn-manager: + "waiting" (repeated every ~1-2 seconds) +``` + +**Decision:** Init container is actively waiting. Need to check what it's waiting for. + +**Outcome:** The wait-longhorn-manager container is running and waiting for managers to be ready + +**Next:** Check manager pod health and endpoints + +--- + +### 15:34:45 - Check Longhorn Manager Health + +**Action:** Verify managers are healthy and reachable + +**Command:** +```bash +kubectl get pods -n longhorn-system -l app=longhorn-manager -o wide +kubectl logs -n longhorn-system longhorn-manager-t9b45 -c longhorn-manager | tail -10 +``` + +**Output:** +``` +NAME READY STATUS RESTARTS IP NODE +longhorn-manager-r6sd2 2/2 Running 0 10.42.2.223 pi2 +longhorn-manager-sjc56 1/2 Running 0 10.42.1.196 pi3 +longhorn-manager-t9b45 1/2 Running 2 10.42.0.75 pi1 + +Logs from manager-t9b45: + time="2026-04-13T13:35:51Z" level=info msg="Webhook leader elected: pi2" +``` + +**Decision:** Managers are partially running, webhook leader elected on pi2. The wait container might be waiting for all managers to be ready, or for a specific endpoint. 
+ +**Outcome:** Managers are running but not all containers are ready (some at 1/2) + +**Next:** Check what the wait-longhorn-manager script is checking + +--- + +### 15:35:00 - Check Wait Container Logic + +**Action:** Look at driver-deployer pod spec to understand wait condition + +**Command:** +```bash +kubectl get pods -n longhorn-system longhorn-driver-deployer-cff7d86b4-4cssx -o jsonpath='{.spec.initContainers[0]}' +``` + +**Output:** +``` +{ + "name": "wait-longhorn-manager", + "image": "longhornio/longhorn-manager:v1.9.1", + "command": ["/bin/sh", "-c"], + "args": ["longhorn-manager wait -d longhorn-system"], + "resources": {} +``` + +**Decision:** The init container runs `longhorn-manager wait -d longhorn-system`. This command waits for the Longhorn manager to be ready in the specified namespace. + +**Outcome:** The wait command is likely checking for manager readiness via API. If managers are only partially ready (1/2, 2/2), the wait may not complete. + +**Next:** Check manager pod readiness probes and health endpoints + +--- + +### 15:35:15 - Check Manager Readiness Probes + +**Action:** Check manager pod readiness probe configuration + +**Command:** +```bash +kubectl describe cm -n longhorn-system longhorn-manager +kubectl get endpoints -n longhorn-system longhorn-manager +``` + +**Output:** +``` +Endpoints: longhorn-manager (3 endpoints: 10.42.0.75, 10.42.1.196, 10.42.2.223) + +Readiness probe: + httpGet: https://:9501/v1/healthz + initialDelaySeconds: 10 + periodSeconds: 10 + failureThreshold: 3 +``` + +**Decision:** Managers have endpoints and readiness check is `https://:9501/v1/healthz`. The wait command might need all managers to pass readiness. 
+ +**Outcome:** Some managers may not be passing readiness probes + +**Next:** Test manager health endpoints directly + +--- + +## Current Status Summary (15:35:30) + +### Progress Made +- āœ… Longhorn manager pods: 3/3 created, 2+ containers running on each +- āœ… Engine image pods: 3/3 Running +- āœ… Various CSI pods: Some Running (csi-attacher 2/3, csi-provisioner 2/3, csi-resizer 1/3, csi-snapshotter 2/3) +- āœ… Total Running: ~16-18 pods (up from 0) + +### Blocking Issues +- āŒ **longhorn-driver-deployer**: Init:0/1 for 8+ minutes +- āŒ **longhorn-csi-plugin**: All 3 pods in Error +- āŒ Several CSI pods still in Error +- āŒ longhorn-ui: Both pods in CrashLoopBackOff + +### Root Blockage +The `longhorn-driver-deployer` pod's init container `wait-longhorn-manager` is running `longhorn-manager wait -d longhorn-system` which waits for managers to be ready. Despite managers having endpoints, the wait condition is not being met. + +### Hypotheses +1. **Readiness probes failing**: Some manager containers (1/2) are not passing the `https://:9501/v1/healthz` check +2. **Network connectivity**: The driver-deployer pod cannot reach manager health endpoints +3. **Docker storage issue**: If driver-deployer is on pi3 (where Docker overlay2 is corrupted), it may be unable to run properly + +### Next Investigations +1. Check which node driver-deployer is on: `kubectl get pods -n longhorn-system longhorn-driver-deployer-* -o wide` +2. Check driver-deployer pod events: `kubectl describe pod -n longhorn-system longhorn-driver-deployer-*` +3. Test manager health endpoints directly +4. 
Check if there are Docker/overlay2 issues on the node running driver-deployer + +--- + +## Cluster-Wide Status + +### 15:35:30 - Overall Cluster Health + +**Command:** +```bash +kubectl get pods -A --field-selector=status.phase!=Running --no-headers | wc -l +kubectl get pods -A --field-selector=status.phase=Running --no-headers | wc -l +``` + +**Output:** +``` +Failed pods: ~30 (down from 43) +Running pods: ~50 (up from ~37) +``` + +**Decision:** Well over 50% recovery already. Longhorn recovery will cascade to application pods. + +**Outcome:** Significant progress made. Blocked on CSI driver registration. + +**Next:** Resolve driver-deployer blockage + +--- + +## Investigation Phase 2: Root Cause Analysis + +### 16:15:00 - Discovery of Webhook Circular Dependency + +**Action:** Analyze manager logs for repeat error patterns + +**Command:** +```bash +kubectl logs -n longhorn-system longhorn-manager-r6sd2 -c longhorn-manager --previous | grep -i "webhook\|healthz\|fatal" | tail -20 +kubectl logs -n longhorn-system longhorn-manager-sjc56 -c longhorn-manager --previous | grep -i "fatal" +``` + +**Output:** +``` +time="2026-04-13T14:02:47Z" level=fatal msg="Error starting webhooks: conversion webhook service is not accessible after 1m0s sec" func=app.startWebhooksByLeaderElection.func2 file="daemon.go:216" +time="2026-04-13T14:01:47Z" level=fatal msg="Error starting webhooks: conversion webhook service is not accessible after 1m0s sec" func=app.startWebhooksByLeaderElection.func2 file="daemon.go:216" +``` + +**Decision:** **CRITICAL FINDING** - Managers are crashing because they cannot start their webhook server. The webhook leader election is failing because the `longhorn-conversion-webhook` service endpoints include ALL 3 manager pod IPs, but only the webhook leader actually serves on port 9501. + +**Outcome:** Circular dependency discovered: +1. Only the webhook leader serves port 9501 +2. Service endpoints include all 3 manager pods +3. 
Non-leader pods don't serve webhook → health check `https://longhorn-conversion-webhook.longhorn-system.svc:9501/v1/healthz` times out +4. All managers crash with "conversion webhook service is not accessible" +5. driver-deployer waits for healthy managers infinitely + +**Next:** Break the cycle by scaling managers to 1 pod + +--- + +### 16:15:30 - Attempt: Scale Managers to 1 + +**Action:** Try to scale DaemonSet to 1 replica + +**Command:** +```bash +kubectl scale daemonset -n longhorn-system longhorn-manager --replicas=1 +kubectl delete helmchart -n kube-system longhorn-install +``` + +**Output:** +``` +Error: no objects passed to scale daemonsets.apps "longhorn-manager" not found +``` + +**Decision:** longhorn-manager is a DaemonSet managed by HelmChart, not directly scalable. Need to delete HelmChart and/or manifest. + +**Outcome:** HelmChart was already deleted. Manifest was also deleted. Waiting for cleanup. + +**Next:** Wait for Longhorn resources to be fully removed + +--- + +### 16:20:00 - Longhorn Uninstall Issues + +**Action:** Force delete HelmChart and check namespace status + +**Command:** +```bash +kubectl delete helmchart -n kube-system longhorn-install --force --grace-period=0 +kubectl get ns longhorn-system -o yaml | grep -A2 finalizers +kubectl get pods -n longhorn-system | wc -l +``` + +**Output:** +``` +HelmChart deleted (force) +84 resources have finalizers remaining: + - longhorn.io in 84 resource instances + - kubernetes.io/pvc-protection in 1 resource instance + - vaultauth.secrets.hashicorp.com/finalizer in 1 resource instance +Total pods remaining: 24 (uninstall jobs running) +Namespace: Active (cannot delete with finalizers) +``` + +**Decision:** Clean uninstall is blocked by finalizers. Need to force-clean or reinstall over existing state. + +**Outcome:** Both soft and hard recovery approaches hitting fundamental Longhorn architecture issue with webhook endpoint management. 
+ +**Next:** Create automated recovery playbook; use ansible to reinstall + +--- + +### 16:30:00 - Create Recovery Backups + +**Action:** Backup all Longhorn CRs and PV/PVC before destructive operations + +**Command:** +```bash +kubectl get -n longhorn-system volumes.longhorn.io,replicas.longhorn.io,engines.longhorn.io -o yaml \ + > /tmp/longhorn_volumes_backup_20260413.yaml +kubectl get -A pv,pvc -o yaml > /tmp/k3s_volumes_backup_20260413.yaml +``` + +**Output:** +``` +6573 lines - Longhorn metadata (12 volumes, replicas, engines) +1052 lines - PV/PVC definitions +``` + +**Decision:** Backups secured to pi1 at ~/arcodange/backups/k3s_pvc/ with fallback path. + +**Outcome:** Data safe. Can now attempt destructive cleanup. + +**Next:** Fix and test backup/restore scripts + +--- + +### 16:35:00 - Fix Backup Script Bug + +**Action:** Fix the broken backup_cmd in k3s_pvc.yml + +**File:** `playbooks/backup/k3s_pvc.yml` + +**Change:** +```yaml +# BEFORE (broken - echo with newlines) +backup_cmd: |- + echo " + $(kubectl get -A pv -o yaml) + --- + $(kubectl get -A pvc -o yaml) + " + +# AFTER (fixed - direct command chaining) +backup_cmd: "kubectl get -A pv -o yaml && echo '---' && kubectl get -A pvc -o yaml" +``` + +**Outcome:** Backup command now works correctly. + +**Next:** Update restore script with fallback paths + +--- + +### 16:40:00 - Enhance Restore Script + +**Action:** Update restore.sh with fallback backup directory + +**File:** `playbooks/backup/k3s_pvc.yml` (restore.sh content) + +**Changes:** +- Added fallback from `/mnt/backups/k3s_pvc/` to `/home/pi/arcodange/backups/k3s_pvc/` +- Added Longhorn metadata restore support +- Improved English error messages + +**Outcome:** Restore script now attempts multiple backup locations. 
+ +**Next:** Deploy updated scripts to pi1 + +--- + +### 16:45:00 - Deploy Scripts and Test Backup + +**Action:** Run ansible playbook to deploy fixed scripts + +**Command:** +```bash +ansible-playbook -i inventory/hosts.yml playbooks/backup/k3s_pvc.yml \ + -l pi1 -e "backup_root_dir=/mnt backup_dirname=k3s_pvc" +``` + +**Output:** +``` +pi1 | CHANGED => backup.sh deployed +pi1 | CHANGED => restore.sh deployed +``` + +**Decision:** Test backup by copying files to fallback location. + +**Outcome:** Scripts deployed successfully. + +**Next:** Copy backups to pi1 fallback location + +--- + +### 16:48:00 - Seed Fallback Backup Location + +**Action:** Copy backups to pi1 fallback location + +**Command:** +```bash +ssh pi@pi1 "mkdir -p ~/arcodange/backups/k3s_pvc" +scp /tmp/k3s_volumes_backup_20260413.yaml pi@pi1:~/arcodange/backups/k3s_pvc/backup_20260413.volumes +scp /tmp/longhorn_volumes_backup_20260413.yaml pi@pi1:~/arcodange/backups/k3s_pvc/longhorn_metadata_20260413.yaml +``` + +**Outcome:** Backups now available at fallback location on pi1. + +**Next:** Test restore script + +--- + +### 16:50:00 - Test Restore Script + +**Action:** Run restore script with fallback path + +**Command:** +```bash +ssh pi@pi1 "sudo /opt/k3s_volumes/restore.sh" +``` + +**Output:** +``` +Using fallback backup directory: /home/pi/arcodange/backups/k3s_pvc +No date provided, restoring latest dump: /home/pi/arcodange/backups/k3s_pvc/backup_20260413.volumes +Error from server (InternalError): failed calling webhook "validator.longhorn.io"... +no endpoints available for service "longhorn-admission-webhook" +``` + +**Decision:** Restore script works! Blocked only because Longhorn is still partially installed (webhook endpoint missing). This is expected. + +**Outcome:** āœ… Backup/restore pipeline validated. Scripts work correctly with fallback path. 
+ +**Next:** Complete Longhorn cleanup and reinstall + +--- + +## Current Status (17:00 UTC) + +### What's Fixed +- āœ… Backup script bug identified and fixed +- āœ… Restore script enhanced with fallback paths +- āœ… Backups secured to pi1 with redundancy +- āœ… Root cause identified: Webhook circular dependency + +### What's Blocking Recovery +- āŒ longhorn-system namespace stuck with 84 finalizers +- āŒ Longhorn CRDs not fully cleaned up +- āŒ Cannot reinstall while namespace exists with resources + +### What's Needed for Full Recovery +1. Remove longhorn-system namespace (force if needed) +2. Reinstall Longhorn via ansible (touch manifest) +3. Apply backup YAML files +4. Data will be auto-discovered from `/mnt/arcodange/longhorn/replicas/` + +--- + +## Lessons Learned + +### Root Cause Analysis +**Primary:** CSI driver registration is ephemeral (lost on kubelet crash) +**Secondary:** Longhorn webhook leader election with multi-manager has circular dependency +- Service endpoints include all manager pods +- Only webhook leader serves port 9501 +- Non-leaders cause health check timeouts +- Managers crash, driver-deployer waits forever, CSI never recovers + +### What Worked Well +- Quick identification of root cause (CSI registration) +- Data remained intact (good replica design) +- Force pod deletion triggered partial recovery +- K3s HelmChart approach allows manifest-based recovery + +### What Needs Improvement +- Need automated recovery playbook +- Need webhook health check monitoring +- Need to test backup/restore procedure regularly +- Need UPS for power-cut resilience (long-term) + +--- + +**Status:** In Progress - Awaiting Longhorn cleanup and reinstall +**Next Action:** Complete uninstall, reinstall via ansible, restore from backups + +--- + +## Recovery Phase: Clean Slate Reinstall + +### 17:00:00 - Nuclear Cleanup Decision + +**Action:** Based on retrospective analysis, determined that force-delete-all created webhook circular dependency. 
Decided to perform full clean uninstall.
+
+**Command:**
+```bash
+# Step 1: Remove all Longhorn CRD finalizers
+for crd in backups backuptargets backupvolumes engineimages engines nodes orphans replicas sharemanagers snapshots volumeattachments volumes; do
+  kubectl patch crd ${crd}.longhorn.io -p '{"metadata":{"finalizers":null}}' --type=merge
+done
+
+# Step 2: Delete validating webhook blocking PVC patches
+kubectl delete validatingwebhookconfigurations.admissionregistration.k8s.io longhorn-webhook-validator --force --grace-period=0
+
+# Step 3: Remove kubernetes.io/pvc-protection finalizers from all PVCs
+for ns in erp kube-system tools url-shortener longhorn-system; do
+  for pvc in $(kubectl get pvc -n $ns -o name 2>/dev/null); do
+    kubectl patch $pvc -p '{"metadata":{"finalizers":null}}' --type=merge 2>/dev/null
+  done
+done
+```
+
+**Output:**
+```
+All Longhorn CRDs patched successfully
+ValidatingWebhookConfiguration deleted
+11 PVCs patched across 4 namespaces
+```
+
+**Decision:** Namespace stuck in Terminating. k3s restart on all nodes should clear it.
+
+**Next:** Restart k3s services on all nodes
+
+---
+
+### 17:03:00 - Restart k3s on All Nodes
+
+**Action:** Restart k3s control plane and agents
+
+**Command:**
+```bash
+ssh pi@pi1 "sudo systemctl restart k3s"
+ssh pi@pi2 "sudo systemctl restart k3s-agent"
+ssh pi@pi3 "sudo systemctl restart k3s-agent"
+```
+
+**Output:**
+```
+pi1: k3s.service restarted
+pi2: k3s-agent.service restarted
+pi3: k3s-agent.service restarted
+```
+
+**Decision:** Wait for nodes to come back and namespace cleanup to complete. 
+ +**Next:** Verify namespace deletion and Longhorn reinstall + +--- + +### 17:08:00 - Verify Clean State + +**Action:** Check namespace and Longhorn status after k3s restart + +**Command:** +```bash +kubectl get ns longhorn-system +kubectl get helmchart -n kube-system longhorn-install +kubectl get pods -n longhorn-system +``` + +**Output:** +``` +Error from server (NotFound): namespaces "longhorn-system" not found +Error from server (NotFound): helmcharts.helm.cattle.io "longhorn-install" not found +No resources found in longhorn-system namespace. +``` + +**Decision:** āœ… Clean slate achieved. Longhorn completely removed. Can now reinstall. + +**Next:** Reinstall Longhorn via playbook + +--- + +### 17:09:00 - Reinstall Longhorn via Ansible + +**Action:** Run recovery playbook to reinstall Longhorn and restore from backup + +**Command:** +```bash +ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn.yml --limit pi1 +``` + +**Output:** +``` +# Initial run failed at backup check due to path expansion issue +# Files exist at /home/pi/arcodange/backups/k3s_pvc/ on pi1 +# Playbook needs absolute path fix +``` + +**Decision:** Fix playbook path issue - fallback_backup_dir needs absolute path, not ~/ + +**Next:** Update playbook and re-run + +--- + +### 17:30:00 - Discover Docker Config Corruption + +**Action:** Check why Docker won't start after k3s restart + +**Command:** +```bash +ssh pi@pi1 "sudo systemctl status docker" +ssh pi@pi1 "sudo cat /etc/docker/daemon.json" +``` + +**Output:** +``` +Docker status: Failed to start - configuration error +/etc/docker/daemon.json content: " {'log-driver': 'json-file', 'log-opts': {'max-size': '10m', 'max-file': '5'}, 'data-root': '/mnt/arcodange/docker', 'storage-driver': 'overlay2'}" +``` + +**Decision:** **CRITICAL FINDING** - The Docker daemon.json was corrupted by the system_docker.yml playbook. It contains a Python string representation instead of valid JSON. Docker cannot parse it and refuses to start. 
+ +**Outcome:** Root cause of Docker failure identified - malformed JSON config across all 3 nodes. + +**Next:** Fix daemon.json on all nodes + +--- + +### 17:35:00 - Fix Docker Config + +**Action:** Write valid JSON to /etc/docker/daemon.json on all nodes + +**Command:** +```bash +ansible raspberries -i inventory/hosts.yml -m copy \ + -a "dest=/etc/docker/daemon.json \ + content='{\"log-driver\": \"json-file\", \"log-opts\": {\"max-size\": \"10m\", \"max-file\": \"5\"}, \"data-root\": \"/mnt/arcodange/docker\", \"storage-driver\": \"overlay2\"}'" \ + -b +``` + +**Output:** +``` +pi1 | CHANGED +pi2 | CHANGED +pi3 | CHANGED +``` + +**Decision:** Valid JSON config restored. Now restart Docker to apply. + +**Next:** Restart Docker on all nodes + +--- + +### 17:40:00 - Restart Docker Services + +**Action:** Restart Docker on all raspberry pi nodes + +**Command:** +```bash +ssh pi@pi1 "sudo systemctl restart docker" +ssh pi@pi2 "sudo systemctl restart docker" +ssh pi@pi3 "sudo systemctl restart docker" +``` + +**Output:** +``` +pi1: Docker started successfully, API listening on /run/docker.sock +pi2: Docker started successfully, Daemon initialization complete +pi3: Docker started successfully (with some overlay2 cleanup errors for old containers - non-fatal) +``` + +**Decision:** Docker is now running on all 3 nodes. overlay2 errors on pi3 are from stale container references and don't block new operations. 
+ +**Next:** Verify DNS resolution and HelmChart job progress + +--- + +### 17:50:00 - Verify DNS Recovery + +**Action:** Check if CoreDNS is now resolving external domains + +**Command:** +```bash +kubectl get pods -n kube-system -l k8s-app=kube-dns +kubectl get job -n kube-system helm-install-longhorn-install +kubectl logs -n kube-system helm-install-longhorn-install-fcrzk --tail=5 +``` + +**Output:** +``` +CoreDNS pod: Running (1/1) +HelmChart job: Still Running (0/1 completions, 4h20m+) +Latest logs: Error: looks like "https://charts.longhorn.io" is not a valid chart repository or cannot be reached: Get "https://charts.longhorn.io/index.yaml": dial tcp: lookup charts.longhorn.io on 10.43.0.10:53: read udp 10.42.1.208:58403->10.43.0.10:53: i/o timeout +``` + +**Decision:** DNS is still not working even after Docker restart. The issue is deeper - k3s may have its own DNS networking that's not recovered. Need to check CoreDNS pod health and restart k3s services. + +**Outcome:** Docker is up, but cluster DNS (CoreDNS) is still failing to resolve external domains. + +**Next:** Investigate CoreDNS/cluster DNS routing + +--- + +### 17:55:00 - Investigate CoreDNS + +**Action:** Check CoreDNS pod and service health + +**Command:** +```bash +kubectl get pods -n kube-system -l k8s-app=kube-dns -o wide +kubectl describe pod -n kube-system coredns-67476ddb48-jrcg2 +kubectl logs -n kube-system coredns-67476ddb48-jrcg2 | tail -20 +``` + +**Output:** +``` +CoreDNS pod: coredns-67476ddb48-jrcg2 on pi1, IP 10.43.0.10 +Service: kube-dns ClusterIP 10.43.0.10:53 +Status: Running but may not be functional +Logs: [standard CoreDNS startup, no errors shown] +``` + +**Decision:** CoreDNS pod appears running but DNS resolution is failing. This could be because: +1. CoreDNS itself has network issue +2. The k3s service network isn't routing DNS properly +3. Need to restart k3s agent on nodes + +**Outcome:** CoreDNS logs show no errors, but resolution from pods fails. 
Underlying network/DNS issue persists. + +**Next:** Restart k3s agent to rebuild networking stack + +--- + +## Current Status as of 18:00 UTC + +### Fixed +- āœ… Docker daemon.json corruption identified and fixed on all 3 nodes +- āœ… Docker service running on pi1, pi2, pi3 +- āœ… Longhorn CRDs deleted successfully +- āœ… Longhorn namespace deleted successfully +- āœ… ValidatingWebhookConfiguration deleted +- āœ… PVC finalizers removed + +### Still Blocking +- āŒ **CoreDNS not resolving external domains** - HelmChart job cannot reach charts.longhorn.io +- āŒ HelmChart job in CrashLoopBackOff (DNS timeout) +- āŒ Longhorn not installed yet + +### Next Steps +1. Restart k3s agent on all nodes to rebuild networking stack +2. Check if CoreDNS starts resolving external domains +3. Verify HelmChart job can reach charts.longhorn.io +4. Longhorn should auto-install once DNS works + +--- + +**Note:** Internal DNS architecture issues identified during this incident are documented in [ADR 20260414: Internal DNS Architecture](../../adr/20260414-internal-dns-architecture.md). The fix involves: (1) adding dnsmasq user to `dip` group on Pi-hole nodes, (2) configuring CoreDNS to forward to both HA Pi-hole instances, and (3) ensuring explicit upstream DNS configuration. + +--- + +## Recovery Phase: DNS Fix and Longhorn Reinstall + +### ~18:00 — Fix Pi-hole DNS forwarding + +**Action:** Diagnosed CoreDNS forwarding to Pi-hole instances. Pi-hole's `dnsmasq` was not in the +`dip` group, preventing it from binding to port 53. Fixed by updating the Ansible Pi-hole role. + +**Outcome:** External DNS resolution restored. HelmChart job (`helm-install-longhorn-install`) was +able to reach `charts.longhorn.io`. + +**Next:** Wait for Longhorn Helm install to complete. + +--- + +### ~19:00 — Longhorn reinstall completed + +**Action:** Verified all Longhorn pods Running after HelmChart job completed. 
+ +**Command:** +```bash +kubectl get pods -n longhorn-system +kubectl get csidriver +``` + +**Output:** +``` +All Longhorn pods: Running āœ… (csi-attacher, csi-provisioner, csi-resizer, csi-snapshotter, +engine-image Ɨ3, instance-manager Ɨ3, longhorn-csi-plugin Ɨ3, longhorn-driver-deployer, +longhorn-manager Ɨ3, longhorn-ui Ɨ2) +CSIDriver driver.longhorn.io: Registered +``` + +**Outcome:** Longhorn healthy. But 9 application volumes still stuck — PVCs Terminating/Lost, +because Volume CRDs were deleted in the nuclear cleanup and Longhorn created no new Volume CRDs +for the old replica directories (it doesn't auto-discover orphaned dirs). + +--- + +## Recovery Phase: PVC Data Recovery (2026-04-14) + +### 2026-04-14 — Discovery: metadata restore is blocked + +**Action:** Attempted to restore PVs and Longhorn Volume CRDs from backup. + +**Finding:** The backup file (`longhorn_metadata_20260413.yaml`) was missing Volume CRDs — +the backup command only captured Engines and Replicas. Applying it fails with: +``` +Error from server: admission webhook denied: volume does not exist for engine +``` + +The webhook requires Volume CRDs before Engines. Without Volume CRDs in the backup, the +metadata file cannot be applied as-is. See `pvc-recovery-2026-04-14.md` for full analysis. + +--- + +### 2026-04-14 — Traefik recovery (simple path) + +**Action:** Traefik's old PV (`pvc-5391fa2b`) was still present in Longhorn (Released state). +Simply removed `claimRef` to make it Available, then created a new PVC pinned to it via +`volumeName:`. TLS certificates were auto-renewed by cert-manager. + +**Outcome:** āœ… Traefik `kube-system/traefik` Bound, pod Running. + +--- + +### 2026-04-14 — url-shortener recovery (Method B — dir rename) + +**Action:** Created Volume CRD, waited for Longhorn to create new replica dirs, then renamed +the old data dirs to match the new engine IDs on pi2 and pi3. Recreated PV/PVC. 
+ +**Outcome:** āœ… url-shortener Running, SQLite data intact (`SELECT COUNT(*) FROM urls` returns +correct count). + +--- + +### 2026-04-14 — Block-device recovery approach developed + +**Finding:** Method B (directory rename) works for small volumes but is risky and does not scale: +- `volume.meta` has `Dirty: true` and `Rebuilding: true` on some replicas +- Longhorn detects inconsistency across replicas and rebuilds from the "cleanest" source (the + new empty pi1 replica), silently overwriting data +- This was observed during an early Clickhouse recovery attempt + +**Solution:** Direct block-device injection — bypass Longhorn's replica reconciliation entirely: +1. Merge the sparse replica layers into a single flat image (`merge-longhorn-layers.py`) +2. Create the Volume CRD and attach it in maintenance mode +3. `mkfs.ext4` the live block device, mount it, rsync data from the merged image +4. Detach, recreate PV/PVC, scale up workload + +**Files created:** +- `docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py` +- `playbooks/recover/longhorn_data.yml` (9-phase automated recovery playbook) +- `docs/runbooks/longhorn-block-device-recovery.md` + +--- + +### 2026-04-14 — Clickhouse recovered via automated playbook + +**Action:** First run of `playbooks/recover/longhorn_data.yml` with vars file: +- `source_node: pi3`, `source_dir: pvc-1251909b-...-1163420b` +- Phases 0–9 completed successfully + +**Verify command output:** +``` +clickhouse-client --query 'SHOW DATABASES' → plausible +``` + +**Outcome:** āœ… `clickhouse-0` Running 1/1. Plausible analytics database accessible. + +--- + +## Recovery Phase: Remaining Volumes (2026-04-15) + +### 2026-04-15 — Auto-discovery bug found and fixed + +**Finding:** Phase 0 scan used `du -sb` (apparent/logical size for sparse files). For a +Longhorn replica being rebuilt (`Rebuilding: true`), the sparse `.img` files have 24 GiB +apparent size (3 Ɨ 8 GiB) but only 1.3 GiB of actual data. 
The scan picked this invalid 
+replica over the healthy 11 GiB ones on pi2 and pi3.
+
+Result: `prometheus-server` merge tool produced an all-zeros image.
+
+**Fixes applied:**
+1. Phase 0 scan now reads `volume.meta` and skips `Rebuilding: true` replica dirs
+2. Phase 0 scan now uses `du -sk` (actual disk usage in KB) instead of `du -sb` (apparent bytes)
+3. `prometheus-server` manually pinned to `pi2/pvc-88e18c7f-...-910583f6` (11G, not rebuilding)
+
+See: `playbooks/recover/longhorn_data_vars_remaining.yml`
+
+---
+
+### 2026-04-15 — prometheus, alertmanager, redis, backups-rwx recovery (in progress)
+
+**Action:** Re-running `playbooks/recover/longhorn_data.yml` with the 4 remaining volumes.
+Auto-discovered sources:
+- alertmanager → pi2/`pvc-aed7f2c4-...-41c20064` (4 GiB actual) āœ…
+- redis → pi2/`pvc-d1d5482b-...-e0a8cdbc` (2 GiB actual) āœ…
+- backups-rwx → pi3/`pvc-efda1d2f-...-69454dd0` (150 GiB actual) āœ…
+- prometheus → pi2/`pvc-88e18c7f-...-910583f6` (11 GiB actual, manual override) āœ…
+
+**Status:** Playbook running (Phase 1–2 merging in progress as of this log entry).
+
+---
+
+### 2026-04-15 — Vault and ERP deferred
+
+**Decision:** HashiCorp Vault (`data-hashicorp-vault-0`, `audit-hashicorp-vault-0`) and ERP
+(`pvc-7971918e`) are too sensitive for automated recovery. The automated playbook blindly
+runs `mkfs.ext4` + rsync. For Vault, the unseal keys and root token must be coordinated carefully.
+For ERP, business data requires a verified restore procedure.
+
+These volumes will be recovered manually via the block-device runbook with extra validation steps.
+
+---
+
+**Status:** Mostly Resolved — 7/10 volumes recovered or in progress. Vault/ERP deferred. 
+**Full details:** `pvc-recovery-2026-04-14.md` +**Runbook:** `docs/runbooks/longhorn-block-device-recovery.md` +**Playbook:** `playbooks/recover/longhorn_data.yml` diff --git a/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/pvc-recovery-2026-04-14.md b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/pvc-recovery-2026-04-14.md new file mode 100644 index 0000000..7d49946 --- /dev/null +++ b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/pvc-recovery-2026-04-14.md @@ -0,0 +1,416 @@ +--- +title: PVC Recovery — Post-Reinstall Volume Restoration +incident_id: 2026-04-13-001 +date: 2026-04-14 +status: Mostly Resolved +operator: Claude Code +--- + +# PVC Recovery — Post-Reinstall Volume Restoration + +## Situation as of 2026-04-14 + +Longhorn has been fully reinstalled and is healthy. The cluster nodes are all Ready. However, +**all application volumes are inaccessible** because the nuclear cleanup deleted the Longhorn +Volume/Engine/Replica CRDs, and the reinstalled Longhorn has no knowledge of the old volumes. + +### Longhorn Health (verified) + +``` +NAME READY STATUS AGE +csi-attacher (3 pods) 1/1 Running 30m +csi-provisioner (3 pods) 1/1 Running 30m +csi-resizer (3 pods) 1/1 Running 30m +csi-snapshotter (3 pods) 1/1 Running 30m +engine-image-ei-b4bcf0a5 (3 pods) 1/1 Running 31m +instance-manager (3 pods) 1/1 Running 30m +longhorn-csi-plugin (3 pods) 3/3 Running 30m +longhorn-driver-deployer 1/1 Running 31m +longhorn-manager (3 pods) 2/2 Running 14m +longhorn-ui (2 pods) 1/1 Running 31m + +CSIDriver driver.longhorn.io: Registered (AGE: 110d — restored) +``` + +Longhorn only knows about 3 volumes (crowdsec-config, crowdsec-db, traefik) — all newly provisioned +after reinstall. The other 9 volumes are missing from Longhorn's knowledge. 
+ +--- + +## Backup Files Available + +| File | Location | Contents | Gap | +|------|----------|----------|-----| +| `backup_20260413.volumes` | `/home/pi/arcodange/backups/k3s_pvc/` | PV + PVC YAML (kubectl get -A pv,pvc) | No Longhorn CRDs | +| `longhorn_metadata_20260413.yaml` | `/home/pi/arcodange/backups/k3s_pvc/` | Engines + Replicas CRDs | **No Volume CRDs** | + +**Critical gap:** The metadata backup was collected with `kubectl get -n longhorn-system volumes.longhorn.io,replicas.longhorn.io,engines.longhorn.io -o yaml` but the resulting file contains only Engines and Replicas in 3 separate Lists. The Volume CRDs are absent. + +Attempting `kubectl apply -f longhorn_metadata_20260413.yaml` fails with: +``` +Error from server (Invalid): admission webhook "validator.longhorn.io" denied the request: +volume does not exist for engine +``` +The webhook requires Volume CRDs to exist before Engines can be created. Without Volume CRDs in the +backup, the metadata file cannot be applied as-is. + +--- + +## Data Survival Assessment + +### Pi1 — Replica directories + +Pi1 is the control plane. Its old replica directories were **deleted** during the nuclear cleanup. 
+Only 3 new directories exist (created after reinstall): + +``` +pvc-01b93e30-...-b1530c1d (crowdsec-config — NEW) +pvc-4785dc60-...-2f031b60 (crowdsec-db — NEW) +pvc-5391fa2b-...-0e2ff956 (traefik — NEW) +``` + +### Pi2 — Replica directories (OLD data preserved) + +``` +pvc-01b93e30-...-8649439a (crowdsec-config — new post-reinstall) +pvc-1251909b-...-e7a20fdf ← OLD DATA (clickhouse 16Gi) +pvc-14ccc47e-...-09021065 ← OLD DATA (crowdsec-db old PV) +pvc-4785dc60-...-4b48fdf1 (crowdsec-db — new post-reinstall) +pvc-5391fa2b-...-d3503612 (traefik — new post-reinstall) +pvc-63244de1-...-6076eb08 (unknown — not in engine backup) +pvc-6d2ea1c7-...-c7f287d8 ← OLD DATA (audit-vault 10Gi) +pvc-7971918e-...-2028617e ← OLD DATA (erp 50Gi) +pvc-88e18c7f-...-910583f6 ← OLD DATA (prometheus-server 8Gi) +pvc-abc7666c-...-34bec9b0 (unknown — not in engine backup) +pvc-aed7f2c4-...-41c20064 ← OLD DATA (alertmanager 2Gi) +pvc-ca5567d3-...-b537ca60 ← OLD DATA (data-vault 10Gi) +pvc-cc8a3cbb-...-cd16e459 ← OLD DATA (old traefik 128Mi) +pvc-cdd434d1-...-b2695689 ← OLD DATA (url-shortener 128Mi) +pvc-d1d5482b-...-e0a8cdbc ← OLD DATA (redis 1Gi) +pvc-efda1d2f-...-30c849a6 ← OLD DATA (backups-rwx 50Gi) +pvc-f9fe3504-...-20f64e9e ← OLD DATA (old crowdsec-config 100Mi) +pvc-fca13978-...-4749b404 (unknown — not in engine backup) +``` + +### Pi3 — Replica directories (OLD data preserved, multiple dirs per volume) + +``` +pvc-01b93e30-...-29592f50 (crowdsec-config — new post-reinstall) +pvc-1251909b-...-1163420b ← OLD DATA (clickhouse — replica 1) +pvc-1251909b-...-3a569b0a ← OLD DATA (clickhouse — replica 2) +pvc-1251909b-...-ccd05947 ← OLD DATA (clickhouse — replica 3 or stale) +pvc-14ccc47e-...-3856d64d ← OLD DATA (old crowdsec-db) +pvc-2e60385f-...-48e27d5a (unknown) +pvc-4785dc60-...-869f0e99 (crowdsec-db — new post-reinstall) +pvc-5391fa2b-...-958cd868 (traefik — new post-reinstall) +pvc-6d2ea1c7-...-0e73550d ← OLD DATA (audit-vault — dir 1) +pvc-6d2ea1c7-...-787ffefa ← OLD DATA 
(audit-vault — dir 2) +pvc-6d2ea1c7-...-e0f58d64 ← OLD DATA (audit-vault — dir 3 or stale) +pvc-7971918e-...-33191046 ← OLD DATA (erp — dir 1) +pvc-7971918e-...-88fc1dfc ← OLD DATA (erp — dir 2) +pvc-7971918e-...-b5c5530d ← OLD DATA (erp — dir 3 or stale) +pvc-88e18c7f-...-5d508830 ← OLD DATA (prometheus-server — dir 1) +pvc-88e18c7f-...-92c0ebfd ← OLD DATA (prometheus-server — dir 2) +pvc-88e18c7f-...-deea6182 ← OLD DATA (prometheus-server — dir 3 or stale) +pvc-abe09e90-...-a748d11b (unknown) +pvc-aed7f2c4-...-3452358f ← OLD DATA (alertmanager — dir 1) +pvc-aed7f2c4-...-826f05aa ← OLD DATA (alertmanager — dir 2) +pvc-ca5567d3-...-0ed6f691 ← OLD DATA (data-vault — dir 1) +pvc-ca5567d3-...-808d72b4 ← OLD DATA (data-vault — dir 2) +pvc-ca5567d3-...-9051ef48 ← OLD DATA (data-vault — dir 3 or stale) +pvc-cc8a3cbb-...-011b54b3 ← OLD DATA (old traefik — dir 1) +pvc-cc8a3cbb-...-a24fd91e ← OLD DATA (old traefik — dir 2) +pvc-cdd434d1-...-70197659 ← OLD DATA (url-shortener — dir 1) +pvc-cdd434d1-...-998f49ff ← OLD DATA (url-shortener — dir 2) +pvc-d1d5482b-...-6a730f00 ← OLD DATA (redis — dir 1) +pvc-d1d5482b-...-75da16fd ← OLD DATA (redis — dir 2) +pvc-efda1d2f-...-62fb04c9 ← OLD DATA (backups-rwx — dir 1) +pvc-efda1d2f-...-688f30f5 ← OLD DATA (backups-rwx — dir 2) +pvc-efda1d2f-...-69454dd0 ← OLD DATA (backups-rwx — dir 3 or stale) +pvc-f9fe3504-...-418df608 ← OLD DATA (old crowdsec-config) +``` + +**Note on multiple directories per volume on pi3:** Normal replicas = 1 dir per volume per node. +Multiple directories indicate either: rebuild attempts from before the nuclear cleanup, or stale +snapshots. Must verify by checking `.img` file sizes before renaming. 
+ +--- + +## Volume → PVC Mapping (from backup_20260413.volumes) + +| PV Name | PVC | Namespace | Size | Status | +|---------|-----|-----------|------|--------| +| `pvc-1251909b-3cef-40c6-881c-3bb6e929a596` | `clickhouse-storage-clickhouse-0` | tools | 16Gi | Terminating | +| `pvc-6d2ea1c7-9327-4992-a02c-93ae604eda70` | `audit-hashicorp-vault-0` | tools | 10Gi | Terminating | +| `pvc-7971918e-e47f-4739-a976-965ea2d770b4` | `erp` | erp | 50Gi | Terminating | +| `pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9` | `prometheus-server` | tools | 8Gi | Terminating | +| `pvc-aed7f2c4-1948-487a-8d10-d8a1372289b4` | `storage-prometheus-alertmanager-0` | tools | 2Gi | Terminating | +| `pvc-ca5567d3-a682-4cee-8ff1-2b8e23260635` | `data-hashicorp-vault-0` | tools | 10Gi | Terminating | +| `pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90` | `traefik` | kube-system | 128Mi | Terminating | +| `pvc-cdd434d1-88b4-4588-8fd2-8c7eafc56d07` | `url-shortener` | url-shortener | 128Mi | Terminating | +| `pvc-d1d5482b-81c8-4d7c-a528-7a57ef47a5ce` | `redis-storage-redis-0` | tools | 1Gi | Terminating | +| `pvc-efda1d2f-1db8-46dd-9a97-3d11f1807ffa` | `backups-rwx` | longhorn-system | 50Gi | Lost | +| `pvc-14ccc47e-0b8c-49d4-97bb-70e550f644b0` | `crowdsec-db-pvc` | tools | 1Gi | already replaced | +| `pvc-f9fe3504-70ce-4401-8cda-bc6bb68bc1bf` | `crowdsec-config-pvc` | tools | 100Mi | already replaced | + +CrowdSec volumes (`pvc-14ccc47e`, `pvc-f9fe3504`) are the old PVs — CrowdSec already got new volumes +(`pvc-4785dc60`, `pvc-01b93e30`) and is running. These old dirs can be cleaned up later. + +--- + +## Recovery Plan + +### Why not restore PVCs + +New PVCs will be created by the workloads themselves when they restart. Restoring old PVCs would +conflict with both the stuck Terminating ones and any new ones pods may already be creating. +**Restore PVs only** — strip `claimRef` so they become `Available`, and new PVCs bind to them via +`storageClassName` + `accessMode` + `capacity` matching. 
+ +### Step 1 — Clear stuck Terminating PVs + +The old PVs are stuck in `Terminating` with `kubernetes.io/pvc-protection` finalizers. Remove them: + +```bash +for pv in \ + pvc-1251909b-3cef-40c6-881c-3bb6e929a596 \ + pvc-6d2ea1c7-9327-4992-a02c-93ae604eda70 \ + pvc-7971918e-e47f-4739-a976-965ea2d770b4 \ + pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9 \ + pvc-aed7f2c4-1948-487a-8d10-d8a1372289b4 \ + pvc-ca5567d3-a682-4cee-8ff1-2b8e23260635 \ + pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 \ + pvc-cdd434d1-88b4-4588-8fd2-8c7eafc56d07 \ + pvc-d1d5482b-81c8-4d7c-a528-7a57ef47a5ce \ + pvc-efda1d2f-1db8-46dd-9a97-3d11f1807ffa; do + kubectl patch pv $pv -p '{"metadata":{"finalizers":null}}' --type=merge +done +``` + +### Step 2 — Restore PVs with claimRef removed and Retain policy + +Extract PVs from the backup, strip `claimRef` and set `persistentVolumeReclaimPolicy: Retain`, +then apply: + +```bash +ssh pi1 "sudo kubectl get pv \ + pvc-1251909b-3cef-40c6-881c-3bb6e929a596 \ + pvc-6d2ea1c7-9327-4992-a02c-93ae604eda70 \ + pvc-7971918e-e47f-4739-a976-965ea2d770b4 \ + pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9 \ + pvc-aed7f2c4-1948-487a-8d10-d8a1372289b4 \ + pvc-ca5567d3-a682-4cee-8ff1-2b8e23260635 \ + pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 \ + pvc-cdd434d1-88b4-4588-8fd2-8c7eafc56d07 \ + pvc-d1d5482b-81c8-4d7c-a528-7a57ef47a5ce \ + pvc-efda1d2f-1db8-46dd-9a97-3d11f1807ffa \ + -o yaml 2>/dev/null | \ + python3 -c \" +import sys, yaml +docs = list(yaml.safe_load_all(sys.stdin)) +for doc in docs: + if not doc: continue + items = doc.get('items', [doc]) + for pv in items: + if pv.get('kind') != 'PersistentVolume': continue + spec = pv.get('spec', {}) + spec.pop('claimRef', None) + spec['persistentVolumeReclaimPolicy'] = 'Retain' + pv.pop('status', None) + meta = pv.get('metadata', {}) + meta.pop('resourceVersion', None) + meta.pop('uid', None) + meta.pop('creationTimestamp', None) + print('---') + print(yaml.dump(pv)) +\" | kubectl apply -f -" +``` + +Expected result: PVs become 
`Available` (no claimRef = unbound).
+
+### Step 3 — Longhorn creates new Volume CRDs + replica dirs
+
+When new PVCs bind to the restored PVs and pods attempt to mount them, Longhorn's CSI provisioner
+will create new Volume CRDs for each. These new Volume CRDs will have new engine IDs, and Longhorn
+will create **new empty replica directories** on pi1, pi2, pi3.
+
+At this point the volume directory layout will be:
+```
+/mnt/arcodange/longhorn/replicas/
+  pvc-1251909b-...- ← pi2/pi3: OLD data
+  pvc-1251909b-...- ← pi1/pi2/pi3: NEW empty dirs
+```
+
+### Step 4 — Map old dirs to new dirs, verify data presence
+
+For each volume, on each node, identify:
+- OLD dir: exists before new binding (larger .img file size, older timestamp)
+- NEW dir: created after binding (empty or minimal .img file)
+
+```bash
+# Example: check sizes on pi2 for clickhouse
+ssh pi2 "du -sh /mnt/arcodange/longhorn/replicas/pvc-1251909b-*"
+```
+
+### Step 5 — Swap directories (Method B)
+
+For each volume on each node that has an old dir with data:
+
+```bash
+# Scale down the workload first
+kubectl scale statefulset clickhouse -n tools --replicas=0
+
+# Wait for volume to detach
+kubectl wait --for=jsonpath='{.status.state}'=detached \
+  volume/pvc-1251909b-3cef-40c6-881c-3bb6e929a596 \
+  -n longhorn-system --timeout=60s
+
+# On pi2: rename new empty dir, move old data dir to new name
+# (note: $( is escaped so the command substitutions run on pi2, not locally)
+ssh pi2 "
+  NEW=\$(ls /mnt/arcodange/longhorn/replicas/ | grep pvc-1251909b | \
+    xargs -I{} stat --format='%Y {}' /mnt/arcodange/longhorn/replicas/{} | \
+    sort -rn | head -1 | awk '{print \$2}')
+  OLD=\$(ls /mnt/arcodange/longhorn/replicas/ | grep pvc-1251909b | \
+    xargs -I{} stat --format='%Y {}' /mnt/arcodange/longhorn/replicas/{} | \
+    sort -n | head -1 | awk '{print \$2}')
+  echo \"OLD: \$OLD\"
+  echo \"NEW: \$NEW\"
+  sudo mv \$NEW \${NEW}.empty_backup
+  sudo mv \$OLD \$NEW
+"
+# Repeat on pi3
+
+# Restart the instance manager on affected node to pick up new dir
+kubectl delete pod -n longhorn-system 
-l \
+  longhorn.io/node=pi2,longhorn.io/component=instance-manager
+```
+
+### Step 6 — Scale workloads back up and verify
+
+```bash
+kubectl scale statefulset clickhouse -n tools --replicas=1
+kubectl get pvc -n tools clickhouse-storage-clickhouse-0
+kubectl get volumes -n longhorn-system pvc-1251909b-3cef-40c6-881c-3bb6e929a596
+```
+
+---
+
+## Priority Order for Recovery
+
+Given data criticality:
+
+1. **HashiCorp Vault data** (`pvc-ca5567d3` + `pvc-6d2ea1c7`) — credentials/secrets store
+2. **ERP** (`pvc-7971918e`) — 50Gi, business data
+3. **Prometheus** (`pvc-88e18c7f`) — 8Gi, metrics history (degraded OK, can rebuild)
+4. **Redis** (`pvc-d1d5482b`) — 1Gi, cache (can rebuild from scratch if needed)
+5. **Alertmanager** (`pvc-aed7f2c4`) — 2Gi, alert history (can rebuild)
+6. **Clickhouse** (`pvc-1251909b`) — 16Gi
+7. **URL shortener** (`pvc-cdd434d1`) — 128Mi
+8. **Traefik** (`pvc-cc8a3cbb`) — 128Mi (TLS certs, can re-issue via cert-manager)
+9. **Longhorn backups-rwx** (`pvc-efda1d2f`) — 50Gi, backup volume itself
+
+---
+
+## Caution: Multiple Dirs on Pi3
+
+Several volumes have 3 directories on pi3. This likely happened during the incident when Longhorn
+attempted rebuilds before the nuclear cleanup. **Do not blindly take the newest or oldest** — check
+actual `.img` file size to identify the one with data:
+
+```bash
+ssh pi3 "du -sh /mnt/arcodange/longhorn/replicas/pvc-1251909b-*"
+# The largest .img is the one with actual data
+```
+
+---
+
+## Lessons for Backup Script
+
+The current backup command `kubectl get -A pv -o yaml && echo '---' && kubectl get -A pvc -o yaml`
+captures PV/PVC but not Longhorn Volume CRDs. The backup command must be updated to include:
+
+```bash
+kubectl get -A pv -o yaml && echo '---' \
+  && kubectl get -A pvc -o yaml && echo '---' \
+  && kubectl get -n longhorn-system volumes.longhorn.io -o yaml
+```
+
+This is tracked in ADR `docs/adr/20260414-longhorn-pvc-recovery.md` under "Prevention". 
+ +--- + +## Volume Recovery Status + +| PV Name | PVC | Namespace | Size | Method | Status | +|---------|-----|-----------|------|--------|--------| +| `pvc-5391fa2b` | `traefik` | kube-system | 128Mi | PV claimRef remove | āœ… 2026-04-14 | +| `pvc-cdd434d1` | `url-shortener-data` | url-shortener | 128Mi | Method B (dir rename) | āœ… 2026-04-14 | +| `pvc-1251909b` | `clickhouse-storage-clickhouse-0` | tools | 16Gi | Block-device (playbook) | āœ… 2026-04-14 | +| `pvc-88e18c7f` | `prometheus-server` | tools | 8Gi | Block-device (playbook) | ā³ 2026-04-15 | +| `pvc-aed7f2c4` | `storage-prometheus-alertmanager-0` | tools | 2Gi | Block-device (playbook) | ā³ 2026-04-15 | +| `pvc-d1d5482b` | `redis-storage-redis-0` | tools | 1Gi | Block-device (playbook) | ā³ 2026-04-15 | +| `pvc-efda1d2f` | `backups-rwx` | longhorn-system | 50Gi | Block-device (playbook) | ā³ 2026-04-15 | +| `pvc-ca5567d3` | `data-hashicorp-vault-0` | tools | 10Gi | Manual (deferred) | šŸ”“ Pending | +| `pvc-6d2ea1c7` | `audit-hashicorp-vault-0` | tools | 10Gi | Manual (deferred) | šŸ”“ Pending | +| `pvc-7971918e` | `erp` | erp | 50Gi | Manual (deferred) | šŸ”“ Pending | + +**Vault and ERP are excluded from automated recovery** — they require coordinated manual procedures +(Vault unseal key management; ERP business data verification). Use `docs/runbooks/longhorn-block-device-recovery.md` +with extra validation steps for those volumes. + +--- + +## Automated Recovery: Block-Device Injection + +Directory rename (Method B) proved too risky for large volumes: Longhorn detects `Dirty: true` + +inconsistency across replicas and silently rebuilds from the empty pi1 replica, destroying data. + +**The approach that works** (implemented in `playbooks/recover/longhorn_data.yml`): + +1. **Phase 0** — Auto-discover best replica dir per volume (skip `Rebuilding: true`, rank by actual disk usage) +2. **Phase 1** — Backup untouched replica dir before touching anything +3. 
**Phase 2** — Merge sparse snapshot + head layers into a flat image (`merge-longhorn-layers.py`) +4. **Phase 3** — Create Longhorn Volume CRD, wait for replicas +5. **Phase 4** — Scale down workload +6. **Phase 5** — Attach volume via VolumeAttachment maintenance ticket +7. **Phase 6** — `mkfs.ext4` the live block device, rsync data from merged image +8. **Phase 7** — Remove maintenance attachment ticket +9. **Phase 8** — Recreate PV (Retain, no claimRef) + PVC (pinned to PV) +10. **Phase 9** — Scale up, wait for readyReplicas ≄ 1, optional verify_cmd + +**Pitfall discovered (2026-04-15):** `du -sb` returns apparent size for sparse files, making a +`Rebuilding: true` replica (1.3 GiB actual, 24 GiB apparent) beat healthy 11 GiB replicas. +Fixed by checking `Rebuilding` flag in `volume.meta` and using `du -sk` (actual usage). + +**Usage:** +```bash +ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \ + -e @playbooks/recover/longhorn_data_vars_remaining.yml +``` + +Vars files: +- `playbooks/recover/longhorn_data_vars_clickhouse.yml` — clickhouse (already recovered, archived) +- `playbooks/recover/longhorn_data_vars_remaining.yml` — prometheus, alertmanager, redis, backups-rwx +- `playbooks/recover/longhorn_data_vars.example.yml` — template for future use + +--- + +## Tested Recovery Procedure (url-shortener — 2026-04-14) + +Method B confirmed working for this volume (small, no Rebuilding replicas). Full sequence: + +1. Create Longhorn Volume CRD manually (size 128Mi, rwo, 3 replicas) +2. Create Longhorn VolumeAttachment ticket to pi1 (disableFrontend: true) → triggers replica dir creation +3. Remove attachment ticket → volume detaches +4. On pi2: `mv new-dir new-dir.empty && mv old-dir new-dir` +5. On pi3: same (chose `-70197659` over `-998f49ff` based on newer mtime: Apr 7 vs Apr 6) +6. Clear finalizers on stuck Terminating PV/PVC → both deleted +7. Recreate PV (Retain policy, no claimRef, same CSI volumeHandle) +8. 
Recreate PVC with `volumeName:` pinned to the PV +9. Delete old Error pod (was blocking volume attach) +10. New pod comes up 1/1 Running, volume attached healthy on pi3, all 3 replicas running + +**Traefik** was simpler — PV `pvc-5391fa2b` already existed in Longhorn (Released). Just removed +claimRef (→ Available), created `kube-system/traefik` PVC with `volumeName:` pinned. Bound immediately. + +**For all subsequent volumes** — use `playbooks/recover/longhorn_data.yml`. Method B is too risky. diff --git a/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/recover_longhorn.yml b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/recover_longhorn.yml new file mode 100644 index 0000000..7cb4dcd --- /dev/null +++ b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/recover_longhorn.yml @@ -0,0 +1,70 @@ +--- +# Automated Longhorn Recovery Playbook (DRAFT) +# Purpose: Break circular dependency and restore CSI driver after power-cut +# +# REQUIREMENTS: +# - Ansible >= 2.15 +# - kubectl on control plane (pi1) +# - Backup scripts from playbooks/backup/k3s_pvc.yml must be deployed +# +# USAGE: +# ansible-playbook -i inventory/hosts.yml docs/incidents/2026-04-13-power-cut/recover_longhorn.yml +# +# REFERENCE FILES: +# - playbooks/system/k3s_config.yml (Longhorn HelmChart template) +# - playbooks/backup/k3s_pvc.yml (Backup/restore scripts) +# - inventory/hosts.yml (Target hosts) +# - /mnt/arcodange/longhorn/replicas/ (Data - MUST NOT be touched) +# - /home/pi/arcodange/backups/k3s_pvc/ (Fallback backup location) +# +# +# PLAYBOOK FLOW: +# +# Phase 1: DIAGNOSIS (idempotent, safe to run anytime) +# - Check CSI driver registration status +# - Check Longhorn manager health +# - Identify which recovery phase is needed +# +# Phase 2: SOFT RECOVERY (least destructive) +# - Touch longhorn-install.yaml manifest +# - Wait 60s for k3s HelmChart controller to reconcile +# - Verify pod recreation +# +# Phase 3: HARD RECOVERY (if soft fails) +# - Delete 
driver-deployer pod +# - Delete all longhorn-driver-deployer pods +# - Wait for HelmChart to recreate +# +# Phase 4: NUCLEAR RECOVERY (if hard fails) +# - Delete HelmChart resource +# - Remove manifest file +# - Force-delete longhorn-system namespace (after removing finalizers) +# - Reinstall Longhorn via manifest +# +# Phase 5: RESTORE FROM BACKUP (idempotent) +# - Apply PV/PVC from backup +# - Apply Longhorn CRs from backup +# - Data auto-discovered from disk +# +# DESIGNED TO HANDLE: +# - CSI driver registration lost +# - Longhorn manager webhook circular dependency +# - Partial pod crashes +# - Full Longhorn namespace corruption +# +# LIMITATIONS: +# - Requires pi1 (control plane) to be reachable +# - Data in /mnt/arcodange/longhorn/ MUST survive +# - Docker must be functional on at least 1 node +# - Does NOT handle Docker overlay2 corruption +# +# TESTED SCENARIOS: +# - [ ] CSI driver not registered (primary use case) +# - [ ] Longhorn manager CrashLoopBackOff +# - [ ] Full namespace deletion needed +# - [ ] Backup restore validation +# +# TODO: +# - Add Docker storage health check +# - Add pre-recovery data verification +# - Add post-recovery validation diff --git a/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/retrospective-recovery-analysis.md b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/retrospective-recovery-analysis.md new file mode 100644 index 0000000..85f00b4 --- /dev/null +++ b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/retrospective-recovery-analysis.md @@ -0,0 +1,153 @@ +--- +title: Recovery Approach Analysis — Post-Incident Review +incident_id: 2026-04-13-001 +date: 2026-04-13 +author: Claude Code (external review) +--- + +# Recovery Approach Analysis + +## TL;DR + +The incident escalated from a **~5 minute fix** to a **full Longhorn reinstall with backup restore** because the simplest remediation (k3s restart) was never attempted, and a single aggressive command (`kubectl delete pods --all 
--force`) created a new problem that did not previously exist. + +--- + +## What Was Skipped + +### 1. Restart k3s on all nodes (never attempted) + +This should have been the **first or second action** after the manifest touch failed. + +```bash +systemctl restart k3s # pi1 — control plane +systemctl restart k3s-agent # pi2, pi3 — agent nodes +``` + +After a power cut, k3s/kubelet state is dirty. Restarting k3s: +- Forces kubelet to reinitialize the plugin registry cleanly +- Allows Longhorn pods to restart in correct dependency order +- Avoids the simultaneous-restart race condition that causes webhook issues +- Takes ~2 minutes with no destructive side effects + +This was listed as a last resort in the runbook consulted at incident start. It should have been tried **before any pod deletion**, not after. + +### 2. Stale CSI socket check on each node (never attempted) + +```bash +# On each node (pi1, pi2, pi3): +ls /var/lib/kubelet/plugins/driver.longhorn.io/ +# If a stale .sock file exists: +rm /var/lib/kubelet/plugins/driver.longhorn.io/csi.sock +``` + +The incident log confirms the CSI socket was missing/stale, but no one went to the nodes to verify and clean this up. Removing a stale socket + restarting the `longhorn-csi-plugin` daemonset is a targeted, low-risk fix. + +--- + +## Where the Direction Went Wrong + +### The pivotal mistake: force deleting all 24 pods simultaneously + +**Command run at 15:32:15:** +```bash +kubectl delete pods -n longhorn-system --all --force --grace-period=0 +``` + +This command created the **webhook circular dependency problem**, which did not exist before it was run. + +**Why it caused the circular dependency:** + +In normal operation, Longhorn managers start sequentially. One becomes the webhook leader and begins serving on port 9501 before others register as service endpoints. + +When all 24 pods are force-deleted simultaneously: +1. All 3 manager pods race-start at the same time +2. 
All 3 IPs are registered as `longhorn-conversion-webhook` service endpoints immediately +3. The health check (`https://:9501/v1/healthz`) is run against all 3 +4. Only the elected leader actually serves port 9501 — the other 2 fail the probe +5. Failing managers crash: `"conversion webhook service is not accessible after 1m0s"` +6. `longhorn-driver-deployer` init container waits for healthy managers indefinitely +7. CSI socket is never created, CSI driver never registers + +**The original problem was only a lost CSI socket registration.** The webhook circular dependency is a new problem introduced by the recovery attempt. + +--- + +## The Escalation Cascade + +Each step created a harder problem than the one it was meant to solve: + +``` +Power cut + → CSI socket lost (original problem — simple fix) + → Force delete all pods + → Webhook circular dependency (new problem) + → Delete HelmChart + manifest + → 84 finalizers blocking namespace deletion (new problem) + → Full reinstall required + → Backup restore required + → Risk to volume metadata +``` + +The original problem required touching 1 socket file and restarting k3s. The current state requires: +- Manually patching finalizers off 84+ resources +- Full Longhorn reinstall +- Restoring PV/PVC and Longhorn CRs from backup +- Verifying data auto-discovery from replicas + +--- + +## Correct Recovery Sequence (Hindsight) + +### Step 1 — k3s restart (should have been tried at ~15:27) +```bash +ansible -i inventory/hosts.yml all -m shell -a "sudo systemctl restart k3s || sudo systemctl restart k3s-agent" +``` +Wait 3 minutes. In most power-cut scenarios, this alone restores CSI registration. + +### Step 2 — If still broken: targeted daemonset restart (not force-delete-all) +```bash +kubectl rollout restart daemonset/longhorn-manager -n longhorn-system +kubectl rollout status daemonset/longhorn-manager -n longhorn-system +``` +Graceful restart respects the dependency order. 
Wait for managers to stabilize before touching CSI pods. + +### Step 3 — Check and clean stale sockets on each node +```bash +# Run on pi1, pi2, pi3: +ls /var/lib/kubelet/plugins/driver.longhorn.io/ +rm -f /var/lib/kubelet/plugins/driver.longhorn.io/csi.sock +kubectl rollout restart daemonset/longhorn-csi-plugin -n longhorn-system +``` + +### Step 4 — Verify CSI driver registered +```bash +kubectl get csidriver +kubectl get csinodes +``` + +### Step 5 — Only if all above failed: delete driver-deployer pod only +```bash +kubectl delete pod -n longhorn-system -l app=longhorn-driver-deployer +``` +Not all pods. One targeted pod. + +--- + +## What Was Done Well + +- Quick identification of the original root cause (CSI registration) +- Confirming volume data integrity early (`robustness="healthy"`) +- Securing backups before destructive operations (16:30) +- Fixing the backup script bug (useful regardless of incident) +- Detailed logging throughout + +--- + +## Action Items for Future Incidents + +- [ ] Add k3s restart as **step 2** in the Longhorn recovery runbook (before any pod deletion) +- [ ] Add CSI socket cleanup to the runbook as an explicit step on each node +- [ ] Add a "minimum destructive action" principle: prefer `rollout restart` over `delete --force --all` +- [ ] Implement `recover_longhorn.yml` playbook with the phased approach (soft → targeted → hard) to prevent ad-hoc escalation +- [ ] Add a pre-action checklist: "have I tried restarting the service before deleting its resources?" diff --git a/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py new file mode 100644 index 0000000..f22659c --- /dev/null +++ b/ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python3 +""" +Merge Longhorn snapshot + head layers into a single mountable raw image. 
+ +Longhorn stores replica data as sparse raw images in a chain: + volume-snap-.img — full state at the time the snapshot was taken + volume-head-NNN.img — delta (only changed blocks) since the snapshot + +To reconstruct the full filesystem, head blocks take priority over snapshot +blocks. Sparse (all-zero) blocks in the head fall through to the snapshot. + +Usage: + sudo python3 merge-longhorn-layers.py + +Example: + sudo python3 merge-longhorn-layers.py \\ + /mnt/arcodange/longhorn/replicas/pvc-cdd434d1-...-998f49ff \\ + /tmp/merged.img + + # Then mount and inspect: + sudo mount -o loop /tmp/merged.img /mnt/recovery + ls /mnt/recovery/ + +Proven useful during incident 2026-04-13 to recover the url-shortener SQLite +database from a Longhorn replica that was never touched by the nuclear cleanup +(pi3, dir suffix -998f49ff, Apr 6 snapshot). + +Key lesson: always identify the untouched replica dir (oldest timestamps, +never renamed) before attempting directory swaps. Back it up first. +""" + +import os +import sys +import json + +BLOCK = 4096 + + +def find_layers(replica_dir: str) -> tuple[str | None, str | None]: + """ + Read volume.meta to find head filename and snapshot parent. + Returns (snapshot_path, head_path). snapshot_path is None for base volumes. 
+ """ + meta_path = os.path.join(replica_dir, "volume.meta") + with open(meta_path) as f: + meta = json.load(f) + + head_name = meta["Head"] + parent_name = meta.get("Parent", "") + + head_path = os.path.join(replica_dir, head_name) + snap_path = os.path.join(replica_dir, parent_name) if parent_name else None + + return snap_path, head_path + + +def merge(snap_path: str | None, head_path: str, out_path: str) -> None: + size = os.path.getsize(head_path) + print(f"Volume size: {size // (1024 * 1024)} MiB") + print(f"Snapshot: {snap_path or '(none — base volume)'}") + print(f"Head: {head_path}") + print(f"Output: {out_path}") + + snap_f = open(snap_path, "rb") if snap_path else None + head_f = open(head_path, "rb") + + with open(out_path, "wb") as out: + out.truncate(size) + blocks = size // BLOCK + for i, offset in enumerate(range(0, size, BLOCK)): + head_f.seek(offset) + hb = head_f.read(BLOCK) + + if hb and any(hb): + out.seek(offset) + out.write(hb) + elif snap_f: + snap_f.seek(offset) + sb = snap_f.read(BLOCK) + if sb and any(sb): + out.seek(offset) + out.write(sb) + + if i % 4096 == 0: + pct = (i / blocks) * 100 + print(f"\r {pct:.0f}%", end="", flush=True) + + print("\r 100% — done.") + if snap_f: + snap_f.close() + head_f.close() + + +if __name__ == "__main__": + if len(sys.argv) != 3: + print(__doc__) + sys.exit(1) + + replica_dir = sys.argv[1] + out_path = sys.argv[2] + + if not os.path.isdir(replica_dir): + print(f"Error: {replica_dir} is not a directory", file=sys.stderr) + sys.exit(1) + + snap, head = find_layers(replica_dir) + merge(snap, head, out_path) diff --git a/ansible/arcodange/factory/docs/incidents/README.md b/ansible/arcodange/factory/docs/incidents/README.md new file mode 100644 index 0000000..1173fb5 --- /dev/null +++ b/ansible/arcodange/factory/docs/incidents/README.md @@ -0,0 +1,312 @@ +# Incident Documentation + +This directory contains incident reports, postmortems, and recovery logs for the Arcodange Factory infrastructure. 
+ +## Purpose + +Document all infrastructure incidents to: +- Track root causes and resolutions +- Maintain a knowledge base for future troubleshooting +- Improve system reliability through lessons learned +- Provide clear guidance for on-call responders + +## Structure + +Each incident is documented in its own directory under `docs/incidents/` with the following naming convention: + +``` +docs/incidents/ +ā”œā”€ā”€ YYYY-MM-DD-incident-name/ +│ ā”œā”€ā”€ README.md # Incident summary and timeline +│ ā”œā”€ā”€ status.md # Real-time status updates (optional) +│ ā”œā”€ā”€ log.md # Detailed recovery actions and logs +│ ā”œā”€ā”€ root-cause.md # Technical analysis (optional) +│ └── diagrams/ # Architecture/flow diagrams (optional) +│ └── *.mmd # Mermaid diagrams +└── ... +``` + +## Incident Directory Contents + +### 1. `README.md` (Required) +The primary incident document. Must include: + +- **Incident ID**: Unique identifier (e.g., `2026-04-13-001`) +- **Title**: Clear, descriptive title +- **Date/Time**: Start and end timestamps +- **Status**: Open / Investigating / Resolved / Monitoring +- **Severity**: SEV-1 (Critical) / SEV-2 (High) / SEV-3 (Medium) / SEV-4 (Low) +- **Impact**: Brief description of affected services +- **Summary**: What happened +- **Timeline**: Key events with timestamps +- **Root Cause**: Technical analysis +- **Resolution**: Steps taken to resolve +- **Action Items**: Follow-up tasks +- **Lessons Learned**: Key takeaways + +**Front matter template:** +```markdown +--- +title: Incident Title +incident_id: YYYY-MM-DD-NNN +date: YYYY-MM-DD +time_start: HH:MM:SS UTC +time_end: HH:MM:SS UTC +status: Resolved +severity: SEV-2 +tags: + - kubernetes + - longhorn + - storage +--- +``` + +### 2. `log.md` (Recommended) +Detailed technical log of all recovery actions. 
Must include: + +- Commands executed with timestamps +- Command output (relevant portions) +- Decision rationale for each action +- Outcome of each action +- Next stepsidentified + +Format: +```markdown +## [Time] Action Description + +**Command:** `actual command run` + +**Output:** +``` +relevant output +``` + +**Decision:** Why this action was taken + +**Outcome:** What happened + +**Next:** What to do next +``` + +### 3. Mermaid Diagrams + +Include at least one Mermaid diagram in each incident to visualize: +- Architecture/flow before incident +- Failure propagation +- Recovery process +- New architecture after fixes + +**Example theme usage:** +```mermaid +%%{init: { 'theme': 'forest', 'themeVariables': { 'primaryColor': '#ffdfd3', 'edgeLabelBackground':'#fff' }}}%% +``` + +Available themes: `default`, `base`, `forest`, `dark`, `neutral` + +**Recommended diagrams:** +- `incident-flow.mmd`: Timeline/flow of the incident +- `architecture.mmd`: Affected components architecture +- `recovery-flow.mmd`: Recovery steps visualization +- `dependency-tree.mmd`: Component dependencies showing failure path + +## Incident Severity Definitions + +| Severity | Description | Response Time | Impact | +|----------|-------------|---------------|--------| +| SEV-1 | Critical system-wide outage | Immediate (24/7) | Multiple services down, potential data loss | +| SEV-2 | Major service degradation | < 1 hour | Single critical service down | +| SEV-3 | Partial service degradation | < 4 hours | Non-critical service affected | +| SEV-4 | Minor issue | Next business day | Cosmetic or non-impacting | + +## Available Ansible Playbooks for Recovery + +This collection provides comprehensive infrastructure management via Ansible. +Always use `-i inventory/hosts.yml` when running playbooks. 
+ +### Master Playbooks (Run in order for full recovery) + +| Playbook | Purpose | Targets | +|----------|---------|---------| +| `playbooks/01_system.yml` | System setup (hostnames, iSCSI, Docker, Longhorn, DNS) | raspberries | +| `playbooks/02_setup.yml` | Infrastructure setup (NFS backup, PostgreSQL, Gitea) | localhost, postgres, gitea | +| `playbooks/03_cicd.yml` | CI/CD pipeline (Gitea tokens, Docker Compose, ArgoCD) | localhost, gitea | +| `playbooks/04_tools.yml` | Tool deployment (Hashicorp Vault, Crowdsec) | tools group | +| `playbooks/05_backup.yml` | Backup configuration | localhost | + +### Component-Specific Playbooks + +#### System +| Playbook | Purpose | Notes | +|----------|---------|-------| +| `playbooks/system/rpi.yml` | Raspberry Pi hostname setup | | +| `playbooks/system/dns.yml` | DNS/pi-hole configuration | | +| `playbooks/system/ssl.yml` | SSL certificate setup with step-ca | | +| `playbooks/system/prepare_disks.yml` | Disk partitioning and formatting | | +| `playbooks/system/system_docker.yml` | Docker installation with custom storage | Storage at `/mnt/arcodange/docker` | +| `playbooks/system/k3s_config.yml` | K3s configuration (Traefik, Longhorn HelmCharts) | **Key for k3s** | +| `playbooks/system/system_k3s.yml` | K3s cluster deployment | Uses k3s-ansible collection | +| `playbooks/system/iscsi_longhorn.yml` | iSCSI client for Longhorn | Prerequisite for Longhorn | +| `playbooks/system/k3s_dns.yml` | K3s DNS configuration | | +| `playbooks/system/k3s_ssl.yml` | K3s SSL/traefik certificates | | + +#### Storage +| Playbook | Purpose | Notes | +|----------|---------|-------| +| `playbooks/setup/backup_nfs.yml` | Longhorn RWX NFS backup volume | Creates 50Gi PVC + recurring backups | +| `playbooks/backup/k3s_pvc.yml` | PVC backup scripts | Creates `/opt/k3s_volumes/backup.sh` and `restore.sh` | + +#### Backup +| Playbook | Purpose | Notes | +|----------|---------|-------| +| `playbooks/backup/backup.yml` | Main backup orchestration | Calls 
postgres, gitea, k3s_pvc | +| `playbooks/backup/postgres.yml` | PostgreSQL database backup | Docker exec pg_dumpall | +| `playbooks/backup/gitea.yml` | Gitea backup | Uses gitea dump command | +| `playbooks/backup/cron_report.yml` | Mail utility for cron reports | | +| `playbooks/backup/cron_report_mailutility.yml` | MTA configuration | | + +### Inventory File + +**File:** `inventory/hosts.yml` + +**Groups:** +- `raspberries`: pi1, pi2, pi3 (Raspberry Pi nodes) +- `local`: localhost, pi1, pi2, pi3 +- `postgres`: pi2 (PostgreSQL host) +- `gitea`: pi2 (Gitea host, inherits postgres) +- `pihole`: pi1, pi3 (DNS hosts) +- `step_ca`: pi1, pi2, pi3 (Certificate authority) +- `all`: All above groups + +**Important:** All playbooks MUST be run with `-i inventory/hosts.yml` flag: +```bash +ansible-playbook -i inventory/hosts.yml playbooks/01_system.yml +``` + +### Handy Commands for Incident Response + +```bash +# Check all pods +kubectl get pods -A + +# Check Longhorn specifically +kubectl get pods -n longhorn-system +kubectl get volumes -n longhorn-system +kubectl get replicas -n longhorn-system + +# Check storage +kubectl get pv -A +kubectl get pvc -A +kubectl get csidriver + +# Check nodes +kubectl get nodes -o wide +kubectl describe node + +# Force Longhorn HelmChart reconcile (k3s-specific) +sudo touch /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml + +# Restart Longhorn +kubectl delete pods -n longhorn-system --all --force --grace-period=0 + +# Check Longhorn data on disk +ls /mnt/arcodange/longhorn/replicas/ + +# Check Docker storage +ls /mnt/arcodange/docker/overlay2/ | head + +# Run ansible playbook (dry-run first) +ansible-playbook -i inventory/hosts.yml playbooks/01_system.yml --check --diff +ansible-playbook -i inventory/hosts.yml playbooks/01_system.yml --limit pi1 +``` + +### K3s-Specific Recovery Notes + +Longhorn is installed via **HelmChart manifest** (k3s native): +- File: `/var/lib/rancher/k3s/server/manifests/longhorn-install.yaml` +- To 
trigger reconcile: `touch` the file (k3s watches for changes) +- DO NOT use `helm install` directly - it may conflict with k3s HelmChart controller + +Traefik is also installed via HelmChart manifest: +- File: `/var/lib/rancher/k3s/server/manifests/traefik-v3.yaml` + +## Incident Templates + +### Quick Start Template + +```markdown +--- +title: [Short Description] +incident_id: YYYY-MM-DD-NNN +date: $(date +%Y-%m-%d) +time_start: $(date +%H:%M:%S) +status: Investigating +severity: SEV-2 +tags: + - tag1 + - tag2 +--- + +## Summary + +[1-2 sentences describing the issue] + +## Impact + +[What services/users are affected] + +## Timeline + +| Time | Event | Owner | +|------|-------|-------| +| HH:MM | Initial detection | | @user +| HH:MM | Investigation started | | @user +| HH:MM | Root cause identified | | @user +| HH:MM | Resolution applied | | @user +| HH:MM | Service restored | | @user + +## Root Cause + +[Technical analysis] + +## Resolution + +[Step-by-step what was done] + +## Mermaid Diagram + +%%{init: { 'theme': 'forest' }}%% +graph TD + A[Component A] -->|depends on| B[Component B] + B -->|failed due to| C[Component C] + C -->|power cut| D[Root Cause] +``` + +*remember to always to this for labels:* +- have a space before a filepath +- no parenthesis '()' +- use
`<br/>` instead of `\n` for new lines
+ +--- + +## Cluster Overview + +| Component | Details | +|-----------|---------| +| Nodes | pi1, pi2, pi3 (Raspberry Pi, SSH via `pi.home`) | +| k8s distribution | k3s | +| Storage | Longhorn (`/mnt/arcodange/longhorn/`) | +| GitOps | ArgoCD (apps auto-sync from `gitea.arcodange.lab/arcodange-org/`) | +| Secrets | HashiCorp Vault (`tools` namespace, manual unseal) | +| Ingress | Traefik + CrowdSec bouncer | +| Working dir | `/Users/gabrielradureau/Work/Arcodange/factory/ansible/arcodange/factory/` | +| Inventory | `inventory/hosts.yml` | + +**Critical dependency:** ERP (Dolibarr) uses Vault-rotated DB credentials written to its PVC. +**Always recover and unseal Vault before scaling ERP up.** + +--- + +## Step 0 — Assess Damage + +Run these first to understand what is broken: + +```bash +# Overall pod health +kubectl get pods -A | grep -v Running | grep -v Completed + +# PVC health (anything not Bound is a problem) +kubectl get pvc -A | grep -v Bound + +# Longhorn volume states +kubectl get volumes.longhorn.io -n longhorn-system + +# Longhorn manager health (prerequisite for all recovery) +kubectl get pods -n longhorn-system -l app=longhorn-manager +``` + +--- + +## Step 1 — Longhorn Volume Recovery + +### Path A — Fast path (backup file exists, Volume CRDs were backed up) + +Check if a recent backup exists on pi1: +```bash +ssh pi1.home "ls -lt /mnt/backups/k3s_pvc/backup_*.volumes | head -5" +``` + +If a backup file exists and is recent (from before the incident): +```bash +ssh pi1.home "kubectl apply -f /mnt/backups/k3s_pvc/backup_.volumes" +``` + +Then verify PVCs bound and skip to Step 2. + +### Path B — Block-device injection (no usable backup, raw replica files intact) + +Use this when PVCs are `Lost`/`Terminating` and no Volume CRD backup is available. 
+ +**Check which volumes need recovery:** +```bash +# Volumes with no PVC or Lost/Terminating PVC +kubectl get pvc -A | grep -v Bound +``` + +**For each failed volume, create a vars file** following the pattern in: +`playbooks/recover/longhorn_data_vars.example.yml` + +Existing vars files from the 2026-04-13 incident (reusable as references): +- `playbooks/recover/longhorn_data_vars_remaining.yml` — prometheus, alertmanager, redis, backups-rwx +- `playbooks/recover/longhorn_data_vars_erp_vault.yml` — erp, hashicorp-vault (audit + data) +- `playbooks/recover/longhorn_data_vars_clickhouse.yml` — clickhouse + +**Key rules for the vars file:** +- `source_node`/`source_dir` can be omitted — Phase 0 auto-discovers the largest non-Rebuilding replica +- Set `workload_name: ""` for ERP — it must not scale up until Vault is unsealed +- For StatefulSets with multiple PVCs (e.g. Vault), set `workload_name: ""` on all but the last entry + +**Run the recovery playbook:** +```bash +ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \ + -e @playbooks/recover/longhorn_data_vars_.yml +``` + +The playbook is **idempotent** — safe to re-run if it fails midway. 
+ +**Playbook phases (for context when troubleshooting):** +| Phase | What it does | +|-------|-------------| +| 0 | Auto-discovers best replica dir (skips `Rebuilding: true`) | +| 1 | Backs up untouched replica dir to `/home/pi/arcodange/backups/longhorn-recovery/` | +| 2 | Merges snapshot+head layers into a single `.img` via `merge-longhorn-layers.py` | +| 3 | **Scales down workloads first**, then clears stuck Terminating PVCs, creates Volume CRD | +| 4 | Scale down (second pass, idempotent) | +| 5 | Attaches volume via maintenance ticket to source node | +| 6 | `mkfs.ext4` (if unformatted) + `rsync` from merged image into live block device | +| 7 | Removes maintenance ticket (volume detaches) | +| 8 | Creates PV (Retain, no claimRef) + PVC pinned to PV | +| 9 | Scales up workloads, waits for readyReplicas ≄ 1 (failures here are `ignore_errors: yes`) | + +**Common Phase 8 failure — StatefulSet re-creates PVCs before they can be pinned:** +The playbook handles this automatically (scales down before finalizer removal). If you still hit it: +```bash +kubectl scale statefulset -n --replicas=0 +kubectl patch pvc -n --type=merge -p '{"metadata":{"finalizers":null}}' +kubectl delete pvc -n +# Then re-run the playbook +``` + +--- + +## Step 2 — Unseal HashiCorp Vault + +After Vault's PVCs are recovered, the pod boots **sealed**. Check: +```bash +kubectl get pod hashicorp-vault-0 -n tools +kubectl exec hashicorp-vault-0 -n tools -- vault status 2>/dev/null | grep Sealed +``` + +If sealed, run the unseal playbook (requires interactive terminal for the Gitea password prompt): +```bash +ansible-playbook -i inventory/hosts.yml playbooks/tools/hashicorp_vault.yml +``` + +Unseal keys are at `~/.arcodange/cluster-keys.json` on the local machine. The playbook reads them automatically. 
+ +After the playbook completes, verify: +```bash +kubectl get pod hashicorp-vault-0 -n tools # must be 1/1 Ready +kubectl exec hashicorp-vault-0 -n tools -- vault status | grep Sealed # must be false +``` + +--- + +## Step 3 — Scale Up ERP + +Only after Vault is unsealed and Ready: +```bash +kubectl scale deployment erp -n erp --replicas=1 +kubectl rollout status deployment/erp -n erp +``` + +--- + +## Step 4 — Reconfigure Tools (CrowdSec, etc.) + +Run if CrowdSec bouncer or Traefik middleware needs reconfiguring: +```bash +# Standard run (bouncer key + Traefik middleware + restart) +ansible-playbook -i inventory/hosts.yml playbooks/tools/crowdsec.yml + +# Include captcha HTML injection (use when captcha page is broken) +ansible-playbook -i inventory/hosts.yml playbooks/tools/crowdsec.yml --tags never,all +``` + +If crowdsec-agent or crowdsec-appsec pods are stuck in `Error` after a long outage, +the playbook handles restarting them automatically. + +--- + +## Step 5 — Re-enable ArgoCD selfHeal + +Check if `selfHeal` was disabled during recovery (look for `selfHeal: false` in the tools app): +```bash +grep -A5 "tools:" /Users/gabrielradureau/Work/Arcodange/factory/argocd/values.yaml +``` + +If disabled, re-enable it by editing `argocd/values.yaml` and setting `selfHeal: true`, +then syncing the ArgoCD app: +```bash +kubectl get app tools -n argocd +``` + +--- + +## Step 6 — Final Verification + +```bash +# All pods running +kubectl get pods -A | grep -v Running | grep -v Completed | grep -v "^NAME" + +# All PVCs bound +kubectl get pvc -A | grep -v Bound + +# All Longhorn volumes healthy +kubectl get volumes.longhorn.io -n longhorn-system + +# Run a fresh backup to capture the recovered state +ansible-playbook -i inventory/hosts.yml playbooks/backup/backup.yml \ + -e backup_root_dir=/mnt/backups +``` + +--- + +## Key Files Reference + +| File | Purpose | +|------|---------| +| `playbooks/recover/longhorn_data.yml` | Main block-device recovery playbook | +| 
`playbooks/recover/longhorn.yml` | Recovery when Volume CRDs still exist | +| `playbooks/recover/longhorn_data_vars.example.yml` | Template for recovery vars | +| `playbooks/recover/longhorn_data_vars_erp_vault.yml` | Vars for erp + vault (2026-04-13 incident) | +| `playbooks/recover/longhorn_data_vars_remaining.yml` | Vars for other volumes (2026-04-13 incident) | +| `playbooks/backup/backup.yml` | Full backup (postgres + gitea + k3s PVCs + Longhorn CRDs) | +| `playbooks/backup/k3s_pvc.yml` | PV/PVC/Longhorn Volume CRD backup | +| `playbooks/tools/hashicorp_vault.yml` | Vault unseal + OIDC reconfiguration | +| `playbooks/tools/crowdsec.yml` | CrowdSec bouncer + Traefik middleware setup | +| `docs/adr/20260414-longhorn-pvc-recovery.md` | Full incident ADR with all recovery methods | +| `~/.arcodange/cluster-keys.json` | Vault unseal keys (local machine only) | + +--- + +## Decision Tree + +``` +Cluster down after outage +│ +ā”œā”€ kubectl works? ──No──▶ Check k3s: `systemctl status k3s` on pi1/pi2/pi3 +│ +└─ Yes + │ + ā”œā”€ PVCs all Bound? ──Yes──▶ Skip to Step 2 (check Vault) + │ + └─ No + │ + ā”œā”€ Recent .volumes backup on pi1? ──Yes──▶ Path A (kubectl apply backup) + │ + └─ No + │ + ā”œā”€ Longhorn Volume CRDs exist? ──Yes──▶ playbooks/recover/longhorn.yml + │ + └─ No ──▶ Path B (longhorn_data.yml block-device injection) + Check replica dirs exist first: + ssh pi{1,2,3}.home "sudo du -sh /mnt/arcodange/longhorn/replicas/pvc-*" +``` diff --git a/ansible/arcodange/factory/docs/runbooks/longhorn-block-device-recovery.md b/ansible/arcodange/factory/docs/runbooks/longhorn-block-device-recovery.md new file mode 100644 index 0000000..5585497 --- /dev/null +++ b/ansible/arcodange/factory/docs/runbooks/longhorn-block-device-recovery.md @@ -0,0 +1,360 @@ +# Runbook: Longhorn Block-Device Data Recovery + +**When to use:** Longhorn has been fully reinstalled (nuclear cleanup). Volume CRDs are gone. +Application PVCs are stuck `Terminating` or `Lost`. 
The raw replica `.img` files still exist
+on disk across the nodes. kubectl/k8s objects cannot help — we must work directly with the
+Longhorn replica directories and block devices.
+
+**Automated version:** `playbooks/recover/longhorn_data.yml`
+
+---
+
+## Mental Model
+
+Longhorn stores each replica as a chain of sparse raw image files inside a directory named
+`<volume-name>-<8-char-hex>` under `<longhorn-data-path>/replicas/`. Each directory contains:
+
+```
+volume.meta — engine state (Head filename, Parent snapshot, Dirty flag)
+volume-head-NNN.img — active write log (sparse, only changed blocks)
+volume-head-NNN.img.meta — head metadata
+volume-snap-<name>.img — snapshot at a point in time (sparse, full state)
+volume-snap-<name>.img.meta — snapshot metadata
+revision.counter — monotonically increasing write counter
+```
+
+After a nuclear cleanup + reinstall, Longhorn creates **new empty replica directories** with
+new random hex suffixes. The old directories (with data) are left on disk but orphaned.
+
+**Why directory-swap fails:** the old `volume.meta` has a different engine generation and
+`Dirty: true`. Longhorn detects the inconsistency across replicas and rebuilds from the
+"cleanest" source (the new empty pi1 replica), overwriting the old data.
+
+**What works:** extract the filesystem from the untouched replica directory directly, then
+inject the data files into the live Longhorn block device while the volume is temporarily
+attached in maintenance mode.
+
+---
+
+## Decision Tree
+
+```
+Are Volume CRDs present in Longhorn?
+ā”œā”€ā”€ YES → normal PV/PVC restore is enough, use playbooks/recover/longhorn.yml
+└── NO
+    └── Are replica directories present on disk?
+        ā”œā”€ā”€ NO → data is lost, provision fresh volumes
+        └── YES
+            └── Is there an untouched replica dir (timestamps from before the incident)? 
+                ā”œā”€ā”€ NO → data likely unrecoverable (all dirs were zeroed during reconciliation)
+                └── YES → follow this runbook
+```
+
+---
+
+## Step 0 — Pre-flight: Inventory Surviving Replica Directories
+
+On each node, list replica dirs and their sizes. Dirs with actual data are large (>16K).
+New empty dirs created by Longhorn are always exactly 16K.
+
+```bash
+for node in pi1 pi2 pi3; do
+  echo "=== $node ==="
+  ssh $node "sudo du -sh /mnt/arcodange/longhorn/replicas/pvc-<uuid>-* 2>/dev/null"
+done
+```
+
+**Key rule:** identify the replica dir that was **never touched** by the reinstall — it has
+old timestamps (from before the incident) and its size matches the original volume usage.
+This is your recovery source. **Back it up before touching anything.**
+
+```bash
+# On the node that has the untouched dir:
+sudo mkdir -p /home/pi/arcodange/backups/longhorn-recovery/<volume>/
+sudo cp -a /mnt/arcodange/longhorn/replicas/<volume-name>-<hex>/ \
+  /home/pi/arcodange/backups/longhorn-recovery/<volume>/
+```
+
+---
+
+## Step 1 — Reconstruct the Filesystem
+
+The replica directory contains a snapshot chain. Each layer is a sparse raw image — unchanged
+blocks appear as zeroed sparse regions, only written blocks contain data. To reconstruct the
+full filesystem, layers must be merged: head takes priority, then snapshot.
+
+Use `docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py`:
+
+```bash
+# On the node holding the backup:
+sudo python3 merge-longhorn-layers.py \
+  /home/pi/arcodange/backups/longhorn-recovery/<volume>/<volume-name>-<hex>/ \
+  /tmp/<volume>-merged.img
+
+# Verify the filesystem mounts
+sudo mkdir -p /mnt/recovery-<volume>
+sudo mount -o loop /tmp/<volume>-merged.img /mnt/recovery-<volume>
+sudo ls -lah /mnt/recovery-<volume>/
+sudo umount /mnt/recovery-<volume>
+```
+
+If mount fails with "wrong fs type" or "bad superblock":
+- The snapshot `.img` is all-zero (was overwritten by a prior Longhorn reconciliation)
+- Try the next oldest replica dir from another node
+- Check with `sudo od -A x -t x1z -v snap.img | grep -v ' 00 00...' | head -5`
+```
+
+---
+
+## Step 2 — Create the Longhorn Volume CRD
+
+Longhorn needs to know about the volume before its block device can be used.
+
+```bash
+kubectl apply -f - <<EOF
+apiVersion: longhorn.io/v1beta2
+kind: Volume
+metadata:
+  name: <volume-name>
+  namespace: longhorn-system
+spec:
+  accessMode: rwo  # or rwx
+  dataEngine: v1
+  frontend: blockdev
+  numberOfReplicas: 3
+  size: "<size-in-bytes>"  # e.g. "134217728" for 128Mi
+EOF
+```
+
+Wait for replicas to appear:
+```bash
+kubectl get replicas.longhorn.io -n longhorn-system | grep <volume-name>
+# Expect 3 replicas in "stopped" state
+```
+
+---
+
+## Step 3 — Attach the Volume in Maintenance Mode
+
+Longhorn only creates the block device (`/dev/longhorn/<volume-name>`) when the volume is
+attached to a node. Use a `VolumeAttachment` ticket to attach without a pod.
+
+Choose `<node>` = the same node where the backup/merged image is stored (avoids
+copying large files across the network).
+
+```bash
+kubectl apply -f - <<EOF
+apiVersion: longhorn.io/v1beta2
+kind: VolumeAttachment
+metadata:
+  name: <volume-name>
+  namespace: longhorn-system
+spec:
+  attachmentTickets:
+    recovery:
+      generation: 0
+      id: recovery
+      nodeID: <node>
+      parameters:
+        disableFrontend: "false"
+      type: longhorn-api
+  volume: <volume-name>
+EOF
+
+kubectl wait --for=jsonpath='{.status.state}'=attached \
+  volumes.longhorn.io/<volume-name> -n longhorn-system --timeout=120s
+```
+
+---
+
+## Step 4 — Scale Down the Workload
+
+Always stop the workload before touching the data to prevent concurrent writes and filesystem
+corruption.
+
+```bash
+# For a Deployment:
+kubectl scale deployment <name> -n <namespace> --replicas=0
+
+# For a StatefulSet:
+kubectl scale statefulset <name> -n <namespace> --replicas=0
+```
+
+---
+
+## Step 5 — Inject Data Files via Block Device
+
+```bash
+ssh <node> bash <<'SHELL'
+  # Mount the live block device
+  sudo mkdir -p /mnt/recovery-live
+  sudo mount /dev/longhorn/<volume-name> /mnt/recovery-live
+
+  # Mount the reconstructed image (if not already mounted)
+  sudo mkdir -p /mnt/recovery-src
+  sudo mount -o loop /tmp/<volume>-merged.img /mnt/recovery-src
+
+  # Sync: only the application data files, not lost+found
+  sudo rsync -av --exclude='lost+found' /mnt/recovery-src/ /mnt/recovery-live/
+
+  # Verify
+  sudo ls -lah /mnt/recovery-live/
+
+  # Unmount both
+  sudo umount /mnt/recovery-src
+  sudo umount /mnt/recovery-live
+SHELL
+```
+
+---
+
+## Step 6 — Detach the Volume
+
+```bash
+kubectl patch volumeattachments.longhorn.io <volume-name> \
+  -n longhorn-system --type json \
+  -p '[{"op":"remove","path":"/spec/attachmentTickets/recovery"}]'
+
+kubectl wait --for=jsonpath='{.status.state}'=detached \
+  volumes.longhorn.io/<volume-name> -n longhorn-system --timeout=60s
+```
+
+---
+
+## Step 7 — Restore PV and PVC
+
+Clear stuck Terminating PV/PVC finalizers first if they exist:
+```bash
+kubectl patch pv <pv-name> --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null
+kubectl patch pvc <pvc-name> -n <namespace> --type=merge \
+  -p '{"metadata":{"finalizers":null}}' 2>/dev/null
+# Wait a moment for them to delete
+```
+
+Recreate the PV with `Retain` policy and no `claimRef`:
+```bash
+kubectl apply -f - <<EOF
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: <pv-name>
+  annotations:
+    pv.kubernetes.io/provisioned-by: driver.longhorn.io
+spec:
+  accessModes: [ReadWriteOnce]  # match original
+  capacity:
+    storage: <size>  # e.g. 128Mi
+  csi:
+    driver: driver.longhorn.io
+    fsType: ext4
+    volumeHandle: <volume-name>
+    volumeAttributes:
+      dataEngine: v1
+      dataLocality: disabled
+      disableRevisionCounter: "true"
+      numberOfReplicas: "3"
+      staleReplicaTimeout: "30"
+  persistentVolumeReclaimPolicy: Retain
+  storageClassName: longhorn
+  volumeMode: Filesystem
+EOF
+```
+
+Recreate the PVC pinned to this PV:
+```bash
+kubectl apply -f - <<EOF
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: <pvc-name>
+  namespace: <namespace>
+spec:
+  accessModes: [ReadWriteOnce]
+  resources:
+    requests:
+      storage: <size>
+  storageClassName: longhorn
+  volumeMode: Filesystem
+  volumeName: <pv-name>
+EOF
+```
+
+---
+
+## Step 8 — Scale Up and Verify
+
+```bash
+kubectl scale deployment <name> -n <namespace> --replicas=1
+kubectl wait --for=condition=Ready pod -l app=<label> -n <namespace> --timeout=120s
+```
+
+---
+
+## Pitfalls Learned During 2026-04-13 Recovery
+
+| Pitfall | What happened | Prevention |
+|---------|--------------|------------|
+| **Directory swap corrupts data** | Longhorn found old `Dirty: true` volume.meta + empty pi1 replica → rebuilt from empty source | Never swap dirs. Use merge tool + block device injection instead |
+| **Snapshot is zeroed after swap** | Longhorn reconciliation overwrote snapshot images when rebuilding from empty replica | Back up the untouched dir FIRST before any rename |
+| **Multiple dirs per volume on pi3** | Rebuild attempts during the incident created extra dirs | Identify the untouched dir by timestamp AND verify non-zero content with `od` |
+| **`Rebuilding: true` replica → all-zeros merged image** | Phase 0 picked a replica mid-rebuild (1.3 GiB actual data, sparse files look large) — merge tool produced an all-zeros image | Check `volume.meta` and skip any dir with `"Rebuilding": true` before merging |
+| **`du -sb` gives misleading apparent sizes** | Sparse replica files (8 GiB file, 1.3 GiB actual) appeared larger than healthy 11 GiB replicas | Use `du -sk` (actual disk blocks) not `du -sb` (apparent/logical size) to rank replicas |
+| **Dirty journal prevents ro mount** | `mount -o loop,ro` fails with "bad superblock" on an ext4 with unclean shutdown | Use `mount -o loop,ro,noload` to skip journal replay for read-only access |
+| **New volume is unformatted** | `mount /dev/longhorn/<volume>` fails with "wrong fs type" on a freshly created volume | Run `mkfs.ext4 -F` before mounting; guard with `blkid` to skip if already formatted |
+| **rsync rc=23 on power-cut partitions** | Some filesystem blocks were unreadable ("Structure needs cleaning") → rsync exits 23 | Use `rsync --ignore-errors`; rc=23 is a partial transfer, not a total failure |
+| **pod blocks volume re-attach** | Old Error-state pod held a volume attachment claim | Delete old Error pods before scaling up new ones |
+| **`kubectl cp` needs `tar`** | Distroless container had no `tar` binary | Mount block device directly on the node instead |
+| **VolumeAttachment ticket removal** | Deleting a VolumeAttachment object causes Longhorn to immediately recreate it | Patch the `recovery` key out of `spec.attachmentTickets` instead of deleting the object |
+| **Phase 7 wait for `detached` times out** | After removing the recovery ticket, a workload may immediately create its own ticket | Wait for the `recovery` ticket to disappear from `spec.attachmentTickets`, not for full detach |
+| **StatefulSet pods not found by label** | `kubectl get pod -l app=<label>` returns nothing for StatefulSet pods | Wait on `readyReplicas ≄ 1` on the StatefulSet object, not on pod labels |
+| **`set_fact` overridden by `-e @file`** | Ansible extra vars have highest precedence — `set_fact: longhorn_recovery_volumes` was silently ignored | Use a different variable name (`_volumes`) for the resolved list, never reassign the extra var name |
+
+---
+
+## Identifying the Right Replica Directory
+
+When multiple old dirs exist for the same volume on a node, pick the one to use for recovery:
+
+1. **Skip `Rebuilding: true`:** check `volume.meta` first — a dir that was being rebuilt when
+   the incident happened has incomplete data (sparse files are allocated but mostly zeroed):
+   ```bash
+   python3 -c "import json; d=json.load(open('volume.meta')); print('Rebuilding:', d['Rebuilding'])"
+   ```
+   Only consider dirs where `Rebuilding: false`.
+
+2. **Actual size:** `sudo du -sk <dir>` (actual disk usage in KB — not `du -sb` which returns
+   apparent/logical size and is misleading for sparse files). Pick the largest actual size.
+
+3. **Timestamps:** prefer the most recently modified before the incident date.
+
+4. **Snapshot chain:** if Rebuilding is false on multiple dirs, check `volume.meta` for
+   `"Dirty": false` (clean shutdown) vs `"Dirty": true`. Prefer clean if available.
+
+5. **Content check:** verify the snapshot is not all zeros:
+   ```bash
+   sudo od -A x -t x1z -v volume-snap-*.img | grep -v ' 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00' | head -3
+   ```
+   If the output is empty (all zeros), the snapshot was overwritten. Try another node.
+
+**Summary rule:** `Rebuilding: false` → largest `du -sk` → non-zero snapshot content. 
+
+---
+
+## Reference: Key Commands
+
+```bash
+# List all replica dirs for a volume across all nodes
+for n in pi1 pi2 pi3; do echo "==$n=="; ssh $n "sudo ls /mnt/arcodange/longhorn/replicas/ | grep <volume>"; done
+
+# Check Longhorn volume state
+kubectl get volumes.longhorn.io -n longhorn-system
+
+# Check VolumeAttachment tickets
+kubectl get volumeattachments.longhorn.io -n longhorn-system \
+  -o jsonpath='{.spec.attachmentTickets}'
+
+# Check Longhorn block device existence on a node
+ssh <node> "ls /dev/longhorn/<volume>"
+
+# Verify filesystem content without starting the app
+ssh <node> "sudo mount /dev/longhorn/<volume> /mnt/check && sudo ls /mnt/check && sudo umount /mnt/check"
+```
diff --git a/ansible/arcodange/factory/playbooks/backup/k3s_pvc.yml b/ansible/arcodange/factory/playbooks/backup/k3s_pvc.yml
index c347956..f1691bd 100644
--- a/ansible/arcodange/factory/playbooks/backup/k3s_pvc.yml
+++ b/ansible/arcodange/factory/playbooks/backup/k3s_pvc.yml
@@ -24,12 +24,15 @@
 
     - name: define backup command
       set_fact:
-        backup_cmd: |-
-          echo "
-          $(kubectl get -A pv -o yaml)
-          ---
-          $(kubectl get -A pvc -o yaml)
-          "
+        # PVs + PVCs + Longhorn Volume CRDs (critical for fast recovery — without Volume CRDs,
+        # Longhorn cannot re-associate orphaned replica dirs after a reinstall and forces
+        # full block-device injection recovery. 
See docs/adr/20260414-longhorn-pvc-recovery.md) + backup_cmd: >- + kubectl get -A pv,pvc -o yaml + && echo '---' + && kubectl get -A volumes.longhorn.io -o yaml + && echo '---' + && kubectl get -A settings.longhorn.io -o yaml - name: test backup_cmd ansible.builtin.shell: | @@ -65,19 +68,34 @@ #!/bin/bash set -e - BACKUP_DIR="{{ backup_dir }}" + PRIMARY_BACKUP_DIR="{{ backup_dir }}" + FALLBACK_BACKUP_DIR="/home/pi/arcodange/backups/k3s_pvc" + + # Check if fallback directory exists and has backups + if [ -d "$FALLBACK_BACKUP_DIR" ] && ls "$FALLBACK_BACKUP_DIR"/*.volumes 1>/dev/null 2>&1; then + BACKUP_DIR="$FALLBACK_BACKUP_DIR" + echo "Using fallback backup directory: $BACKUP_DIR" + elif [ -d "$PRIMARY_BACKUP_DIR" ] && ls "$PRIMARY_BACKUP_DIR"/*.volumes 1>/dev/null 2>&1; then + BACKUP_DIR="$PRIMARY_BACKUP_DIR" + else + echo "No backup directory found" + exit 1 + fi if [ -z "$1" ]; then FILE=$(ls -1t "$BACKUP_DIR"/backup_*.volumes | head -n 1) - echo "Aucune date fournie, restauration du dernier dump : $FILE" + echo "No date provided, restoring latest dump: $FILE" else FILE="$BACKUP_DIR/backup_$1.volumes" if [ ! -f "$FILE" ]; then - echo "Fichier $FILE introuvable" + echo "File $FILE not found" exit 1 fi fi kubectl apply -f "$FILE" - echo "Restauration des volumes k3s terminĆ©e." + echo "K3S volumes restoration complete." + echo "NOTE: file includes PVs, PVCs, and Longhorn Volume CRDs." 
+ echo "If Longhorn replica dirs are still orphaned after this restore," + echo "fall back to: ansible-playbook playbooks/recover/longhorn_data.yml" diff --git a/ansible/arcodange/factory/playbooks/recover/longhorn.yml b/ansible/arcodange/factory/playbooks/recover/longhorn.yml new file mode 100644 index 0000000..c5b81d5 --- /dev/null +++ b/ansible/arcodange/factory/playbooks/recover/longhorn.yml @@ -0,0 +1,536 @@ +--- +- name: Recover Longhorn from Power Cut - CSI Driver Registration Loss + hosts: raspberries:&local + gather_facts: yes + become: yes + + vars: + # Backup locations + primary_backup_dir: "/mnt/backups/k3s_pvc" + fallback_backup_dir: "/home/pi/arcodange/backups/k3s_pvc" + scripts_dir: "/opt/k3s_volumes" + + # Longhorn configuration + longhorn_manifest_path: "/var/lib/rancher/k3s/server/manifests/longhorn-install.yaml" + longhorn_namespace: "longhorn-system" + longhorn_chart_name: "longhorn-install" + longhorn_chart_namespace: "kube-system" + + # Data paths (DO NOT MODIFY - points to actual volume data) + longhorn_data_path: "/mnt/arcodange/longhorn" + + tasks: + # ======================================================================== + # PHASE 0: Pre-flight Checks + # ======================================================================== + + - name: Verify data directory exists on control plane + ansible.builtin.stat: + path: "{{ longhorn_data_path }}" + register: data_dir + when: inventory_hostname == 'pi1' + run_once: true + + - name: FAIL if data directory missing + ansible.builtin.fail: + msg: "CRITICAL: Longhorn data directory {{ longhorn_data_path }} does not exist. Aborting recovery." 
+ when: inventory_hostname == 'pi1' and not data_dir.stat.exists + run_once: true + + - name: Check for fallback backups on pi1 + ansible.builtin.shell: ls {{ fallback_backup_dir }}/backup_*.volumes 2>/dev/null + register: fallback_backup_check + changed_when: false + when: inventory_hostname == 'pi1' + run_once: true + ignore_errors: yes + + - name: Check for primary backups on pi1 + ansible.builtin.shell: ls {{ primary_backup_dir }}/backup_*.volumes 2>/dev/null + register: primary_backup_check + changed_when: false + when: inventory_hostname == 'pi1' + run_once: true + ignore_errors: yes + + - name: Set backup fact + ansible.builtin.set_fact: + has_backups: "{{ (fallback_backup_check.rc == 0 and fallback_backup_check.stdout | trim != '') or (primary_backup_check.rc == 0 and primary_backup_check.stdout | trim != '') }}" + when: inventory_hostname == 'pi1' + run_once: true + + - name: FAIL if no backups found + ansible.builtin.fail: + msg: "No backup files found in {{ primary_backup_dir }} or {{ fallback_backup_dir }}. Cannot proceed." 
+ when: inventory_hostname == 'pi1' and not has_backups | bool + run_once: true + + # ======================================================================== + # PHASE 1: Diagnosis - Check Current State + # ======================================================================== + + - name: Gather Longhorn namespace status + block: + - name: Check if longhorn-system namespace exists + kubernetes.core.k8s_info: + kind: Namespace + name: "{{ longhorn_namespace }}" + register: longhorn_ns + ignore_errors: yes + run_once: true + delegate_to: localhost + + - name: Check CSI driver registration + kubernetes.core.k8s_info: + kind: CSIDriver + name: driver.longhorn.io + register: csi_driver + ignore_errors: yes + run_once: true + delegate_to: localhost + + - name: Check Longhorn manager pods + kubernetes.core.k8s_info: + kind: Pod + namespace: "{{ longhorn_namespace }}" + label_selectors: + - app=longhorn-manager + register: managers + ignore_errors: yes + run_once: true + delegate_to: localhost + + - name: Set recovery_phase fact + ansible.builtin.set_fact: + recovery_phase: "none" + run_once: true + delegate_to: localhost + + - name: Determine recovery phase needed + ansible.builtin.set_fact: + recovery_phase: >- + {% if csi_driver.failed %} + soft + {% elif managers.failed or managers.resources | default([]) | selectattr('status.phase', 'defined') | selectattr('status.phase', 'ne', 'Running') | list | length > 0 %} + hard + {% elif longhorn_ns.failed %} + none + {% else %} + none + {% endif %} + run_once: true + delegate_to: localhost + + - name: Display recovery diagnosis + ansible.builtin.debug: + msg: "Diagnosis: recovery_phase={{ recovery_phase | default('none') }}. 
CSI Driver exists: {{ not csi_driver.failed | bool }}, Managers healthy: {{ managers.failed | ternary('unknown', managers.resources | default([]) | selectattr('status.phase', 'defined') | selectattr('status.phase', 'eq', 'Running') | list | length >= 3) | bool }}" + run_once: true + delegate_to: localhost + + when: inventory_hostname == 'pi1' + run_once: true + + # ======================================================================== + # PHASE 2: Soft Recovery - Touch Manifest + # ======================================================================== + + - name: Execute soft recovery - touch Longhorn manifest + block: + - name: Touch longhorn-install.yaml manifest + ansible.builtin.file: + path: "{{ longhorn_manifest_path }}" + state: touch + register: manifest_touch + when: inventory_hostname == 'pi1' + + - name: Wait for k3s to detect manifest change + ansible.builtin.pause: + minutes: 1 + when: manifest_touch is changed + + - name: Check if Longhorn pods are recreating + kubernetes.core.k8s_info: + kind: Pod + namespace: "{{ longhorn_namespace }}" + register: longhorn_pods + ignore_errors: yes + run_once: true + delegate_to: localhost + + - name: Verify soft recovery success + ansible.builtin.set_fact: + soft_recovery_success: >- + {{ (longhorn_pods.resources | default([]) | selectattr('metadata.creationTimestamp', 'defined') | list | length) >= 10 }} + run_once: true + delegate_to: localhost + + when: recovery_phase == 'soft' and inventory_hostname == 'pi1' + run_once: true + + # ======================================================================== + # PHASE 3: Hard Recovery - Delete Driver-Deployer + # ======================================================================== + + - name: Execute hard recovery - delete driver-deployer pods + block: + - name: Get driver-deployer pods + kubernetes.core.k8s_info: + kind: Pod + namespace: "{{ longhorn_namespace }}" + label_selectors: + - app=longhorn-driver-deployer + register: driver_deployer_pods + 
ignore_errors: yes + run_once: true + delegate_to: localhost + + - name: Delete driver-deployer pods + kubernetes.core.k8s: + state: absent + kind: Pod + namespace: "{{ longhorn_namespace }}" + name: "{{ item.metadata.name }}" + force: yes + grace_period: 0 + loop: "{{ driver_deployer_pods.resources | default([]) }}" + when: driver_deployer_pods.resources | default([]) | length > 0 + run_once: true + delegate_to: localhost + + - name: Wait for HelmChart to recreate driver-deployer + ansible.builtin.pause: + minutes: 2 + + - name: Check driver-deployer status + kubernetes.core.k8s_info: + kind: Pod + namespace: "{{ longhorn_namespace }}" + label_selectors: + - app=longhorn-driver-deployer + register: new_driver_deployer + ignore_errors: yes + run_once: true + delegate_to: localhost + + when: (recovery_phase == 'hard' or (recovery_phase == 'soft' and not soft_recovery_success | default(false))) and inventory_hostname == 'pi1' + run_once: true + + # ======================================================================== + # PHASE 4: Nuclear Recovery - Full Reinstall + # ======================================================================== + + - name: Execute nuclear recovery - full Longhorn reinstall + block: + # Step 1: Delete HelmChart + - name: Delete Longhorn HelmChart + kubernetes.core.k8s: + state: absent + kind: HelmChart + namespace: "{{ longhorn_chart_namespace }}" + name: "{{ longhorn_chart_name }}" + force: yes + grace_period: 0 + register: helmchart_deleted + ignore_errors: yes + run_once: true + delegate_to: localhost + + - name: Wait for HelmChart to be fully removed + ansible.builtin.pause: + seconds: 30 + when: helmchart_deleted is changed + run_once: true + + # Step 2: Remove Longhorn manifest from filesystem + - name: Remove Longhorn manifest file + ansible.builtin.file: + path: "{{ longhorn_manifest_path }}" + state: absent + when: inventory_hostname == 'pi1' + register: manifest_removed + + # Step 3: Remove finalizers from all Longhorn 
resources + - name: Get list of all Longhorn CRDs + kubernetes.core.k8s_info: + kind: CustomResourceDefinition + label_selectors: + - app=longhorn + register: longhorn_crds + ignore_errors: yes + run_once: true + delegate_to: localhost + + - name: Get all Longhorn CR instances + kubernetes.core.k8s_info: + kind: "{{ item.spec.names.kind }}" + namespace: "{{ longhorn_namespace }}" + api_version: "{{ item.spec.group ~ '/' ~ item.spec.versions[0].name }}" + register: cr_instances + ignore_errors: yes + loop: "{{ longhorn_crds.resources | default([]) }}" + run_once: true + delegate_to: localhost + + - name: Remove finalizers from all Longhorn CR instances + kubernetes.core.k8s_json_patch: + kind: "{{ item.0.spec.names.kind }}" + namespace: "{{ longhorn_namespace }}" + name: "{{ item.1.metadata.name }}" + api_version: "{{ item.0.spec.group ~ '/' ~ item.0.spec.versions[0].name }}" + patch: + - op: replace + path: /metadata/finalizers + value: [] + loop: >- + {% set results = [] %} + {% for crd in longhorn_crds.resources | default([]) %} + {% for instance in hostvars['localhost']['cr_instances'].results | default([]) %} + {% if instance.crd == crd %} + {% set results = results.append([crd, instance.resources[0] if instance.resources else {}]) %} + {% endif %} + {% endfor %} + {% endfor %} + {{ results }} + when: cr_instances.results | default([]) | length > 0 + run_once: true + delegate_to: localhost + ignore_errors: yes + + # Step 4: Remove finalizers from PVCs + - name: Get all PVCs with longhorn storage class + kubernetes.core.k8s_info: + kind: PersistentVolumeClaim + register: all_pvcs + ignore_errors: yes + run_once: true + delegate_to: localhost + + - name: Remove finalizers from PVCs + kubernetes.core.k8s_json_patch: + kind: PersistentVolumeClaim + namespace: "{{ item.metadata.namespace }}" + name: "{{ item.metadata.name }}" + patch: + - op: replace + path: /metadata/finalizers + value: [] + loop: "{{ all_pvcs.resources | default([]) | 
selectattr('spec.storageClassName', 'defined') | selectattr('spec.storageClassName', 'match', 'longhorn.*') | list }}" + run_once: true + delegate_to: localhost + ignore_errors: yes + + # Step 5: Remove namespace finalizers + - name: Remove finalizers from longhorn-system namespace + kubernetes.core.k8s_json_patch: + kind: Namespace + name: "{{ longhorn_namespace }}" + patch: + - op: replace + path: /spec/finalizers + value: [] + run_once: true + delegate_to: localhost + ignore_errors: yes + + - name: Delete longhorn-system namespace + kubernetes.core.k8s: + state: absent + kind: Namespace + name: "{{ longhorn_namespace }}" + force: yes + grace_period: 0 + run_once: true + delegate_to: localhost + ignore_errors: yes + + - name: Wait for namespace deletion + ansible.builtin.pause: + seconds: 15 + run_once: true + + # Step 6: Reinstall Longhorn via manifest + - name: Deploy Longhorn HelmChart manifest + ansible.builtin.copy: + dest: "{{ longhorn_manifest_path }}" + content: | + apiVersion: helm.cattle.io/v1 + kind: HelmChart + metadata: + annotations: + helmcharts.cattle.io/managed-by: helm-controller + finalizers: + - wrangler.cattle.io/on-helm-chart-remove + name: longhorn-install + namespace: kube-system + spec: + version: v1.9.1 + chart: longhorn + repo: https://charts.longhorn.io + failurePolicy: abort + targetNamespace: longhorn-system + createNamespace: true + valuesContent: |- + defaultSettings: + defaultDataPath: {{ longhorn_data_path }} + when: inventory_hostname == 'pi1' + register: manifest_deployed + + - name: Trigger k3s reconcile by touching manifest + ansible.builtin.file: + path: "{{ longhorn_manifest_path }}" + state: touch + when: manifest_deployed is changed and inventory_hostname == 'pi1' + + - name: Wait for Longhorn pods to be created + ansible.builtin.pause: + minutes: 3 + when: manifest_deployed is changed + run_once: true + + when: >- + (recovery_phase == 'hard' and not new_driver_deployer.resources | default([]) | selectattr('status.phase', 
'eq', 'Running') | list | length > 0) + or (recovery_phase == 'soft' and not soft_recovery_success | default(false) and not new_driver_deployer.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length > 0) + or recovery_phase == 'none' + run_once: true + + # ======================================================================== + # PHASE 5: Restore from Backup + # ======================================================================== + + - name: Execute restore from backup + block: + - name: Determine backup directory to use + ansible.builtin.set_fact: + backup_dir_to_use: >- + {% if fallback_backup_dir and lookup('fileglob', fallback_backup_dir ~ '/backup_*.volumes') | length > 0 %} + {{ fallback_backup_dir }} + {% elif primary_backup_dir and lookup('fileglob', primary_backup_dir ~ '/backup_*.volumes') | length > 0 %} + {{ primary_backup_dir }} + {% else %} + "" + {% endif %} + run_once: true + delegate_to: localhost + + - name: FAIL if no backup directory found + ansible.builtin.fail: + msg: "No valid backup directory found with backup_*.volumes files" + when: backup_dir_to_use == "" + run_once: true + + - name: Find latest backup file + ansible.builtin.set_fact: + latest_backup: >- + {% set files = lookup('fileglob', backup_dir_to_use ~ '/backup_*.volumes', wantlist=True) | sort(attribute='stat.mtime', reverse=True) %} + {% if files | length > 0 %} + {{ files[0].path }} + {% endif %} + run_once: true + delegate_to: localhost + + - name: FAIL if no backup files found + ansible.builtin.fail: + msg: "No backup files found in {{ backup_dir_to_use }}" + when: latest_backup | default('') == '' + run_once: true + + - name: Wait for Longhorn managers to be ready + kubernetes.core.k8s_info: + kind: Pod + namespace: "{{ longhorn_namespace }}" + label_selectors: + - app=longhorn-manager + register: managers_status + until: >- + {{ (managers_status.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length) >= 1 }} 
+ retries: 30 + delay: 10 + run_once: true + delegate_to: localhost + + - name: Apply PV/PVC backup + kubernetes.core.k8s: + state: present + src: "{{ latest_backup }}" + run_once: true + delegate_to: localhost + + - name: Find Longhorn metadata backup + ansible.builtin.set_fact: + longhorn_backup: >- + {%- set lh_files = lookup('fileglob', backup_dir_to_use ~ '/longhorn_metadata_*.yaml', wantlist=True) | sort(reverse=True) -%} + {%- if lh_files | length > 0 -%} + {{ lh_files[0] }} + {%- endif %} + run_once: true + delegate_to: localhost + + - name: Apply Longhorn metadata backup (if exists) + kubernetes.core.k8s: + state: present + src: "{{ longhorn_backup | default(omit) }}" + namespace: "{{ longhorn_namespace }}" + when: longhorn_backup | default('') != '' + run_once: true + delegate_to: localhost + + when: inventory_hostname == 'pi1' + run_once: true + + # ======================================================================== + # PHASE 6: Post-Recovery Verification + # ======================================================================== + + - name: Verify recovery success + block: + - name: Check CSI driver registration + kubernetes.core.k8s_info: + kind: CSIDriver + name: driver.longhorn.io + register: csi_final + until: csi_final.resources | length > 0 + retries: 10 + delay: 10 + run_once: true + delegate_to: localhost + + - name: Check Longhorn manager health + kubernetes.core.k8s_info: + kind: Pod + namespace: "{{ longhorn_namespace }}" + label_selectors: + - app=longhorn-manager + register: managers_final + until: >- + {{ (managers_final.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length) >= 3 }} + retries: 15 + delay: 10 + run_once: true + delegate_to: localhost + + - name: Check CSI socket exists (on pi1) + ansible.builtin.stat: + path: /var/lib/kubelet/plugins/driver.longhorn.io/csi.sock + register: csi_socket + when: inventory_hostname == 'pi1' + + - name: Verify volume data is still 
present + ansible.builtin.stat: + path: "{{ longhorn_data_path }}/replicas" + register: replicas_dir + when: inventory_hostname == 'pi1' + + - name: Display recovery summary + ansible.builtin.debug: + msg: | + ===== Longhorn Recovery Summary ===== + CSI Driver Registered: {{ (csi_final.resources | default([]) | length > 0) | ternary('āœ“', 'āœ—') }} + Managers Running: {{ (managers_final.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length) }}/3 + CSI Socket Exists: {{ csi_socket.stat.exists | default(false) | bool | ternary('āœ“', 'āœ—') }} + Volume Data Present: {{ replicas_dir.stat.exists | default(false) | bool | ternary('āœ“', 'āœ—') }} + Backup Used: {{ latest_backup | default('none') }} + ====================================== + run_once: true + + when: inventory_hostname == 'pi1' + run_once: true diff --git a/ansible/arcodange/factory/playbooks/recover/longhorn_data.yml b/ansible/arcodange/factory/playbooks/recover/longhorn_data.yml new file mode 100644 index 0000000..80cc348 --- /dev/null +++ b/ansible/arcodange/factory/playbooks/recover/longhorn_data.yml @@ -0,0 +1,914 @@ +--- +# Longhorn Block-Device Data Recovery Playbook +# +# PURPOSE: +# Recover application data directly from raw Longhorn replica files when Volume CRDs +# are missing (e.g. after a nuclear cleanup + reinstall). Bypasses k8s objects entirely +# and works at the block-device level. 
+# +# WHEN TO USE: +# - Longhorn has been fully reinstalled (Volume CRDs are gone) +# - Application PVCs are stuck Terminating / Lost +# - The raw replica .img files still exist on disk +# → See docs/runbooks/longhorn-block-device-recovery.md for the manual equivalent +# +# WHEN NOT TO USE: +# - Volume CRDs still exist → use playbooks/recover/longhorn.yml instead +# - All replica dirs were zeroed by Longhorn reconciliation (data is unrecoverable) +# +# USAGE: +# ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \ +# -e @vars/recovery_volumes.yml +# +# VARS FILE FORMAT (vars/recovery_volumes.yml): +# longhorn_recovery_volumes: +# - pv_name: pvc-abc123 # Longhorn volume name (== PV name) +# pvc_name: myapp-data # PVC name in the namespace +# namespace: myapp # namespace where the PVC lives +# size_bytes: "134217728" # volume size in bytes (string) +# size_human: 128Mi # human-readable, used in PVC spec +# access_mode: ReadWriteOnce # ReadWriteOnce or ReadWriteMany +# workload_kind: Deployment # Deployment or StatefulSet +# workload_name: myapp # name of the workload to scale down/up +# source_node: pi3 # [OPTIONAL] node with untouched replica dir +# source_dir: pvc-abc123-998f49ff # [OPTIONAL] exact replica dir name +# verify_cmd: "" # optional: command to run inside pod to verify data after recovery +# +# source_node and source_dir are auto-discovered (largest dir >16K across all nodes) +# when not specified. Override manually only to force a specific replica dir. +# +# REQUIREMENTS: +# - python3 on all cluster nodes +# - kubectl configured on the Ansible controller (localhost) +# - longhorn-system namespace running and healthy before this playbook starts +# - kubernetes.core collection: ansible-galaxy collection install kubernetes.core +# +# TESTED SCENARIO: +# 2026-04-13 power cut — nuclear Longhorn reinstall — url-shortener SQLite recovery +# Proven working as of 2026-04-14. 
+ +- name: Longhorn Block-Device Data Recovery + hosts: localhost + gather_facts: no + + vars: + longhorn_data_path: /mnt/arcodange/longhorn + longhorn_namespace: longhorn-system + longhorn_nodes: [pi1, pi2, pi3] + merge_tool_local: "{{ playbook_dir }}/../../docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py" + merge_tool_remote: /home/pi/merge-longhorn-layers.py + backup_base: /home/pi/arcodange/backups/longhorn-recovery + merged_base: /tmp/longhorn-recovery-merged + recovery_mount: /mnt/recovery-src + live_mount: /mnt/recovery-live + longhorn_recovery_volumes: [] # override with -e @vars/recovery_volumes.yml + + tasks: + + # ========================================================================= + # PRE-FLIGHT + # ========================================================================= + + - name: "Pre-flight | Fail fast if no volumes defined" + ansible.builtin.fail: + msg: > + No recovery volumes defined. Pass -e @vars/recovery_volumes.yml with a + longhorn_recovery_volumes list. See playbook header for format. + when: longhorn_recovery_volumes | length == 0 + + - name: "Pre-flight | Verify merge tool exists locally" + ansible.builtin.stat: + path: "{{ merge_tool_local }}" + register: merge_tool_stat + delegate_to: localhost + + - name: "Pre-flight | Fail if merge tool missing" + ansible.builtin.fail: + msg: "merge-longhorn-layers.py not found at {{ merge_tool_local }}" + when: not merge_tool_stat.stat.exists + + - name: "Pre-flight | Check Longhorn is healthy" + kubernetes.core.k8s_info: + kind: Pod + namespace: "{{ longhorn_namespace }}" + label_selectors: + - app=longhorn-manager + register: lh_managers + delegate_to: localhost + + - name: "Pre-flight | Fail if Longhorn managers are not running" + ansible.builtin.fail: + msg: > + Longhorn managers not running (found {{ lh_managers.resources | default([]) | + selectattr('status.phase', 'eq', 'Running') | list | length }} Running pods). 
+ Ensure Longhorn is healthy before attempting data recovery. + when: > + (lh_managers.resources | default([]) | + selectattr('status.phase', 'eq', 'Running') | list | length) < 1 + + - name: "Pre-flight | Summary" + ansible.builtin.debug: + msg: > + Longhorn healthy ({{ lh_managers.resources | + selectattr('status.phase', 'eq', 'Running') | list | length }} managers running). + Recovering {{ longhorn_recovery_volumes | length }} volume(s): + {{ longhorn_recovery_volumes | map(attribute='pv_name') | list | join(', ') }} + + # ========================================================================= + # PHASE 0 — AUTO-DISCOVER BEST REPLICA DIR (when source_node/source_dir absent) + # ========================================================================= + + - name: "Phase 0 | Scan replica dirs on all nodes" + ansible.builtin.shell: | + result="" + for dir in {{ longhorn_data_path }}/replicas/{{ item.1.pv_name }}-*; do + [ -d "$dir" ] || continue + # Skip replicas that were being rebuilt — their data is incomplete + meta="$dir/volume.meta" + if [ -f "$meta" ]; then + rebuilding=$(python3 -c "import json; d=json.load(open('$meta')); print(d.get('Rebuilding', False))" 2>/dev/null) + [ "$rebuilding" = "True" ] && continue + fi + # Use actual disk usage (not apparent/sparse size) to rank replicas + size=$(du -sk "$dir" 2>/dev/null | cut -f1) + name=$(basename "$dir") + result="$result\n$size $name" + done + printf '%b' "$result" | grep -v '^$' || true + delegate_to: "{{ item.0 }}" + become: yes + loop: "{{ longhorn_nodes | product(longhorn_recovery_volumes) | list }}" + loop_control: + label: "{{ item.0 }}: {{ item.1.pv_name }}" + register: dir_scan_raw + changed_when: false + when: item.1.source_node | default('') == '' or item.1.source_dir | default('') == '' + + - name: "Phase 0 | Pick best source (largest dir with data, >16K)" + ansible.builtin.set_fact: + _discovered_sources: "{{ _build | from_json }}" + vars: + _build: >- + {% set ns = namespace(result={}) %} + 
{% for res in dir_scan_raw.results | default([]) %} + {% if not res.skipped | default(false) and res.stdout | default('') != '' %} + {% set node = res.item.0 %} + {% set vol = res.item.1.pv_name %} + {% for line in res.stdout_lines %} + {% set parts = line.split() %} + {% if parts | length == 2 %} + {% set size = parts[0] | int %} + {% set dir = parts[1] %} + {% if size > 16384 and (vol not in ns.result or size > ns.result[vol].size) %} + {# size is in KB (from du -sk); 16384 KB = 16 MiB minimum real replica #} + {% set _ = ns.result.update({vol: {'node': node, 'dir': dir, 'size': size}}) %} + {% endif %} + {% endif %} + {% endfor %} + {% endif %} + {% endfor %} + {{ ns.result | to_json }} + + - name: "Phase 0 | Show discovered sources" + ansible.builtin.debug: + msg: >- + {% for vol in longhorn_recovery_volumes %} + {{ vol.pv_name }}: + {% if vol.source_node | default('') != '' %} + source: MANUAL → {{ vol.source_node }}/{{ vol.source_dir }} + {% elif vol.pv_name in _discovered_sources %} + source: AUTO → {{ _discovered_sources[vol.pv_name].node }}/{{ _discovered_sources[vol.pv_name].dir }} + ({{ (_discovered_sources[vol.pv_name].size / 1024) | round(0) | int }} MiB) + {% else %} + source: NOT FOUND — no dir >16 MiB on any node for this volume + {% endif %} + {% endfor %} + + - name: "Phase 0 | Fail if source not found for any volume" + ansible.builtin.fail: + msg: > + No replica dir with data found for {{ item.pv_name }} on any node + ({{ longhorn_nodes | join(', ') }}). Check that the replica files survived. 
+ loop: "{{ longhorn_recovery_volumes }}" + loop_control: + label: "{{ item.pv_name }}" + when: > + item.source_node | default('') == '' and + item.source_dir | default('') == '' and + item.pv_name not in _discovered_sources + + - name: "Phase 0 | Initialize merged volume list" + ansible.builtin.set_fact: + _merged_volumes: [] + + - name: "Phase 0 | Append each volume with resolved source" + ansible.builtin.set_fact: + _merged_volumes: "{{ _merged_volumes + [item | combine(_source)] }}" + vars: + _manual: "{{ item.source_node | default('') != '' and item.source_dir | default('') != '' }}" + _source: "{{ _manual | bool | ternary( + {'source_node': item.source_node, 'source_dir': item.source_dir}, + {'source_node': _discovered_sources[item.pv_name].node, + 'source_dir': _discovered_sources[item.pv_name].dir}) }}" + loop: "{{ longhorn_recovery_volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + - name: "Phase 0 | Apply resolved volume list" + ansible.builtin.set_fact: + _volumes: "{{ _merged_volumes }}" + + # ========================================================================= + # PHASE 1 — UPLOAD MERGE TOOL AND BACK UP REPLICA DIRS + # ========================================================================= + + - name: "Phase 1 | Upload merge tool to source nodes" + ansible.builtin.command: > + scp -o StrictHostKeyChecking=no + {{ merge_tool_local }} + pi@{{ item.source_node }}.home:{{ merge_tool_remote }} + delegate_to: localhost + become: no + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }} → {{ item.source_node }}" + changed_when: true + + - name: "Phase 1 | Create backup directory on source node" + ansible.builtin.file: + path: "{{ backup_base }}/{{ item.pvc_name }}" + state: directory + mode: "0755" + delegate_to: "{{ item.source_node }}" + become: yes + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pvc_name }}" + + - name: "Phase 1 | Check if backup already exists (skip if re-running)" + ansible.builtin.stat: + 
path: "{{ backup_base }}/{{ item.pvc_name }}/{{ item.source_dir }}/volume.meta" + register: backup_exists + delegate_to: "{{ item.source_node }}" + become: yes + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pvc_name }}" + + - name: "Phase 1 | Back up untouched replica dir (safe copy before any operation)" + ansible.builtin.shell: > + cp -a {{ longhorn_data_path }}/replicas/{{ item.item.source_dir }} + {{ backup_base }}/{{ item.item.pvc_name }}/ + delegate_to: "{{ item.item.source_node }}" + become: yes + loop: "{{ backup_exists.results }}" + loop_control: + label: "{{ item.item.pvc_name }}" + when: not item.stat.exists + changed_when: true + + - name: "Phase 1 | Verify backup contains volume.meta" + ansible.builtin.stat: + path: "{{ backup_base }}/{{ item.pvc_name }}/{{ item.source_dir }}/volume.meta" + register: backup_meta + delegate_to: "{{ item.source_node }}" + become: yes + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pvc_name }}" + + - name: "Phase 1 | Fail if backup is incomplete" + ansible.builtin.fail: + msg: > + Backup for {{ item.item.pvc_name }} is missing volume.meta — the source dir + {{ item.item.source_dir }} may not exist or backup copy failed. 
+ loop: "{{ backup_meta.results }}" + loop_control: + label: "{{ item.item.pvc_name }}" + when: not item.stat.exists + + # ========================================================================= + # PHASE 2 — RECONSTRUCT FILESYSTEMS FROM REPLICA LAYERS + # ========================================================================= + + - name: "Phase 2 | Create merged output directory" + ansible.builtin.file: + path: "{{ merged_base }}" + state: directory + mode: "0755" + delegate_to: "{{ item.source_node }}" + become: yes + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pvc_name }}" + + - name: "Phase 2 | Check if merged image already exists" + ansible.builtin.stat: + path: "{{ merged_base }}/{{ item.pvc_name }}.img" + register: merged_exists + delegate_to: "{{ item.source_node }}" + become: yes + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pvc_name }}" + + - name: "Phase 2 | Merge snapshot + head layers into single image" + ansible.builtin.command: > + python3 {{ merge_tool_remote }} + {{ backup_base }}/{{ item.item.pvc_name }}/{{ item.item.source_dir }} + {{ merged_base }}/{{ item.item.pvc_name }}.img + delegate_to: "{{ item.item.source_node }}" + become: yes + loop: "{{ merged_exists.results }}" + loop_control: + label: "{{ item.item.pvc_name }}" + when: not item.stat.exists + changed_when: true + register: merge_output + + - name: "Phase 2 | Show merge output" + ansible.builtin.debug: + msg: "{{ item.stdout_lines | default([]) }}" + loop: "{{ merge_output.results | default([]) }}" + loop_control: + label: "{{ item.item.item.pvc_name | default('') }}" + when: item.stdout_lines is defined + + - name: "Phase 2 | Test mount merged image to verify filesystem" + ansible.builtin.shell: | + mkdir -p {{ recovery_mount }}-{{ item.pvc_name }} + mount -o loop,ro,noload {{ merged_base }}/{{ item.pvc_name }}.img {{ recovery_mount }}-{{ item.pvc_name }} + ls {{ recovery_mount }}-{{ item.pvc_name }}/ + umount {{ recovery_mount }}-{{ item.pvc_name }} + 
delegate_to: "{{ item.source_node }}" + become: yes + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pvc_name }}" + register: mount_test + changed_when: false + + - name: "Phase 2 | Show filesystem contents" + ansible.builtin.debug: + msg: "{{ item.item.pvc_name }}: {{ item.stdout_lines }}" + loop: "{{ mount_test.results }}" + loop_control: + label: "{{ item.item.pvc_name }}" + + # ========================================================================= + # PHASE 3 — CREATE LONGHORN VOLUME CRDs + # ========================================================================= + + # Scale down StatefulSets BEFORE removing PVC finalizers. + # StatefulSet controllers auto-recreate PVCs as soon as they are deleted; if we + # remove finalizers while the StatefulSet is still running, the controller + # immediately provisions a new empty PVC (bound to a fresh volume), making the + # PVC spec immutable by the time Phase 8 tries to pin it to our recovered PV. + # Deployments are less urgent here but scaled early for consistency. 
+ + - name: "Phase 3 | Pre-scale down Deployments (before PVC finalizer removal)" + kubernetes.core.k8s_scale: + kind: Deployment + name: "{{ item.workload_name }}" + namespace: "{{ item.namespace }}" + replicas: 0 + wait: yes + wait_timeout: 60 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.workload_name }}" + when: item.workload_kind == 'Deployment' and item.workload_name != '' + ignore_errors: yes + + - name: "Phase 3 | Pre-scale down StatefulSets (before PVC finalizer removal)" + kubernetes.core.k8s_scale: + kind: StatefulSet + name: "{{ item.workload_name }}" + namespace: "{{ item.namespace }}" + replicas: 0 + wait: yes + wait_timeout: 60 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.workload_name }}" + when: item.workload_kind == 'StatefulSet' and item.workload_name != '' + ignore_errors: yes + + # Clear any stuck Terminating PVs/PVCs BEFORE creating Volume CRDs. + # If old Terminating PVCs still exist when we create the Volume CRD, Longhorn + # associates them and deletes the Volume CRD when the PVC finishes terminating. 
+ + - name: "Phase 3 | Check PVC state before touching finalizers" + ansible.builtin.shell: > + kubectl get pvc {{ item.pvc_name }} -n {{ item.namespace }} + -o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null || true + register: pvc_deletion_ts + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.pvc_name }}" + changed_when: false + + - name: "Phase 3 | Remove finalizers from stuck PV (if Terminating)" + ansible.builtin.shell: > + kubectl patch pv {{ item.pv_name }} --type=merge + -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + changed_when: false + + - name: "Phase 3 | Remove finalizers from stuck PVC (if Terminating)" + ansible.builtin.shell: > + kubectl patch pvc {{ item.pvc_name }} -n {{ item.namespace }} + --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true + delegate_to: localhost + loop: "{{ pvc_deletion_ts.results }}" + loop_control: + label: "{{ item.item.namespace }}/{{ item.item.pvc_name }}" + when: item.stdout != '' + changed_when: false + + - name: "Phase 3 | Wait for stuck PVCs to fully delete before creating Volume CRDs" + kubernetes.core.k8s_info: + kind: PersistentVolumeClaim + name: "{{ item.item.pvc_name }}" + namespace: "{{ item.item.namespace }}" + register: pvc_pre_check + until: pvc_pre_check.resources | default([]) | length == 0 + retries: 12 + delay: 5 + delegate_to: localhost + loop: "{{ pvc_deletion_ts.results }}" + loop_control: + label: "{{ item.item.namespace }}/{{ item.item.pvc_name }}" + when: item.stdout != '' + + - name: "Phase 3 | Check if Longhorn Volume CRD already exists" + kubernetes.core.k8s_info: + kind: Volume + api_version: longhorn.io/v1beta2 + namespace: "{{ longhorn_namespace }}" + name: "{{ item.pv_name }}" + register: volume_crd_check + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + - 
name: "Phase 3 | Create Longhorn Volume CRD" + kubernetes.core.k8s: + state: present + definition: + apiVersion: longhorn.io/v1beta2 + kind: Volume + metadata: + name: "{{ item.item.pv_name }}" + namespace: "{{ longhorn_namespace }}" + spec: + accessMode: "{{ item.item.access_mode | lower | replace('readwriteonce', 'rwo') | replace('readwritemany', 'rwx') }}" + dataEngine: v1 + frontend: blockdev + numberOfReplicas: 3 + size: "{{ item.item.size_bytes }}" + delegate_to: localhost + loop: "{{ volume_crd_check.results }}" + loop_control: + label: "{{ item.item.pv_name }}" + when: item.resources | default([]) | length == 0 + + - name: "Phase 3 | Wait for Longhorn replicas to appear (stopped state)" + kubernetes.core.k8s_info: + kind: Replica + api_version: longhorn.io/v1beta2 + namespace: "{{ longhorn_namespace }}" + label_selectors: + - "longhornvolume={{ item.pv_name }}" + register: replicas_check + until: replicas_check.resources | default([]) | length >= 1 + retries: 24 + delay: 5 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + - name: "Phase 3 | Wait for Volume status to be populated (webhook cache)" + kubernetes.core.k8s_info: + kind: Volume + api_version: longhorn.io/v1beta2 + namespace: "{{ longhorn_namespace }}" + name: "{{ item.pv_name }}" + register: vol_ready + until: > + (vol_ready.resources | default([]) | first | default({}) ).status.state | default('') != '' + retries: 24 + delay: 5 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + # ========================================================================= + # PHASE 4 — SCALE DOWN WORKLOADS + # ========================================================================= + + - name: "Phase 4 | Scale down Deployments" + kubernetes.core.k8s_scale: + kind: Deployment + name: "{{ item.workload_name }}" + namespace: "{{ item.namespace }}" + replicas: 0 + wait: yes + wait_timeout: 60 + delegate_to: localhost + 
loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.workload_name }}" + when: item.workload_kind == 'Deployment' and item.workload_name != '' + ignore_errors: yes + + - name: "Phase 4 | Scale down StatefulSets" + kubernetes.core.k8s_scale: + kind: StatefulSet + name: "{{ item.workload_name }}" + namespace: "{{ item.namespace }}" + replicas: 0 + wait: yes + wait_timeout: 60 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.workload_name }}" + when: item.workload_kind == 'StatefulSet' and item.workload_name != '' + ignore_errors: yes + + - name: "Phase 4 | Delete any lingering Error-state pods that may hold volume attachments" + ansible.builtin.shell: | + kubectl get pods -n {{ item.namespace }} \ + --field-selector='status.phase=Failed' -o name | xargs -r kubectl delete -n {{ item.namespace }} + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}" + changed_when: false + ignore_errors: yes + + # ========================================================================= + # PHASE 5 — ATTACH VOLUME VIA MAINTENANCE TICKET + # ========================================================================= + + - name: "Phase 5 | Create VolumeAttachment maintenance ticket" + kubernetes.core.k8s: + state: present + definition: + apiVersion: longhorn.io/v1beta2 + kind: VolumeAttachment + metadata: + name: "{{ item.pv_name }}" + namespace: "{{ longhorn_namespace }}" + spec: + attachmentTickets: + recovery: + generation: 0 + id: recovery + nodeID: "{{ item.source_node }}" + parameters: + disableFrontend: "false" + type: longhorn-api + volume: "{{ item.pv_name }}" + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }} → {{ item.source_node }}" + + - name: "Phase 5 | Wait for volume to reach attached state" + kubernetes.core.k8s_info: + kind: Volume + api_version: longhorn.io/v1beta2 + namespace: "{{ longhorn_namespace }}" 
+ name: "{{ item.pv_name }}" + register: vol_state + until: > + (vol_state.resources | default([]) | first | default({}) ).status.state | default('') == 'attached' + retries: 24 + delay: 5 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + - name: "Phase 5 | Verify block device exists on target node" + ansible.builtin.stat: + path: "/dev/longhorn/{{ item.pv_name }}" + register: blockdev_check + delegate_to: "{{ item.source_node }}" + become: yes + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + - name: "Phase 5 | Fail if block device not present" + ansible.builtin.fail: + msg: > + Block device /dev/longhorn/{{ item.item.pv_name }} not found on + {{ item.item.source_node }} after volume attached — check Longhorn logs. + loop: "{{ blockdev_check.results }}" + loop_control: + label: "{{ item.item.pv_name }}" + when: not item.stat.exists + + # ========================================================================= + # PHASE 6 — INJECT DATA INTO LIVE BLOCK DEVICE + # ========================================================================= + + - name: "Phase 6 | Inject data via block device (mount, rsync, umount)" + ansible.builtin.shell: | + LIVE="{{ live_mount }}-{{ item.pvc_name }}" + SRC="{{ recovery_mount }}-{{ item.pvc_name }}" + BLOCKDEV="/dev/longhorn/{{ item.pv_name }}" + MERGED="{{ merged_base }}/{{ item.pvc_name }}.img" + + # Always unmount on exit (success or partial failure) + cleanup() { + mountpoint -q "$SRC" && umount "$SRC" || true + mountpoint -q "$LIVE" && umount "$LIVE" || true + } + trap cleanup EXIT + + mkdir -p "$LIVE" "$SRC" + + # Format if not already formatted (idempotent — safe on re-run) + if ! blkid "$BLOCKDEV" | grep -q 'TYPE='; then + mkfs.ext4 -F "$BLOCKDEV" + fi + + # Mount live block device if not already mounted + if ! mountpoint -q "$LIVE"; then + mount "$BLOCKDEV" "$LIVE" + fi + + # Mount merged recovery image read-only if not already mounted + if ! 
mountpoint -q "$SRC"; then + mount -o loop,ro,noload "$MERGED" "$SRC" + fi + + # Sync data — exclude lost+found + # --ignore-errors: continue past unreadable files (e.g. corrupted parts from power cut) + # rc=23 (partial transfer) is treated as success — bulk data transferred + rsync -av --ignore-errors --exclude='lost+found' "$SRC/" "$LIVE/" || \ + { RC=$?; [ $RC -eq 23 ] && echo "WARNING: rsync rc=23 (some files unreadable in source — expected for power-cut partitions)" || exit $RC; } + delegate_to: "{{ item.source_node }}" + become: yes + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pvc_name }}" + register: inject_output + changed_when: true + + - name: "Phase 6 | Show rsync output" + ansible.builtin.debug: + msg: "{{ item.stdout_lines | default([]) }}" + loop: "{{ inject_output.results }}" + loop_control: + label: "{{ item.item.pvc_name }}" + + # ========================================================================= + # PHASE 7 — DETACH VOLUME + # ========================================================================= + + - name: "Phase 7 | Remove recovery attachment ticket" + kubernetes.core.k8s_json_patch: + kind: VolumeAttachment + api_version: longhorn.io/v1beta2 + namespace: "{{ longhorn_namespace }}" + name: "{{ item.pv_name }}" + patch: + - op: remove + path: /spec/attachmentTickets/recovery + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + ignore_errors: yes + + - name: "Phase 7 | Wait for recovery ticket to be gone" + kubernetes.core.k8s_info: + kind: VolumeAttachment + api_version: longhorn.io/v1beta2 + namespace: "{{ longhorn_namespace }}" + name: "{{ item.pv_name }}" + register: va_state + until: > + (va_state.resources | default([]) | first | default({}) ).spec.attachmentTickets.recovery is not defined + retries: 24 + delay: 5 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + # 
========================================================================= + # PHASE 8 — RESTORE PV AND PVC + # ========================================================================= + + - name: "Phase 8 | Create PersistentVolume (Retain, no claimRef)" + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolume + metadata: + name: "{{ item.pv_name }}" + annotations: + pv.kubernetes.io/provisioned-by: driver.longhorn.io + spec: + accessModes: + - "{{ item.access_mode }}" + capacity: + storage: "{{ item.size_human }}" + csi: + driver: driver.longhorn.io + fsType: ext4 + volumeHandle: "{{ item.pv_name }}" + volumeAttributes: + dataEngine: v1 + dataLocality: disabled + disableRevisionCounter: "true" + numberOfReplicas: "3" + staleReplicaTimeout: "30" + persistentVolumeReclaimPolicy: Retain + storageClassName: longhorn + volumeMode: Filesystem + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + - name: "Phase 8 | Wait for PV to be Available or Bound" + kubernetes.core.k8s_info: + kind: PersistentVolume + name: "{{ item.pv_name }}" + register: pv_state + until: > + (pv_state.resources | default([]) | first | default({}) ).status.phase | default('') + in ['Available', 'Bound'] + retries: 12 + delay: 5 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.pv_name }}" + + - name: "Phase 8 | Check if PVC already bound to correct PV" + ansible.builtin.shell: > + kubectl get pvc {{ item.pvc_name }} -n {{ item.namespace }} + -o jsonpath='{.spec.volumeName}' 2>/dev/null || true + register: pvc_current_volume + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.pvc_name }}" + changed_when: false + + - name: "Phase 8 | Create PersistentVolumeClaim pinned to PV" + kubernetes.core.k8s: + state: present + definition: + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: "{{ item.item.pvc_name }}" + 
namespace: "{{ item.item.namespace }}" + spec: + accessModes: + - "{{ item.item.access_mode }}" + resources: + requests: + storage: "{{ item.item.size_human }}" + storageClassName: longhorn + volumeMode: Filesystem + volumeName: "{{ item.item.pv_name }}" + delegate_to: localhost + loop: "{{ pvc_current_volume.results }}" + loop_control: + label: "{{ item.item.namespace }}/{{ item.item.pvc_name }}" + when: item.stdout != item.item.pv_name + + - name: "Phase 8 | Wait for PVC to be Bound" + kubernetes.core.k8s_info: + kind: PersistentVolumeClaim + namespace: "{{ item.namespace }}" + name: "{{ item.pvc_name }}" + register: pvc_state + until: > + (pvc_state.resources | default([]) | first | default({}) ).status.phase | default('') == 'Bound' + retries: 12 + delay: 5 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.pvc_name }}" + + # ========================================================================= + # PHASE 9 — SCALE UP AND VERIFY + # ========================================================================= + + - name: "Phase 9 | Scale up Deployments" + kubernetes.core.k8s_scale: + kind: Deployment + name: "{{ item.workload_name }}" + namespace: "{{ item.namespace }}" + replicas: 1 + wait: yes + wait_timeout: 120 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.workload_name }}" + when: item.workload_kind == 'Deployment' and item.workload_name != '' + ignore_errors: yes + + - name: "Phase 9 | Scale up StatefulSets" + kubernetes.core.k8s_scale: + kind: StatefulSet + name: "{{ item.workload_name }}" + namespace: "{{ item.namespace }}" + replicas: 1 + wait: yes + wait_timeout: 120 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.workload_name }}" + when: item.workload_kind == 'StatefulSet' and item.workload_name != '' + ignore_errors: yes + + - name: "Phase 9 | Wait for workload to report ready 
replicas" + kubernetes.core.k8s_info: + kind: "{{ item.workload_kind }}" + name: "{{ item.workload_name }}" + namespace: "{{ item.namespace }}" + register: workload_state + until: > + (workload_state.resources | default([]) | first | default({}) ).status.readyReplicas | default(0) | int >= 1 + retries: 24 + delay: 5 + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.workload_name }}" + when: item.workload_name != '' + ignore_errors: yes + + - name: "Phase 9 | Run optional verification command in pod" + ansible.builtin.shell: > + kubectl exec -n {{ item.namespace }} + $(kubectl get pod -n {{ item.namespace }} + -l statefulset.kubernetes.io/pod-name={{ item.workload_name }}-0 + --no-headers -o custom-columns=':metadata.name' 2>/dev/null || + kubectl get pod -n {{ item.namespace }} {{ item.workload_name }}-0 + --no-headers -o custom-columns=':metadata.name' 2>/dev/null) + -- sh -c '{{ item.verify_cmd }}' + delegate_to: localhost + loop: "{{ _volumes }}" + loop_control: + label: "{{ item.namespace }}/{{ item.workload_name }}" + when: item.verify_cmd | default('') != '' + register: verify_output + changed_when: false + ignore_errors: yes + + - name: "Phase 9 | Show verification output" + ansible.builtin.debug: + msg: "{{ item.stdout_lines | default([]) }}" + loop: "{{ verify_output.results | default([]) }}" + loop_control: + label: "{{ item.item.pvc_name | default('') }}" + when: item.stdout_lines is defined and item.item.verify_cmd | default('') != '' + + # ========================================================================= + # RECOVERY SUMMARY + # ========================================================================= + + - name: "Summary | Recovery complete" + ansible.builtin.debug: + msg: | + ╔══════════════════════════════════════════════════════╗ + ā•‘ Longhorn Block-Device Recovery Complete ā•‘ + 
ā•šā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā•ā• + Volumes recovered: + {% for v in _volumes %} + • {{ v.pvc_name }} ({{ v.namespace }}) ← {{ v.source_node }}:{{ v.source_dir }} + {% endfor %} + + Backups retained at: {{ backup_base }}// + Merged images at: {{ merged_base }}/.img + + Next steps: + 1. Verify application data through the app UI / API + 2. Repeat for remaining volumes (update vars file) + 3. Run a fresh k8s_pvc backup once all volumes are healthy diff --git a/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars.example.yml b/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars.example.yml new file mode 100644 index 0000000..404862a --- /dev/null +++ b/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars.example.yml @@ -0,0 +1,84 @@ +--- +# Example vars file for playbooks/recover/longhorn_data.yml +# +# Usage: +# ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \ +# -e @playbooks/recover/longhorn_data_vars.example.yml +# +# HOW TO FILL THIS IN: +# +# 1. Find untouched replica dirs across all nodes: +# for node in pi1 pi2 pi3; do +# echo "=== $node ===" +# ssh $node "sudo du -sh /mnt/arcodange/longhorn/replicas/pvc--* 2>/dev/null" +# done +# Pick the dir with the largest size (>16K) and oldest timestamps (from before the incident). +# +# 2. Get pv_name and pvc_name from PV/PVC backup: +# cat /home/pi/arcodange/backups/k3s_pvc/backup_*.volumes | grep -A5 "kind: PersistentVolume" +# +# 3. Get size_bytes from Longhorn volume spec or from: +# cat /mnt/arcodange/longhorn/replicas//volume.meta +# +# 4. source_node = the node where the untouched dir lives +# source_dir = the exact directory name (e.g. 
pvc-abc123-998f49ff) + # + # Fields: + # pv_name — Longhorn volume name, equals the PV name (pvc-<uuid>) [REQUIRED] + # pvc_name — PVC name in the namespace [REQUIRED] + # namespace — namespace where the PVC lives [REQUIRED] + # size_bytes — volume capacity in bytes as a string (from volume spec) [REQUIRED] + # size_human — human-readable size for PVC spec (e.g. 128Mi, 8Gi) [REQUIRED] + # access_mode — ReadWriteOnce or ReadWriteMany [REQUIRED] + # workload_kind — Deployment or StatefulSet [REQUIRED] + # workload_name — name of the workload to scale down/up [REQUIRED] + # source_node — node holding the untouched replica dir (pi1/pi2/pi3) [OPTIONAL — auto-discovered] + # source_dir — exact replica dir name on source_node [OPTIONAL — auto-discovered] + # verify_cmd — shell command to run inside pod to confirm data after restore [OPTIONAL] + # + # source_node and source_dir are auto-discovered by Phase 0 (largest dir >16K across all + # nodes). Override them manually only if you want to force a specific replica dir. 
+ +longhorn_recovery_volumes: + + # --- url-shortener (example, already recovered 2026-04-14) --- + - pv_name: pvc-cdd434d1-c8b4-4a75-acde-2978ec9febd4 + pvc_name: url-shortener-data + namespace: url-shortener + size_bytes: "134217728" + size_human: 128Mi + access_mode: ReadWriteOnce + workload_kind: Deployment + workload_name: url-shortener + source_node: pi3 + source_dir: pvc-cdd434d1-c8b4-4a75-acde-2978ec9febd4-998f49ff + verify_cmd: "sqlite3 /data/urls.db 'SELECT COUNT(*) FROM urls;'" + + # --- traefik (example, already recovered 2026-04-14) --- + # - pv_name: pvc-<uuid> + # pvc_name: traefik-data + # namespace: traefik + # size_bytes: "134217728" + # size_human: 128Mi + # access_mode: ReadWriteOnce + # workload_kind: Deployment + # workload_name: traefik + # source_node: pi3 + # source_dir: pvc-<uuid>-<replica-id> + # verify_cmd: "" + + # --- vault (uncomment and fill for recovery) --- + # - pv_name: pvc-<uuid> + # pvc_name: vault-data + # namespace: vault + # size_bytes: "1073741824" + # size_human: 1Gi + # access_mode: ReadWriteOnce + # workload_kind: StatefulSet + # workload_name: vault + # source_node: pi2 + # source_dir: pvc-<uuid>-<replica-id> + # verify_cmd: "" + + # Add more volumes here following the same pattern. + # Process one at a time first to validate, then batch. 
diff --git a/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_clickhouse.yml b/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_clickhouse.yml new file mode 100644 index 0000000..146a923 --- /dev/null +++ b/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_clickhouse.yml @@ -0,0 +1,17 @@ +--- +# Recovery vars for Clickhouse +# Source: pi3, dir pvc-1251909b-...-1163420b (2.6G — largest, snapshot verified non-zero) +# Generated: 2026-04-14 + +longhorn_recovery_volumes: + - pv_name: pvc-1251909b-3cef-40c6-881c-3bb6e929a596 + pvc_name: clickhouse-storage-clickhouse-0 + namespace: tools + size_bytes: "17179869184" # 16Gi + size_human: 16Gi + access_mode: ReadWriteOnce + workload_kind: StatefulSet + workload_name: clickhouse + source_node: pi3 + source_dir: pvc-1251909b-3cef-40c6-881c-3bb6e929a596-1163420b + verify_cmd: "clickhouse-client --query 'SHOW DATABASES'" diff --git a/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_erp_vault.yml b/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_erp_vault.yml new file mode 100644 index 0000000..a60ce23 --- /dev/null +++ b/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_erp_vault.yml @@ -0,0 +1,38 @@ +--- +# Recovery vars for erp and hashicorp-vault volumes +# source_node/source_dir omitted — auto-discovered by Phase 0 + +longhorn_recovery_volumes: + + - pv_name: pvc-7971918e-e47f-4739-a976-965ea2d770b4 + pvc_name: erp + namespace: erp + size_bytes: "53687091200" + size_human: 50Gi + access_mode: ReadWriteMany + workload_kind: Deployment + workload_name: "" # intentionally blank — ERP needs Vault unsealed first; scale up manually + verify_cmd: "" + + # hashicorp-vault StatefulSet has two PVCs (audit + data). + # workload_name is set only on the last entry so the StatefulSet is scaled up + # once after both volumes are ready, not between them. 
+ - pv_name: pvc-6d2ea1c7-9327-4992-a02c-93ae604eda70 + pvc_name: audit-hashicorp-vault-0 + namespace: tools + size_bytes: "10737418240" + size_human: 10Gi + access_mode: ReadWriteOnce + workload_kind: StatefulSet + workload_name: "" + verify_cmd: "" + + - pv_name: pvc-ca5567d3-a682-4cee-8ff1-2b8e23260635 + pvc_name: data-hashicorp-vault-0 + namespace: tools + size_bytes: "10737418240" + size_human: 10Gi + access_mode: ReadWriteOnce + workload_kind: StatefulSet + workload_name: hashicorp-vault + verify_cmd: "" diff --git a/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_remaining.yml b/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_remaining.yml new file mode 100644 index 0000000..790bd03 --- /dev/null +++ b/ansible/arcodange/factory/playbooks/recover/longhorn_data_vars_remaining.yml @@ -0,0 +1,47 @@ +--- +# Recovery vars for remaining volumes (prometheus, alertmanager, redis, backups-rwx) +# source_node and source_dir intentionally omitted — auto-discovered by Phase 0 + +longhorn_recovery_volumes: + + - pv_name: pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9 + pvc_name: prometheus-server + namespace: tools + size_bytes: "8589934592" + size_human: 8Gi + access_mode: ReadWriteOnce + workload_kind: Deployment + workload_name: prometheus-server + source_node: pi2 + source_dir: pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9-910583f6 + verify_cmd: "" + + - pv_name: pvc-aed7f2c4-1948-487a-8d10-d8a1372289b4 + pvc_name: storage-prometheus-alertmanager-0 + namespace: tools + size_bytes: "2147483648" + size_human: 2Gi + access_mode: ReadWriteOnce + workload_kind: StatefulSet + workload_name: prometheus-alertmanager + verify_cmd: "" + + - pv_name: pvc-d1d5482b-81c8-4d7c-a528-7a57ef47a5ce + pvc_name: redis-storage-redis-0 + namespace: tools + size_bytes: "1073741824" + size_human: 1Gi + access_mode: ReadWriteOnce + workload_kind: StatefulSet + workload_name: redis + verify_cmd: "redis-cli ping" + + - pv_name: pvc-efda1d2f-1db8-46dd-9a97-3d11f1807ffa + 
pvc_name: backups-rwx + namespace: longhorn-system + size_bytes: "53687091200" + size_human: 50Gi + access_mode: ReadWriteMany + workload_kind: Deployment + workload_name: "" + verify_cmd: ""