Compare commits

41 commits: a836419bdf...vibe/batch

| SHA1 |
|---|
| 1a1d7da329 |
| 9e821e1626 |
| 69b7e9ddcb |
| 069edd72f1 |
| a644436746 |
| a3526e51f8 |
| 01f0f37691 |
| f114d7e6f0 |
| 1688fe0dfd |
| 499410a160 |
| e3e0decd98 |
| 1ae28cb944 |
| 934b62d922 |
| 09a270d179 |
| 0ce004cc6a |
| e6fc24c101 |
| 355ab11c4d |
| ad70b424cf |
| b299469d00 |
| fc9164f11e |
| c751b621ba |
| 07a619b274 |
| 9931f81998 |
| 437fd506ed |
| 943915be74 |
| 8a82d14797 |
| 0285d171ff |
| 55d137132f |
| 451dfa5133 |
| 17e99db641 |
| 07e5ff460b |
| 5b3c896a25 |
| 91219c49f1 |
| 74b8676244 |
| 1fd47e9d97 |
| 0fbfbd589f |
| 8d6be311ae |
| 2b4aa30a64 |
| cd3c4d86ff |
| 45d39d13b4 |
| f4cb04c9c9 |
@@ -19,10 +19,11 @@ concurrency:

 .vault_step: &vault_step
   name: read vault secret
-  uses: https://gitea.arcodange.duckdns.org/arcodange-org/vault-action.git@main
+  uses: https://gitea.arcodange.lab/arcodange-org/vault-action.git@main
   id: vault-secrets
   with:
-    url: https://vault.arcodange.duckdns.org
+    url: https://vault.arcodange.lab
     caCertificate: ${{ secrets.HOMELAB_CA_CERT }}
     jwtGiteaOIDC: ${{ needs.gitea_vault_auth.outputs.gitea_vault_jwt }}
     role: gitea_cicd
     method: jwt

@@ -53,9 +54,12 @@ jobs:
   env:
     OPENTOFU_VERSION: 1.8.2
     TERRAFORM_VAULT_AUTH_JWT: ${{ needs.gitea_vault_auth.outputs.gitea_vault_jwt }}
     VAULT_CACERT: "${{ github.workspace }}/homelab.pem"
   steps:
     - *vault_step
     - uses: actions/checkout@v4
     - name: prepare vault self signed cert
       run: echo -n "${{ secrets.HOMELAB_CA_CERT }}" | base64 -d > $VAULT_CACERT
     - name: terraform apply
       uses: dflook/terraform-apply@v1
       with:
@@ -17,10 +17,11 @@ concurrency:

 .vault_step: &vault_step
   name: read vault secret
-  uses: https://gitea.arcodange.duckdns.org/arcodange-org/vault-action.git@main
+  uses: https://gitea.arcodange.lab/arcodange-org/vault-action.git@main
   id: vault-secrets
   with:
-    url: https://vault.arcodange.duckdns.org
+    url: https://vault.arcodange.lab
     caCertificate: ${{ secrets.HOMELAB_CA_CERT }}
     jwtGiteaOIDC: ${{ needs.gitea_vault_auth.outputs.gitea_vault_jwt }}
     role: gitea_cicd
     method: jwt

@@ -50,9 +51,12 @@ jobs:
   env:
     OPENTOFU_VERSION: 1.8.2
     TERRAFORM_VAULT_AUTH_JWT: ${{ needs.gitea_vault_auth.outputs.gitea_vault_jwt }}
     VAULT_CACERT: "${{ github.workspace }}/homelab.pem"
   steps:
     - *vault_step
     - uses: actions/checkout@v4
     - name: prepare vault self signed cert
       run: echo -n "${{ secrets.HOMELAB_CA_CERT }}" | base64 -d > $VAULT_CACERT
     - name: terraform apply
       uses: dflook/terraform-apply@v1
       with:
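
The `prepare vault self signed cert` step above assumes the `HOMELAB_CA_CERT` secret holds the CA certificate as base64. A minimal sketch of producing that value from an exported root certificate (the file name is an assumption, adjust to wherever the Step CA root is exported):

```sh
# Encode the homelab root CA so it can be stored as the HOMELAB_CA_CERT secret
base64 -w0 arcodange-lab-ca.crt

# Sanity check: round-trip it the same way the workflow does before trusting it in CI
base64 -w0 arcodange-lab-ca.crt | base64 -d | openssl x509 -noout -subject
```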
.gitignore (vendored)

@@ -1,4 +1,5 @@
 .terraform
 .terraform.*
 .DS_Store
 node_modules/
+.venv/
@@ -1,5 +1,17 @@
 # Use Ansible
 
+## Run locally (uv)
+
+A project-local venv is defined in `pyproject.toml` at the repo root (ansible-core + the `kubernetes`, `jmespath`, `dnspython` libraries that `kubernetes.core` and friends need at runtime).
+
+```sh
+uv sync    # creates .venv/ and installs ansible-core + python deps
+uv run ansible-galaxy collection install -r ansible/requirements.yml
+uv run ansible-playbook -i ansible/arcodange/factory/inventory ansible/arcodange/factory/playbooks/<playbook>.yml
+```
+
+The localhost entry in the inventory uses `ansible_python_interpreter: "{{ ansible_playbook_python }}"`, so `uv run` is enough — Ansible picks up the venv's Python automatically without any hardcoded path.
+
 ## Run with docker ssh agent side proxy
 
 ### build docker images

@@ -67,31 +79,25 @@ ansible -i ,localhost -c local localhost -m raw -a "echo hello world {{ inventor
 
 ### local python environment with uv
 
-#### Install UV
-
-`python3 -m pip install uv`
-`python3 -m uv python install 3.10 3.11 3.12`
-`echo "export PATH=\"$(find ~/Library/Python/*/bin/uv | xargs dirname)\"" >> ~/.zshenv`
-`echo 'export PATH="~/.local/bin:$PATH"' >> ~/.zshenv`
-
-#### Set python version to 3.12
-
-`uv python pin 3.12` (edit .python-version file)
-
-#### Install ansible
-
-`uv tool install ansible-core --with dnspython --with jmespath --with kubernetes`
-`echo 'export PATH="~/.local/share/uv/tools/ansible-core/bin:$PATH"' >> ~/.zshenv`
-
-#### Install this project depedencies
-
-ansible-galaxy collection install --token 11bebd8fd1ad4009f700bdedbeb80b19743ce3d3 -r ansible/requirements.yml # token is used by a rate limiter and can be sensitive
+#### Install UV (one-time)
+
+```sh
+python3 -m pip install uv
+python3 -m uv python install 3.12
+echo 'export PATH="$HOME/.local/bin:$PATH"' >> ~/.zshenv
+```
+
+#### Bootstrap the project venv
+
+```sh
+uv sync    # honors .python-version (3.12) and pyproject.toml
+uv run ansible-galaxy collection install -r ansible/requirements.yml
+# `--token <token>` is only needed if you hit galaxy.ansible.com rate limits
+```
 
 #### Run
 
-```
-ansible-galaxy collection install ./ansible/arcodange/factory -f
-ansible-playbook -i ansible/arcodange/factory/inventory ansible/arcodange/factory/playbooks/02_setup.yml
-```
+```sh
+uv run ansible-galaxy collection install ./ansible/arcodange/factory -f
+uv run ansible-playbook -i ansible/arcodange/factory/inventory ansible/arcodange/factory/playbooks/02_setup.yml
+```
@@ -10,41 +10,68 @@ kubectl create secret generic traefik-duckdns-token --from-literal="DUCKDNS_TOKE
 ```mermaid
 %%{init: { 'logLevel': 'debug', 'theme': 'dark' } }%%
 timeline
-    title playbook order
-    section Setup DNS, OS, ...
-        manual configuration
-            : install OS, reserve a static IP, configure SSH, VNC
-            : format and create partitions with gparted
-    section Docker & K3S
-        system
-            : install Docker
-            : install K3S working with docker
-            : configure Traefik
-    section Volume, NFS
-        setup hard_disk
-            : mount the partitions
-            : install NFS
-        system
-            : deploy the NFS provisioner
-    section postgres
-        setup
-            : postgres
-    section gitea
-        setup
-            : gitea
-    section gitea action runner
-        setup
-            : gitea action runner
-    section argo cd
-        argo_cd
-            : argo cd
-    section hello world app
-        setup git repository
-            : terraform
-        setup CI
-        deploy
-            : dev : list exposed deployments with label and port as a landing page
-            : expose (as ngrok ? direct ? port ?)
+    title Playbook Execution Sequence
+    section 01_system
+        rpi
+            : set hostname
+        dns
+            : install pi-hole
+        ssl
+            : step-ca
+            : fetch root certificate
+            : build docker image with CA
+        prepare_disks
+            : list partitions
+            : format disk
+            : mount disk
+        system_docker
+            : install docker
+            : configure docker storage
+            : restart docker
+        longhorn
+            : deploy longhorn
+        k3s
+            : prepare inventory
+            : install k3s collection
+            : install socat
+            : deploy k3s cluster
+            : configure kubeconfig
+            : configure traefik
+            : configure cert-manager
+    section 02_setup
+        backup_nfs
+            : create RWX volume
+            : create recurring job
+            : deploy NFS
+            : mount NFS
+        postgres
+            : create database
+            : create user
+        gitea
+            : deploy gitea
+            : create admin user
+            : create organization
+    section 03_cicd
+        cicd : CI/CD
+        gitea_token
+            : generate token
+        deploy_docker_compose
+            : deploy gitea action
+        argocd
+            : generate token
+            : deploy argocd
+    section 04_tools
+        Hashicorp Vault
+            : gitea_token
+            : hashicorp_vault
+        Crowdsec
+            : crowdsec
+    section 05_backup
+        Gitea Backup
+            : gitea
+        K3s PVC Backup
+            : k3s_pvc
+        Postgres Backup
+            : create backup script
+            : create restore script
 ```
ansible/arcodange/factory/ansible.cfg (new file)

@@ -0,0 +1,5 @@
[defaults]
collections_path = ~/.ansible/collections

[ssh_connection]
scp_if_ssh = True
ansible/arcodange/factory/docs/adr/20260407-cicd-architecture.md (new file)
@@ -0,0 +1,160 @@
# ADR 20260407: CI/CD Architecture with ArgoCD, Gitea, and Vault

## Status
Proposed

## Context
The home lab requires a secure and automated CI/CD pipeline to deploy applications to the k3s cluster. The pipeline must integrate with:
- **Gitea**: For Git repository management and CI runners.
- **ArgoCD**: For GitOps-based continuous deployment.
- **Vault**: For secrets management and OIDC authentication.
- **Gitea Act Runner**: For executing CI jobs.

## Decision
We will implement a **GitOps-driven CI/CD pipeline** with the following components:

### 1. Gitea OIDC Authentication with Vault
- Gitea is registered as an OIDC application in Vault.
- Vault issues short-lived tokens for Gitea users.
- The `gitea_oidc_auth.yml` playbook automates this setup using Playwright and OpenTofu.
- **OIDC Workflow**:
  1. The `oidc_jwt_token.sh` script (base64-encoded in `secrets.vault_oauth__sh_b64`) handles the OIDC flow.
  2. Gitea Act Runner executes the script to obtain an ID token from Gitea.
  3. The ID token is used to authenticate with Vault and retrieve secrets.
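
The last two steps of the flow above can be exercised by hand for debugging. A minimal sketch, assuming the ID token produced by `oidc_jwt_token.sh` is already in `$ID_TOKEN`, that Vault's JWT auth method is mounted at the default `auth/jwt` path, and that `jq` is available:

```bash
export VAULT_ADDR=https://vault.arcodange.lab
export VAULT_CACERT=./homelab.pem

# Exchange the Gitea-issued ID token for a Vault token bound to the gitea_cicd role
VAULT_TOKEN=$(vault write -format=json auth/jwt/login role=gitea_cicd jwt="$ID_TOKEN" \
  | jq -r '.auth.client_token')

# Read a secret with the returned token (the path is the kvv2 example used in this ADR)
VAULT_TOKEN="$VAULT_TOKEN" vault kv get kvv2/webapp/config
```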

### 2. Gitea Act Runner
- Deployed on `pi1` and `pi3` (not on the Gitea host, which is `pi2`).
- Uses Docker-in-Docker for job execution.
- **Custom Runner Image (`ubuntu-latest-ca`)**: Required due to the self-signed `.lab` domain. The custom image includes the local CA certificate to trust the Gitea instance (`gitea.arcodange.lab`).
- Managed via Docker Compose (`03_cicd.yml`).

### 3. ArgoCD
- Deployed on the k3s cluster (via HelmChart in `/var/lib/rancher/k3s/server/manifests/argocd.yaml`).
- Uses Gitea as the source of truth for GitOps.
- Synchronizes the `factory` repository to deploy applications.
- Configured with Traefik for TLS termination.

### 4. Vault Secrets Operator
- Deployed in the `tools` namespace.
- Manages secrets for applications deployed via ArgoCD.
- Integrates with Gitea OIDC for authentication.
- **Helm Chart Integration**:
  - `VaultAuth`: Authenticates with Vault using Kubernetes service accounts.
  - `VaultStaticSecret`: Retrieves static secrets (e.g., `kvv2/webapp/config`).
  - `VaultDynamicSecret`: Generates dynamic secrets (e.g., PostgreSQL credentials).
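
A quick way to see whether these custom resources exist and have synced; a hedged sketch, the resource kinds come from the Vault Secrets Operator CRDs and the namespace is the `tools` example above:

```bash
kubectl get vaultauth,vaultstaticsecret,vaultdynamicsecret -A
kubectl -n tools describe vaultstaticsecret <name>   # inspect status conditions for sync errors
```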

### 5. Security
- **TLS**: Traefik terminates TLS using Let's Encrypt.
- **OIDC**: Gitea authentication via Vault.
- **Secrets**: Stored in Vault, injected via the Vault Secrets Operator.

## Architecture Diagram

```mermaid
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#333333', 'edgeLabelBackground':'#f0f0f0', 'tertiaryColor': '#e67e22'}}}%%
graph TD
    %% Styles
    classDef gitea fill:#ffcc99,stroke:#cc9966,color:#333;
    classDef argocd fill:#99ffcc,stroke:#66cc99,color:#333;
    classDef vault fill:#ccccff,stroke:#6666cc,color:#333;
    classDef k3s fill:#ff9999,stroke:#cc0000,color:#333;
    classDef runner fill:#ffff99,stroke:#cccc00,color:#333;

    %% Components
    Gitea["Gitea (pi2)"]:::gitea
    ArgoCD["ArgoCD (k3s)"]:::argocd
    Vault["Vault (k3s/tools)"]:::vault
    Runner1["Gitea Act Runner (pi1)"]:::runner
    Runner2["Gitea Act Runner (pi3)"]:::runner
    VaultOperator["Vault Secrets Operator (k3s/tools)"]:::vault
    k3s["k3s Cluster"]:::k3s

    %% Workflow
    Gitea -->|OIDC Auth| Vault
    Gitea -->|Trigger CI| Runner1
    Gitea -->|Trigger CI| Runner2
    Runner1 -->|Deploy to| k3s
    Runner2 -->|Deploy to| k3s
    ArgoCD -->|GitOps Sync| Gitea
    ArgoCD -->|Deploy Apps| k3s
    VaultOperator -->|Inject Secrets| k3s
    Vault -->|Secrets| VaultOperator

    %% Annotations
    linkStyle 0,1,2,3,4,5,6,7 stroke:#999,stroke-width:1px;
```

## Consequences

### Positive
- **Automated Deployments**: ArgoCD ensures the cluster state matches Git.
- **Secure Secrets**: Vault centralizes secret management.
- **Scalable CI**: Gitea Act Runners can be added to any host.
- **OIDC Integration**: Secure authentication via Vault.

### Negative
- **Complexity**: Multiple moving parts (Gitea, ArgoCD, Vault).
- **Dependency on Vault**: If Vault fails, CI/CD may be disrupted.
- **Learning Curve**: Requires familiarity with GitOps and Vault.

## Alternatives Considered

### Alternative 1: GitHub Actions
- **Rejected**: Self-hosted Gitea aligns better with the home lab's privacy goals.

### Alternative 2: Jenkins
- **Rejected**: ArgoCD + Gitea Act Runner is lighter and more GitOps-native.

### Alternative 3: No CI/CD
- **Rejected**: Manual deployments are error-prone and unscalable.

## Sequence Diagrams

### 1. CI/CD Workflow for OpenTofu/Terraform

```mermaid
sequenceDiagram
    participant Gitea
    participant Runner as Gitea Act Runner (pi1/pi3)
    participant Vault
    participant WebApp as WebApp (k3s)

    Gitea->>Runner: Trigger vault.yaml workflow
    Runner->>Gitea: Execute vault_oauth__sh_b64 (OIDC)
    Gitea-->>Runner: Return ID Token
    Runner->>Vault: Authenticate with ID Token
    Vault-->>Runner: Return Vault Token
    Runner->>Runner: Run OpenTofu/Terraform
    Runner->>Vault: Fetch Secrets (via Vault Action)
    Vault-->>Runner: Return Secrets
    Runner->>WebApp: Deploy Changes
```

### 2. Vault Secrets Operator Workflow

```mermaid
sequenceDiagram
    participant ArgoCD
    participant WebApp as WebApp (k3s)
    participant VaultOperator as Vault Secrets Operator
    participant Vault

    ArgoCD->>WebApp: Deploy Helm Chart
    WebApp->>VaultOperator: Create VaultAuth (K8s Auth)
    VaultOperator->>Vault: Authenticate (K8s Service Account)
    Vault-->>VaultOperator: Return Vault Token
    WebApp->>VaultOperator: Create VaultStaticSecret (kvv2/webapp/config)
    VaultOperator->>Vault: Fetch Static Secret
    Vault-->>VaultOperator: Return Secret
    VaultOperator->>WebApp: Inject Secret (secretkv)
    WebApp->>VaultOperator: Create VaultDynamicSecret (postgres/creds/webapp)
    VaultOperator->>Vault: Generate Dynamic Secret
    Vault-->>VaultOperator: Return Credentials
    VaultOperator->>WebApp: Inject Credentials (vso-db-credentials)
    WebApp->>WebApp: Restart Pods (Rollout)
```

## Success Metrics
- Gitea Act Runners successfully execute CI jobs.
- ArgoCD synchronizes the `factory` repository without errors.
- Vault Secrets Operator injects secrets into deployed applications.
@@ -0,0 +1,152 @@
# ADR 20260407: Docker Storage Optimization for Gitea Act Runner

## Status
Proposed

## Context
The `pi3` machine (Raspberry Pi) is running both Docker and k3s, with the following storage constraints:
- Root filesystem (`/dev/mmcblk0p2`): 58G total, 89% used (6.4G free)
- External disk (`/dev/sda1`): 458G total, 22G used (413G free)

Gitea Act Runner images (`ubuntu-latest` and `ubuntu-latest-ca`) are frequently deleted, likely due to Docker's automatic garbage collection triggered by low disk space. This disrupts CI/CD pipelines.
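
The disk-pressure picture can be reproduced before changing anything with read-only commands (a sketch, assuming SSH access to pi3):

```bash
ssh pi3 "df -h / /mnt/arcodange"                     # root filesystem vs external disk usage
ssh pi3 "docker system df"                           # image/container/volume space usage
ssh pi3 "docker images | grep runner-images"         # are the runner images still present?
```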

### Current Setup
- Docker is configured via Ansible (`system_docker.yml`) using the `geerlingguy.docker` role.
- k3s is configured to use Docker as the container runtime (`--docker` flag).
- Longhorn is used for persistent storage in k3s, and we want to preserve its performance.

## Decision
We will implement a **hybrid storage strategy** to prevent Gitea Act Runner image deletion while maintaining Longhorn performance:

### Docker Storage Optimization Flow

```mermaid
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#333333', 'edgeLabelBackground':'#f0f0f0', 'tertiaryColor': '#e67e22'}}}%%
sequenceDiagram
    participant Ansible
    participant Docker
    participant ExternalDisk
    participant GiteaRunner
    participant Longhorn

    Ansible->>Docker: Configure /etc/docker/daemon.json
    Docker->>ExternalDisk: Use /mnt/arcodange/docker for storage
    Ansible->>Docker: Restart Docker
    Docker->>GiteaRunner: Pull ubuntu-latest-ca image
    Ansible->>Docker: Pin image (dummy container)
    Docker->>GiteaRunner: Start CI job
    GiteaRunner->>Longhorn: Use persistent storage (unaffected)
    Docker->>ExternalDisk: Store images (413G free)
    Docker->>Docker: Skip garbage collection (pinned)
```

### 1. Pin Critical Images
Use a dummy container to pin the Gitea Act Runner images:
```yaml
# Add to system_docker.yml or a new playbook
- name: Pin Gitea Act Runner images
  community.docker.docker_container:
    name: pin-gitea-runner-ubuntu-latest-ca
    image: gitea.arcodange.lab/arcodange-org/runner-images:ubuntu-latest-ca
    state: present
    command: ["sh", "-c", "sleep infinity"]
    auto_remove: false
    restart_policy: unless-stopped
```

### 2. Configure Docker Storage with Overlay on External Disk
Modify `/etc/docker/daemon.json` to use the external disk for storage while keeping the root filesystem for metadata:
```json
{
  "data-root": "/mnt/arcodange/docker",
  "storage-driver": "overlay2",
  "storage-opts": ["overlay2.override_kernel_check=true"]
}
```
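
After Docker is restarted with this file, a quick check confirms the storage move took effect (these `docker info` fields are standard):

```bash
docker info --format '{{ .DockerRootDir }}'   # expect /mnt/arcodange/docker
docker info --format '{{ .Driver }}'          # expect overlay2
```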

### 3. Ansible Implementation
Update `system_docker.yml` to:
1. Create `/mnt/arcodange/docker` if it doesn't exist.
2. Configure Docker to use the external disk.
3. Pin critical images post-installation.

```yaml
# Add to system_docker.yml tasks
- name: Ensure Docker storage directory exists on external disk
  ansible.builtin.file:
    path: /mnt/arcodange/docker
    state: directory
    mode: '0755'
    owner: root
    group: docker

- name: Configure Docker to use external storage
  ansible.builtin.copy:
    dest: /etc/docker/daemon.json
    content: |
      {
        "data-root": "/mnt/arcodange/docker",
        "storage-driver": "overlay2",
        "storage-opts": ["overlay2.override_kernel_check=true"],
        "log-driver": "json-file",
        "log-opts": {
          "max-size": "10m",
          "max-file": "5"
        }
      }
    mode: '0644'
  notify: Redémarrer Docker

- name: Pin Gitea Act Runner images
  community.docker.docker_container:
    name: "{{ item.name }}"
    image: "{{ item.image }}"
    state: present
    command: ["sh", "-c", "sleep infinity"]
    auto_remove: false
    restart_policy: unless-stopped
  loop:
    - { name: "pin-gitea-runner-ubuntu-latest", image: "gitea/runner-images:ubuntu-latest" }
    - { name: "pin-gitea-runner-ubuntu-latest-ca", image: "gitea.arcodange.lab/arcodange-org/runner-images:ubuntu-latest-ca" }
```

## Consequences

### Positive
- **Prevents Image Deletion**: Critical images are pinned and won't be garbage-collected.
- **Preserves Longhorn Performance**: Longhorn continues to use the root filesystem for its operations, maintaining performance.
- **Scalable Storage**: Docker images are stored on the external disk (413G free), preventing root filesystem exhaustion.
- **No k3s Changes Required**: k3s continues to use Docker as the runtime without modification.

### Negative
- **Migration Effort**: Existing Docker data must be migrated to the external disk (one-time operation).
- **Dependency on External Disk**: If `/dev/sda1` fails, Docker will not function until the disk is remounted or the configuration is reverted.
- **Slight Performance Overhead**: Accessing images from the external disk may be slightly slower than the root filesystem (mitigated by SSD/HDD performance).

## Alternatives Considered

### Alternative 1: Increase Root Filesystem Size
- **Rejected**: The SD card is already at capacity, and expanding it is not feasible.

### Alternative 2: Disable Docker Garbage Collection
- **Rejected**: This would risk filling the root filesystem completely, causing system instability.

### Alternative 3: Use k3s Image Garbage Collection
- **Rejected**: k3s does not provide fine-grained control over image retention for non-k8s workloads (e.g., Gitea Act Runner).

### Alternative 4: Save/Load Images Manually
- **Rejected**: Manual intervention is not scalable and does not address the root cause.

## Migration Plan
1. **Backup**: Save critical images to `/mnt/arcodange`:
```bash
docker save gitea.arcodange.lab/arcodange-org/runner-images:ubuntu-latest-ca -o /mnt/arcodange/gitea-runner-backup.tar
```
2. **Update Ansible**: Apply the changes to `system_docker.yml`.
3. **Run Playbook**: Execute the playbook to reconfigure Docker.
4. **Verify**: Ensure Gitea Act Runner functions correctly post-migration.
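
A minimal smoke test for step 4 (read-only commands; the repository name is the one used above):

```bash
docker images gitea.arcodange.lab/arcodange-org/runner-images   # pinned images still present
du -sh /mnt/arcodange/docker                                     # image data now on the external disk
df -h /                                                          # root filesystem usage should be dropping
```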

## Success Metrics
- Gitea Act Runner images are no longer deleted between runs.
- Root filesystem usage drops below 80%.
- CI/CD pipelines complete without image pull errors.
@@ -0,0 +1,576 @@
# ADR 20260407: Network Architecture

## Status
Proposed

## Context
The home lab requires a secure and resilient network architecture to support:
- Internal services (`.lab` domain).
- External services (`.arcodange.fr` domain).
- DNS resolution and ad-blocking (Pi-hole).
- TLS certificate management (Step CA).
- Ingress routing (Traefik).
- CDN and DDoS protection (Cloudflare).

## Decision
We will implement a **multi-layered network architecture** with the following components:

### 1. External Layer (Internet)
- **Cloudflare**: CDN, DDoS protection, and DNS for `.arcodange.fr`.
- **DuckDNS**: Dynamic DNS for external access.
- **Livebox**: ISP-provided gateway (NAT, DHCP, firewall).

### 2. Internal Layer (Home Lab)
- **Pi-hole (pi1, pi3)**: DNS sinkhole for ad-blocking and internal DNS resolution.
- **Step CA (pi1)**: Internal certificate authority for `.lab` domain.
- **Traefik (k3s)**: Ingress controller with TLS termination.
- **k3s Cluster**: Hosts internal services with Longhorn storage.

### 3. DNS Architecture
- **Pi-hole**: Primary DNS for internal clients.
  - Forwards `.lab` queries to Step CA.
  - Forwards external queries to Cloudflare (1.1.1.1).
- **Step CA**: Issues certificates for `.lab` services.
- **Cloudflare**: Manages `.arcodange.fr` DNS records.
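
A hedged sketch of verifying this split resolution from a LAN client; the Pi-hole address 192.168.1.201 is taken from the internal DNS ADR, and the hostnames are examples used elsewhere in this repo:

```bash
dig @192.168.1.201 gitea.arcodange.lab +short   # should be answered internally
dig @192.168.1.201 arcodange.fr +short          # should be forwarded to the external upstream
```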

### 4. Ingress and TLS
- **Traefik**: Terminates TLS for both `.lab` and `.arcodange.fr` domains.
  - Uses Let's Encrypt for `.arcodange.fr`.
  - Uses Step CA for `.lab`.
- **Helm Chart Annotations**:
  - `traefik.ingress.kubernetes.io/router.entrypoints: websecure`
  - `traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt`
  - `traefik.ingress.kubernetes.io/router.middlewares: localIp@file`

### 5. Security
- **Cloudflare Tunnel**: Securely exposes internal services without port forwarding.
- **CrowdSec**: Intrusion detection and banning.
- **Traefik Middlewares**: IP filtering, rate limiting, and authentication.
- **Cloudflare Turnstile**: CAPTCHA protection for public-facing services.

## Architecture Diagrams

### 0. High-Level Network Architecture (Architecture Beta)

```mermaid
%%{init: {'theme': 'neutral', 'themeVariables': {
  'primaryColor': '#f0f0f0',
  'primaryBorderColor': '#333333',
  'primaryTextColor': '#333333',
  'lineColor': '#333333',
  'tertiaryColor': '#e67e22'
}}}%%
architectureBeta
    %% External Layer
    box "Internet" #f9f9f9
        component Cloudflare["Cloudflare\n(CDN/DNS)"] #f9f9f9
        component DuckDNS["DuckDNS\n(DDNS)"] #f9f9f9
    end

    %% External Gateway
    box "External Gateway" #e6e6e6
        component Livebox["Livebox\n(NAT/Firewall)"] #e6e6e6
    end

    %% Internal Layer
    box "Internal Network\n(192.168.1.0/24)" #d4d4d4
        %% DNS Layer
        box "DNS" #ffff99
            component PiHole1["Pi-hole\n(pi1)"] #ffff99
            component PiHole3["Pi-hole\n(pi3)"] #ffff99
            component StepCA["Step CA\n(pi1)"] #ccccff
        end

        %% k3s Layer
        box "k3s Cluster" #ff9999
            component Traefik["Traefik\n(Ingress)"] #ff9999
            component CrowdSec["CrowdSec\n(Security)"] #ff9999
            component Gitea["Gitea\n(pi2)"] #ffcc99
            component Vault["Vault\n(Secrets)"] #ccccff
        end
    end

    %% Connections
    Cloudflare --> Livebox : "DNS"
    DuckDNS --> Livebox : "DDNS"
    Livebox --> PiHole1 : "NAT"
    Livebox --> PiHole3 : "NAT"
    Livebox --> Traefik : "NAT"
    PiHole1 --> StepCA : "Forward .lab"
    PiHole1 --> Cloudflare : "Forward External"
    PiHole3 --> StepCA : "Forward .lab"
    PiHole3 --> Cloudflare : "Forward External"
    Traefik --> Cloudflare : "TLS (Let's Encrypt)"
    Traefik --> StepCA : "TLS (Step CA)"
    CrowdSec --> Traefik : "Ban IPs"
    Traefik --> Gitea : "Route"
    Traefik --> Vault : "Route"
```

### 1. High-Level Network Architecture

```mermaid
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#333333', 'edgeLabelBackground':'#f0f0f0', 'tertiaryColor': '#f89136'}}}%%
graph TD
    %% Styles
    classDef internet fill:#f9f9f9,stroke:#999,color:#333;
    classDef external fill:#e6e6e6,stroke:#555,color:#333;
    classDef internal fill:#d4d4d4,stroke:#777,color:#333;
    classDef security fill:#ff9999,stroke:#cc0000,color:#333;
    classDef dns fill:#ffff99,stroke:#cccc00,color:#333;
    classDef ca fill:#ccccff,stroke:#6666cc,color:#333;

    %% Internet
    subgraph "Internet"
        Cloudflare["Cloudflare (CDN/DNS)"]:::internet
        DuckDNS["DuckDNS (DDNS)"]:::internet
    end

    %% External Gateway
    subgraph "External Gateway"
        Livebox["Livebox (NAT/Firewall)"]:::external
    end

    %% Internal Network
    subgraph "Internal Network (192.168.1.0/24)"
        %% Pi-hole DNS
        PiHole1["Pi-hole (pi1)"]:::dns
        PiHole3["Pi-hole (pi3)"]:::dns

        %% Step CA
        StepCA["Step CA (pi1)"]:::ca

        %% k3s Cluster
        k3s["k3s Cluster"]:::internal
        Traefik["Traefik (k3s)"]:::internal
        CrowdSec["CrowdSec (k3s)"]:::security

        %% Services
        Gitea["Gitea (pi2)"]:::internal
        Vault["Vault (k3s)"]:::internal
    end

    %% Connections
    Cloudflare -->|DNS| Livebox
    DuckDNS -->|DDNS| Livebox
    Livebox -->|NAT| PiHole1
    Livebox -->|NAT| PiHole3
    Livebox -->|NAT| k3s

    %% Internal DNS
    PiHole1 -->|Forward .lab| StepCA
    PiHole1 -->|Forward External| Cloudflare
    PiHole3 -->|Forward .lab| StepCA
    PiHole3 -->|Forward External| Cloudflare

    %% Ingress
    Traefik -->|"TLS (Let's Encrypt)"| Cloudflare
    Traefik -->|"TLS (Step CA)"| StepCA
    CrowdSec -->|Ban IPs| Traefik

    %% Service Access
    Traefik -->|Route| Gitea
    Traefik -->|Route| Vault
```

### 2. DNS Resolution Flow

```mermaid
sequenceDiagram
    participant Client
    participant PiHole
    participant StepCA
    participant Cloudflare
    participant ExternalDNS

    Client->>PiHole: Query example.lab
    PiHole->>StepCA: Forward .lab query
    StepCA-->>PiHole: Return A record
    PiHole-->>Client: Return response

    Client->>PiHole: Query example.com
    PiHole->>Cloudflare: Forward to 1.1.1.1
    Cloudflare->>ExternalDNS: Resolve externally
    ExternalDNS-->>Cloudflare: Return response
    Cloudflare-->>PiHole: Return response
    PiHole-->>Client: Return response
```

### 3. Ingress and TLS Flow

```mermaid
sequenceDiagram
    participant User
    participant Cloudflare
    participant Traefik
    participant LetsEncrypt as Let's Encrypt
    participant StepCA
    participant Service

    User->>Cloudflare: HTTPS Request (webapp.arcodange.fr)
    Cloudflare->>Traefik: Forward to internal IP
    Traefik->>LetsEncrypt: Request Certificate
    LetsEncrypt-->>Traefik: Issue Certificate
    Traefik->>Service: Route request
    Service-->>Traefik: Return response
    Traefik-->>Cloudflare: Return HTTPS response
    Cloudflare-->>User: Return response

    User->>Traefik: HTTPS Request (webapp.arcodange.lab)
    Traefik->>StepCA: Request Certificate
    StepCA-->>Traefik: Issue Certificate
    Traefik->>Service: Route request
    Service-->>Traefik: Return response
    Traefik-->>User: Return HTTPS response
```

### 4. Security Flow (CrowdSec + Traefik)

```mermaid
sequenceDiagram
    participant Attacker
    participant Traefik
    participant CrowdSec
    participant BannedIPs

    Attacker->>Traefik: Malicious Request
    Traefik->>CrowdSec: Log suspicious activity
    CrowdSec->>BannedIPs: Add IP to ban list
    BannedIPs-->>Traefik: Update middleware
    Traefik-->>Attacker: Block request (403)
```

## Playbook and Role Analysis

### 1. Pi-hole Deployment
- **Playbook**: `playbooks/system/pihole.yml`
- **Role**: `arcodange.factory.pihole`
- **Configuration**:
  - Upstream DNS: Cloudflare (1.1.1.1) and Step CA for `.lab`.
  - Blocklists: Ad-blocking and malware domains.

### 2. Step CA Deployment
- **Playbook**: `playbooks/ssl/ssl.yml`
- **Role**: `step_ca`
- **Configuration**:
  - Internal CA for `.lab` domain.
  - Short-lived certificates (default: 24h).

### 3. Traefik Deployment
- **Playbook**: `playbooks/system/system_k3s.yml` (via k3s)
- **Helm Chart**: `traefik` (installed via k3s)
- **Key Annotations**:
```yaml
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
traefik.ingress.kubernetes.io/router.middlewares: localIp@file
```

### 4. CrowdSec Deployment
- **Playbook**: `playbooks/tools/crowdsec.yml`
- **Role**: `arcodange.factory.crowdsec`
- **Configuration**:
  - Bouncer integration with Traefik.
  - Custom scenarios for brute-force and bot detection.

## Consequences

### Positive
- **Resilient DNS**: Pi-hole provides ad-blocking and internal DNS resolution.
- **Secure TLS**: Step CA for internal services, Let's Encrypt for external.
- **DDoS Protection**: Cloudflare absorbs external attacks.
- **Intrusion Detection**: CrowdSec bans malicious IPs automatically.

### Negative
- **Complexity**: Multiple layers require careful configuration.
- **Single Point of Failure**: Pi-hole is critical for internal DNS.
- **Certificate Management**: Step CA requires maintenance for `.lab` domain.

## Alternatives Considered

### Alternative 1: Public DNS for `.lab`
- **Rejected**: Exposing internal domains is a security risk.

### Alternative 2: No Ad-Blocking
- **Rejected**: Pi-hole provides essential security and privacy.

### Alternative 3: Self-Signed Certificates
- **Rejected**: Step CA provides better usability with short-lived certs.

### 5. Cloudflare Turnstile + CrowdSec Flow

```mermaid
sequenceDiagram
    participant User
    participant Cloudflare
    participant Turnstile
    participant Traefik
    participant CrowdSec
    participant BannedIPs

    User->>Cloudflare: Request protected endpoint
    Cloudflare->>Turnstile: Challenge (CAPTCHA)
    Turnstile-->>Cloudflare: Return token
    Cloudflare->>Traefik: Forward request with token

    alt Valid Token
        Traefik->>Service: Route request
        Service-->>Traefik: Return response
        Traefik-->>Cloudflare: Return response
        Cloudflare-->>User: Return success
    else Invalid Token
        Traefik->>CrowdSec: Log suspicious activity
        CrowdSec->>BannedIPs: Add IP to ban list
        BannedIPs-->>Traefik: Update middleware
        Traefik-->>Cloudflare: Block request (403)
        Cloudflare-->>User: Return "Access Denied"
    end
```

## Deep Dive: `.lab` Domain SSL/TLS Architecture

### Overview
The `.lab` domain relies on a **zero-trust internal PKI** (Public Key Infrastructure) powered by **Step CA**, integrated with **k3s**, **Traefik**, and **cert-manager**. This section details the components, interactions, and operational workflows.

### Core Components

#### 1. **Step CA (Certificate Authority)**
- **Host**: `pi1` (primary), with standby nodes for resilience.
- **Ports**: `8443` (HTTPS), `443` (ACME).
- **Provisioners**:
  - `cert-manager`: Dedicated for k3s workloads.
  - `admin`: For manual certificate issuance.
- **Certificate Lifecycle**:
  - **Short-lived certificates** (default: 24h).
  - **Automatic renewal** via cert-manager.
  - **OCSP stapling** for revocation checks.

#### 2. **cert-manager**
- **Namespace**: `cert-manager`.
- **CRDs**:
  - `Certificate`: Defines desired certificates.
  - `CertificateRequest`: Requests signed by Step CA.
  - `ClusterIssuer`/`Issuer`: References Step CA.
  - `StepClusterIssuer`: Custom resource for Step CA integration.

#### 3. **StepClusterIssuer**
- **Purpose**: Bridges cert-manager with Step CA.
- **Configuration**:
```yaml
apiVersion: certmanager.step.sm/v1beta1
kind: StepClusterIssuer
metadata:
  name: step-issuer
  namespace: cert-manager
spec:
  url: "https://ssl-ca.arcodange.lab:8443"
  caBundle: "<base64-encoded-root-ca>"
  provisioner:
    name: cert-manager
    kid: "<key-id>"
    passwordRef:
      name: step-jwk-password
      key: password
```
- **Workflow**:
  1. cert-manager creates a `CertificateRequest`.
  2. `StepClusterIssuer` forwards the request to Step CA.
  3. Step CA signs the certificate and returns it to cert-manager.
  4. cert-manager stores the certificate in a Kubernetes `Secret`.
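
That chain can be inspected end to end with standard kubectl/openssl commands; a sketch using the example resource names from this section:

```bash
kubectl -n cert-manager get stepclusterissuer step-issuer
kubectl -n kube-system get certificate,certificaterequest
kubectl -n kube-system get secret wildcard-arcodange-lab-tls -o jsonpath='{.data.tls\.crt}' \
  | base64 -d | openssl x509 -noout -subject -enddate
```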

#### 4. **Traefik Ingress Controller**
- **Namespace**: `kube-system`.
- **TLS Configuration**:
  - **EntryPoints**: `websecure` (HTTPS), `web` (HTTP → redirect).
  - **Certificate Resolvers**:
    - `letsencrypt`: For `.arcodange.fr` (public).
    - `step-ca`: For `.lab` (internal).
  - **Middlewares**:
    - `localIp@file`: IP allowlisting.
    - `crowdsec-bouncer`: Intrusion prevention.

#### 5. **Certificate and CertificateRequest**
- **Example `Certificate` for `.lab`**:
```yaml
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: wildcard-arcodange-lab
  namespace: kube-system
spec:
  secretName: wildcard-arcodange-lab-tls
  issuerRef:
    name: step-issuer
    kind: StepClusterIssuer
    group: certmanager.step.sm
  dnsNames:
    - "*.arcodange.lab"
    - "arcodange.lab"
```
- **Generated `CertificateRequest`**:
  - Automatically created by cert-manager.
  - References the `StepClusterIssuer`.
  - Status transitions: `Pending` → `Approved` → `Ready`.

#### 6. **k3s Cluster Integration**
- **Nodes**: `pi1` (control plane), `pi2`, `pi3` (workers).
- **Storage**: Longhorn for persistent volumes.
- **Networking**:
  - **CNI**: Flannel.
  - **Service Mesh**: Traefik for ingress, Linkerd (optional).

### Workflow: Certificate Issuance and Renewal

```mermaid
sequenceDiagram
    participant App as Application (e.g., Gitea)
    participant Cert as Certificate
    participant CR as CertificateRequest
    participant SCI as StepClusterIssuer
    participant StepCA as Step CA
    participant Secret as Kubernetes Secret
    participant Traefik as Traefik

    App->>Cert: Declare desired certificate
    Cert->>CR: Create CertificateRequest
    CR->>SCI: Forward to StepClusterIssuer
    SCI->>StepCA: Sign CSR (via JWK provisioner)
    StepCA-->>SCI: Return signed certificate
    SCI->>Secret: Store certificate/key
    Secret-->>Traefik: Mount as TLS secret
    Traefik->>App: Route traffic with TLS

    loop Every 2/3 of certificate lifetime
        Cert->>CR: Trigger renewal
        CR->>SCI: Re-sign CSR
        SCI->>StepCA: Request new certificate
        StepCA-->>SCI: Return signed certificate
        SCI->>Secret: Update secret
    end
```

### Device Trust: Adding `.lab` CA to External Devices

#### **Manual Trust Installation**
1. **Export Root CA**:
```bash
scp pi1:/home/step/.step/certs/root_ca.crt ./arcodange-lab-ca.crt
```
2. **Install on Devices**:
- **macOS**:
```bash
sudo security add-trusted-cert -d -r trustRoot -k /Library/Keychains/System.keychain ./arcodange-lab-ca.crt
```
- **Linux (Debian/Ubuntu)**:
```bash
sudo cp arcodange-lab-ca.crt /usr/local/share/ca-certificates/
sudo update-ca-certificates
```
- **Windows**:
  - Import via `certmgr.msc` → **Trusted Root Certification Authorities**.
- **Android/iOS**:
  - Email the `.crt` and install via device settings.
- **Raspberry Pi**:
```bash
sudo cp arcodange-lab-ca.crt /etc/ssl/certs/
sudo update-ca-certificates
```

#### **Automated Trust via Ansible**
- **Playbook**: `playbooks/system/trust_ca.yml`
- **Role**: `arcodange.factory.trust_ca`
- **Targets**: All nodes in `raspberries` group.

### Troubleshooting Common Issues

#### 1. **Certificate Not Issued**
- **Symptoms**: `CertificateRequest` stuck in `Pending`.
- **Causes**:
  - Step CA unreachable.
  - Incorrect `caBundle` or provisioner `kid`.
  - Network policies blocking egress to Step CA.
- **Fixes**:
```bash
# Check StepClusterIssuer status
kubectl -n cert-manager describe stepclusterissuer step-issuer

# Verify Step CA connectivity
kubectl -n cert-manager logs -l app.kubernetes.io/name=step-issuer

# Test Step CA manually
step ca certificate --ca-url https://ssl-ca.arcodange.lab:8443 \
  --root /home/step/.step/certs/root_ca.crt \
  test.lab test.crt test.key
```

#### 2. **Traefik TLS Errors**
- **Symptoms**: `502 Bad Gateway` or TLS handshake failures.
- **Causes**:
  - Missing certificate in `Secret`.
  - Incorrect SNI routing.
  - Expired certificates.
- **Fixes**:
```bash
# Check Traefik logs
kubectl -n kube-system logs -l app.kubernetes.io/name=traefik

# Verify certificate secret
kubectl -n kube-system get secret wildcard-arcodange-lab-tls -o yaml

# Restart Traefik
kubectl -n kube-system rollout restart deployment/traefik
```

#### 3. **Device Trust Issues**
- **Symptoms**: Browser warnings (`NET::ERR_CERT_AUTHORITY_INVALID`).
- **Causes**:
  - CA not installed in device trust store.
  - Clock skew (certificate validity).
- **Fixes**:
  - Reinstall CA certificate.
  - Sync device clock with NTP:
```bash
sudo ntpdate pool.ntp.org
```

### Security Considerations

#### 1. **Provisioner Security**
- **JWK Provisioner**: Encrypted with a password stored in Kubernetes `Secret`.
- **Password Rotation**:
```bash
# Rotate JWK password via Ansible
ansible-playbook playbooks/ssl/rotate_jwk_password.yml
```

#### 2. **Certificate Revocation**
- **OCSP**: Step CA supports Online Certificate Status Protocol.
- **Manual Revocation**:
```bash
step ca revoke <serial> --reason superseded
```

#### 3. **Network Isolation**
- **Step CA Access**: Restricted to k3s cluster IPs via firewall rules.
- **Traefik Middlewares**: Enforce IP allowlisting for internal services.

### Future Enhancements

1. **Automated Device Onboarding**:
   - MDM (Mobile Device Management) integration for CA trust.
   - Ansible playbook for bulk device enrollment.

2. **Step CA High Availability**:
   - Multi-node Step CA with RAFT consensus.
   - Automatic failover for provisioners.

3. **Certificate Transparency**:
   - Log all `.lab` certificates to a private CT log.

4. **Short-Lived Certificates**:
   - Reduce default TTL to 1h for critical services.

### References

- [Step CA Documentation](https://smallstep.com/docs/step-ca/)
- [cert-manager Step Issuer](https://smallstep.com/docs/step-certificates/kubernetes/)
- [Traefik TLS Configuration](https://doc.traefik.io/traefik/https/tls/)
@@ -0,0 +1,126 @@
# ADR 20260414: Internal DNS Architecture

## Status
Accepted

## Context

During the 2026-04-13 power cut incident, cluster recovery was blocked by DNS resolution failures. The investigation revealed:

1. **CoreDNS forwarding loop**: CoreDNS was configured to forward queries to `/etc/resolv.conf`, which on the node (pi3) pointed to itself (`192.168.1.203`), a host without a running DNS service
2. **Pi-hole HA misconfiguration**: Both pi1 and pi3 run Pi-hole (pihole-FTL) but:
   - pi1's `dnsmasq` service was in a **failed state** due to missing `dip` group membership
   - pi3's Pi-hole was running but CoreDNS couldn't reach it due to the forwarding configuration
3. **No explicit upstream DNS**: Pi-hole instances lacked explicitly configured upstream DNS servers

The cluster's HelmChart controller requires external DNS resolution to fetch charts from `charts.longhorn.io`, making DNS a critical dependency for storage provisioning and thus the entire cluster recovery process.

## Decision

### 1. DNS Service Hierarchy

```
┌─────────────────┐     ┌─────────────────┐
│  CoreDNS Pod    │────▶│  Pi-hole (pi1)  │──┐
│  (kube-system)  │     │  Pi-hole (pi3)  │  │
└─────────────────┘     └─────────────────┘  │
                                             ▼
                                      ┌──────────────┐
                                      │   8.8.8.8    │
                                      │   1.1.1.1    │
                                      │   8.8.4.4    │
                                      └──────────────┘
```

### 2. CoreDNS Configuration

CoreDNS will forward **all non-cluster DNS queries** to **both Pi-hole instances** in HA configuration:

```coredns
.:53 {
    errors
    health
    ready
    kubernetes cluster.local in-addr.arpa ip6.arpa {
        pods insecure
        fallthrough in-addr.arpa ip6.arpa
    }
    hosts /etc/coredns/NodeHosts {
        ttl 60
        reload 15s
        fallthrough
    }
    prometheus :9153
    cache 30
    loop
    reload
    import /etc/coredns/custom/*.override
    import /etc/coredns/custom/*.server
    forward . 192.168.1.201:53 192.168.1.203:53
}
```

### 3. Pi-hole HA Configuration

- **Primary**: pi1 (192.168.1.201)
- **Secondary**: pi3 (192.168.1.203)
- **Synchronization**: Gravity Sync for configuration consistency
- **Upstream DNS**: Explicitly configured to Cloudflare (1.1.1.1) and Google (8.8.8.8, 8.8.4.4)

### 4. Pi-hole DNS Service Fix

The `dnsmasq` user must be a member of the `dip` group to bind to privileged port 53:

```bash
usermod -aG dip dnsmasq
```

This is managed via Ansible in `playbooks/system/rpi.yml`.
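
A quick spot-check on a Pi-hole node after the playbook runs (read-only commands):

```bash
id dnsmasq                      # confirm the dip group membership
sudo ss -lntup | grep ':53 '    # show which process is bound to port 53
systemctl status pihole-FTL --no-pager
```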

## Consequences

### Positive
- **Resilience**: DNS resolution continues if one Pi-hole node fails
- **Consistency**: Both Pi-hole instances maintain synchronized configuration via Gravity Sync
- **Recovery**: Cluster can recover from power failures without manual DNS intervention
- **Explicit configuration**: Upstream DNS servers are explicitly defined, avoiding reliance on DHCP-provided config

### Negative
- **Complexity**: Additional Ansible tasks required to maintain DNS infrastructure
- **Dependency**: Cluster recovery depends on Pi-hole availability (mitigated by HA)

## Implementation

See related changes in:
- `playbooks/system/rpi.yml` - dnsmasq group membership fix
- `playbooks/dns/k3s_dns.yml` - CoreDNS forwarding to HA Pi-hole instances
- `playbooks/dns/roles/pihole/defaults/main.yml` - Explicit upstream DNS configuration

## Post-Implementation Notes

### Issue Encountered: dnsmasq vs pihole-FTL Port Conflict

During execution, we discovered that **dnsmasq** and **pihole-FTL** both attempt to bind to port 53. On pi1:
- pihole-FTL was running and handling DNS on port 53
- dnsmasq service was failing because port 53 was already in use

**Resolution**: The dnsmasq service on Pi-hole nodes is **not needed** when pihole-FTL is running, as pihole-FTL includes its own DNS server (dnsmasq) internally. The system dnsmasq service should remain **disabled** on Pi-hole nodes to avoid conflicts.

### Verification Commands

Check DNS resolution from cluster:
```bash
kubectl run dns-test --image=busybox:1.28 -it --rm --restart=Never -- \
  nslookup charts.longhorn.io 192.168.1.201

# Check CoreDNS forward to both Pi-holes
kubectl get cm -n kube-system coredns -o yaml

# Check Pi-hole instances
ssh pi1 "dig @127.0.0.1 google.com +short"
ssh pi3 "dig @127.0.0.1 google.com +short"
```

## Related Incidents

- [2026-04-13-power-cut](../incidents/2026-04-13-power-cut/README.md) - Power cut caused DNS resolution failure, blocking Longhorn reinstall and Traefik recovery
@@ -0,0 +1,550 @@
# ADR 20260414: Longhorn PVC Recovery When Reinstalled

---

## 📋 **Executive Summary**

After the April 13, 2026 power cut incident and subsequent cluster recovery, we discovered a **critical gap** in Longhorn volume restoration. While the **raw replica data files** (`volume-head-*.img`) remain intact on disk across all nodes, Longhorn cannot automatically **re-associate** them with new Volume CRDs due to its internal engine ID naming scheme. This document explains the problem and compares the recovery approaches.

---

## 🔍 **The Root Problem**

### **What Happened**

1. **Power cut** → Longhorn CSI driver lost connection
2. **Force-deletion of Longhorn pods** → Webhook circular dependency
3. **Nuclear cleanup** → All Longhorn CRDs (Volume, Engine, Replica) were deleted
4. **Reinstallation** → New Volume CRDs created with new engine IDs

### **Directory Structure Issue**

Longhorn stores replica data in directories named by **volume name + engine ID**:
```
/mnt/arcodange/longhorn/replicas/
├── pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-cd16e459/   # ← OLD (orphaned)
│   ├── volume-head-002.img                              # ← Actual Traefik data (128Mi)
│   ├── volume-head-002.img.meta
│   └── volume-snap-*.img
│
├── pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4/   # ← NEW (empty)
│   ├── volume-head-002.img                              # ← Empty 128Mi
│   └── volume-head-002.img.meta
└── ...
```

**The Problem:** When you recreate a Volume CRD, Longhorn generates a **new engine ID** (e.g., `8c7d8ab4`), creating a **new empty directory** instead of adopting the existing one (`cd16e459`).

### **Why This Matters**

| Component | Persistence | Recovery Path |
|-----------|-------------|---------------|
| **Replica `.img` files** | ✅ **Survives** on disk | Manual intervention required |
| **Volume CRD** | ❌ **Deleted** | Must recreate |
| **Engine/Replica CRDs** | ❌ **Deleted** | Auto-recreated by Longhorn |
| **Engine ID** | ❌ **Changes** | **Cannot be recovered without backup** |

**Without the original Volume CRD backup, Longhorn cannot match orphaned replica directories to new Volume CRDs.**
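
One cheap mitigation going forward is to snapshot the Longhorn custom resources while the cluster is healthy, so the old engine IDs and replica paths can still be looked up after a rebuild. A sketch; the backup path is illustrative:

```bash
# Dump the Longhorn CRs that carry the volume/engine/replica naming before any incident
kubectl get volumes.longhorn.io,engines.longhorn.io,replicas.longhorn.io \
  -n longhorn-system -o yaml > /mnt/arcodange/longhorn-crd-backup-$(date +%F).yaml
```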
|
||||
|
||||
---
|
||||
---
|
||||
|
||||
## 🎯 **Recovery Methods Comparison**
|
||||
|
||||
| Method | Complexity | Data Safety | Downtime | Best For |
|
||||
|--------|------------|-------------|----------|----------|
|
||||
| **[A: Manual `dd` Copy](#method-a-manual-dd-copy)** | ⭐⭐⭐⭐ | ✅✅✅✅ | Medium | Critical data, no app backup |
|
||||
| **[B: Directory Rename](#method-b-directory-rename)** | ⭐⭐⭐ | ✅✅ | Low | Small volumes, no Rebuilding replicas |
|
||||
| **[C: Fresh Volume + App Restore](#method-c-fresh-volume--app-restore)** | ⭐⭐ | ✅✅✅✅✅ | Low | Non-critical data, app backups exist |
|
||||
| **[D: Block-Device Injection (Automated)](#method-d-block-device-injection-automated)** | ⭐⭐⭐ | ✅✅✅✅ | Medium | **Recommended — any volume, no dir swap needed** |
|
||||
| **[E: Longhorn Google Storage Restore](#method-e-longhorn-google-storage-restore)** | ⭐⭐ | ✅✅✅✅✅ | Low | Volumes with Longhorn backup configured |
|
||||
|
||||
**Method B was proven risky** (2026-04-13 recovery): Longhorn reconciliation finds `Dirty: true`
|
||||
metadata + a clean empty pi1 replica → silently rebuilds from the empty source, destroying data.
|
||||
Use Method D for any volume larger than ~128Mi or with Rebuilding replicas.
|
||||
|
||||
---
|
||||
---
|
||||
|
||||
## 🛠️ **Method A: Manual `dd` Copy**
|
||||
|
||||
### **Concept**
|
||||
Manually copy the data from the orphaned `.img` file to the new replica directory that Longhorn created for the new Volume CRD.
|
||||
|
||||
### **Prerequisites**
|
||||
- Root access to all nodes
|
||||
- Volume CRD already recreated (with new engine ID)
|
||||
- Longhorn has created new empty replica directories
|
||||
- `dd` and `qemu-img` tools available
|
||||
|
||||
### **Steps**
|
||||
|
||||
```bash
|
||||
# 1. Identify source (old data) and destination (new empty)
|
||||
SOURCE_NODE=pi2
|
||||
SOURCE_DIR=/mnt/arcodange/longhorn/replicas/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-cd16e459
|
||||
SOURCE_IMG=$(ssh $SOURCE_NODE "ls $SOURCE_DIR/volume-head-*.img | head -1")
|
||||
|
||||
DEST_DIRS=(
|
||||
pi1:/mnt/arcodange/longhorn/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4
|
||||
pi2:/mnt/arcodange/longhorn/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4
|
||||
pi3:/mnt/arcodange/longhorn/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4
|
||||
)
|
||||
|
||||
# 2. Copy data to each node
|
||||
for DEST in "${DEST_DIRS[@]}"; do
|
||||
NODE=${DEST%%:*}
|
||||
PATH=${DEST#*:}
|
||||
ssh $NODE "sudo mkdir -p $PATH && sudo dd if=$SOURCE_IMG of=$PATH/volume-head-002.img bs=4M"
|
||||
done
|
||||
|
||||
# 3. Restart Longhorn engine pods to pick up new data
|
||||
kubectl delete pod -n longhorn-system -l longhorn.io/component=engine
|
||||
|
||||
# 4. Verify data is accessible
|
||||
kubectl get volume -n longhorn-system pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90
|
||||
# Should show: state=attached, robustness=healthy
|
||||
```
|
||||
|
||||
### **Pros**
|
||||
- ✅ Guaranteed data recovery
|
||||
- ✅ Works for any volume size
|
||||
- ✅ Preserves all snapshots and metadata
|
||||
|
||||
### **Cons**
|
||||
- ⚠️ Requires manual intervention on each node
|
||||
- ⚠️ Must know source and destination paths
|
||||
- ⚠️ Risk of data corruption if `dd` fails mid-copy
|
||||
- ⚠️ Volume must be in detached state during copy
|
||||
|
||||
### **Risk Mitigation**
|
||||
- Verify checksums after copy: `sha256sum /path/to/image.img`
|
||||
- Copy to one node at a time, verify between each
|
||||
- Use `pv` for progress: `pv $SOURCE_IMG | ssh $NODE "sudo dd of=$PATH/volume-head-002.img bs=4M"`
|
||||
|
||||
---
|
||||
---
|
||||
|
||||
## 🏷️ **Method B: Directory Rename**
|
||||
|
||||
### **Concept**
|
||||
Rename the orphaned replica directory to match the **engine ID** that Longhorn expects for the new Volume CRD.
|
||||
|
||||
### **Prerequisites**
|
||||
- Volume CRD already recreated
|
||||
- Longhorn has created engine CRDs (check: `kubectl get engines -n longhorn-system`)
|
||||
- Must act quickly before Longhorn initializes new empty replicas
|
||||
|
||||
### **Steps**
|
||||
|
||||
```bash
|
||||
# 1. Find the new engine ID for the volume
|
||||
ENGINE=$(kubectl get engines -n longhorn-system -l longhorn.io/volume=pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 -o jsonpath='{.items[0].metadata.name}')
|
||||
# Example: pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-e-0
|
||||
ENGINE_ID=${ENGINE#*-} # Extract suffix: e-0
|
||||
# But the directory uses a different format...
|
||||
|
||||
# 2. Check actual directory names
|
||||
kubectl get replicas -n longhorn-system | grep pvc-cc8a
|
||||
# Output: pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-r-8c7d8ab4
|
||||
|
||||
# 3. Rename on the node where orphaned data exists
|
||||
NEW_DIR_SUFFIX=$(kubectl get replicas -n longhorn-system pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-r-8c7d8ab4 -o jsonpath='{.metadata.labels.longhorn\.io/last-attached-node}')
|
||||
ssh $NEW_DIR_SUFFIX "sudo mv /mnt/arcodange/longhorn/replicas/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-cd16e459 \
|
||||
/mnt/arcodange/longhorn/replicas/pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90-8c7d8ab4"
|
||||
|
||||
# 4. Restart the replica pod
|
||||
kubectl delete pod -n longhorn-system $(kubectl get pods -n longhorn-system -o jsonpath='{.items[?(@.metadata.labels.longhorn\.io/replica)=pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90].metadata.name}')
|
||||
```
|
||||
|
||||
### **Pros**
|
||||
- ✅ Fastest method
|
||||
- ✅ No data copying required
|
||||
- ✅ Preserves all existing data and snapshots
|
||||
|
||||
### **Cons**
|
||||
- ⚠️ **High risk of mismatch** - wrong directory rename = data loss
|
||||
- ⚠️ Must identify the correct new replica directory name for each node
|
||||
- ⚠️ Replica directories exist on multiple nodes - must rename on ALL
|
||||
- ⚠️ Longhorn may have already initialized new empty replicas
|
||||
|
||||
### **Critical Warning**
|
||||
**Each volume has replicas on ALL nodes.** You must:
|
||||
1. Identify which node has which orphaned directory
|
||||
2. Rename each to match the corresponding new replica's expected path
|
||||
3. Ensure consistency across all nodes
|
||||
|
||||
**Example for pvc-cc8a:**
|
||||
```bash
|
||||
# Orphaned dirs:
|
||||
# pi2: pvc-cc8a...-cd16e459
|
||||
# pi3: pvc-cc8a...-011b54b3
|
||||
|
||||
# New replica directory names (from kubectl get replicas):
|
||||
# pi1: pvc-cc8a...-r-8c7d8ab4
|
||||
# pi2: pvc-cc8a...-r-32aa3e1e
|
||||
# pi3: pvc-cc8a...-r-3e84c460
|
||||
|
||||
# Must rename EACH orphaned dir to match new engine on SAME node
|
||||
```
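
A hedged helper for building that per-node mapping, assuming SSH access to the pi nodes and the default data path used elsewhere in this document:

```bash
# Hedged sketch: list expected replica names (from the CRDs) next to the directories present on each node
VOLUME=pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90
kubectl get replicas -n longhorn-system -o wide | grep "$VOLUME"   # expected replica names and their nodes
for NODE in pi1 pi2 pi3; do
  echo "== $NODE =="
  ssh "$NODE" "ls -d /mnt/arcodange/longhorn/replicas/${VOLUME}-* 2>/dev/null || echo 'no directories'"
done
```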
|
||||
|
||||
---
|
||||
---
|
||||
|
||||
## 🆕 **Method C: Fresh Volume + App Restore** *(Recommended for Traefik)*
|
||||
|
||||
### **Concept**
|
||||
1. Let Longhorn create a **new empty volume** for the PVC
|
||||
2. Restore the **application data** (Traefik's `acme.json`) from application-level backups
|
||||
|
||||
### **Prerequisites**
|
||||
- Application-level backup exists (e.g., Traefik config, certificates)
|
||||
- Data is non-critical or easily restorable
|
||||
- Storage requirements are small (128Mi for Traefik)
|
||||
|
||||
### **Steps**
|
||||
|
||||
```bash
|
||||
# 1. Delete the problematic Volume CRD (if any)
|
||||
kubectl delete volume -n longhorn-system pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 --ignore-not-found
|
||||
|
||||
# 2. Delete the PVC
|
||||
kubectl delete pvc -n kube-system traefik
|
||||
|
||||
# 3. Let StorageClass provision a fresh volume
|
||||
kubectl apply -f - <<EOF
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: traefik
|
||||
namespace: kube-system
|
||||
spec:
|
||||
accessModes: [ReadWriteOnce]
|
||||
resources: {requests: {storage: 128Mi}}
|
||||
storageClassName: longhorn
|
||||
volumeMode: Filesystem
|
||||
EOF
|
||||
|
||||
# 4. Wait for PV to be provisioned
|
||||
kubectl wait --for=jsonpath='{.status.phase}'=Bound pvc -n kube-system traefik
|
||||
|
||||
# 5. Restore Traefik data from backup
|
||||
BACKUP_FILE="/path/to/traefik-backup/acme.json"
|
||||
kubectl cp $BACKUP_FILE kube-system/traefik-XXXXXX-XXXX:/data/acme.json
|
||||
kubectl exec -n kube-system traefik-XXXXXX-XXXX -- chown 65532:65532 /data/acme.json
|
||||
kubectl exec -n kube-system traefik-XXXXXX-XXXX -- chmod 600 /data/acme.json
|
||||
```
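
The `traefik-XXXXXX-XXXX` placeholders can be resolved at run time. A minimal sketch, assuming the standard `app.kubernetes.io/name=traefik` label (adjust the selector to whatever your Traefik deployment actually uses):

```bash
# Hedged sketch: look up the Traefik pod name instead of hard-coding it
TRAEFIK_POD=$(kubectl get pods -n kube-system -l app.kubernetes.io/name=traefik -o jsonpath='{.items[0].metadata.name}')
kubectl cp "$BACKUP_FILE" kube-system/"$TRAEFIK_POD":/data/acme.json
kubectl exec -n kube-system "$TRAEFIK_POD" -- chown 65532:65532 /data/acme.json
kubectl exec -n kube-system "$TRAEFIK_POD" -- chmod 600 /data/acme.json
```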
|
||||
|
||||
### **Traefik-Specific Recovery**
|
||||
|
||||
For Traefik, the critical data is:
|
||||
- `/data/acme.json` - TLS certificates obtained from Let's Encrypt
|
||||
- `/data/tls.yml` - (if used)
|
||||
- Secrets in Kubernetes (separate from PVC)
|
||||
|
||||
**Backup locations to check:**
|
||||
```bash
|
||||
# Check if we have Traefik data backups
|
||||
ssh pi1 "ls -la /home/pi/arcodange/backups/traefik/ 2>/dev/null || echo 'No backup found'"
|
||||
|
||||
# Check ArgoCD apps (if Traefik was deployed via GitOps)
|
||||
kubectl get app -n argocd | grep traefik
|
||||
```
|
||||
|
||||
### **Pros**
|
||||
- ✅ **Simplest and safest** method
|
||||
- ✅ No risk of Longhorn directory mismatches
|
||||
- ✅ Works even without Longhorn CRD backups
|
||||
- ✅ Verifiable - you can confirm data was restored
|
||||
- ✅ Clean state - no orphaned directories
|
||||
|
||||
### **Cons**
|
||||
- ⚠️ Requires application-level backups
|
||||
- ⚠️ TLS certificates may have expired (need to re-issue)
|
||||
|
||||
---
|
||||
---
|
||||
|
||||
## 🏆 **Recommendation: Method C for Traefik**
|
||||
|
||||
### **Why Method C is Best for This Case**
|
||||
|
||||
| Factor | Assessment |
|
||||
|--------|------------|
|
||||
| **Volume Size** | 128Mi (small) |
|
||||
| **Data Criticality** | TLS certs can be re-generated |
|
||||
| **Backup Availability** | Likely exists in ArgoCD/Git |
|
||||
| **Complexity** | Low |
|
||||
| **Risk** | Minimal |
|
||||
| **Time Required** | ~5 minutes |
|
||||
|
||||
### **Data Loss Assessment for Traefik**
|
||||
|
||||
The **worst case** (no Traefik backup):
|
||||
- TLS certificates will be **re-issued** automatically by cert-manager + Let's Encrypt
|
||||
- No permanent data loss - certificates are ephemeral
|
||||
- Client impact: Brief TLS warning during re-issuance (~1-2 minutes)
|
||||
|
||||
**Verdict:** 🟢 **Method C is the safest and most practical approach.**
|
||||
|
||||
---
|
||||
|
||||
## 🔧 **Prevention: What We Must Fix**
|
||||
|
||||
### **1. Update Backup Playbook** (`playbooks/backup/k3s_pvc.yml`) ✅ Done 2026-04-16
|
||||
|
||||
`backup_cmd` now captures:
|
||||
1. All PersistentVolumes (PV)
|
||||
2. All PersistentVolumeClaims (PVC)
|
||||
3. **All Longhorn Volumes** (critical — enables fast restore via `kubectl apply` instead of block-device injection)
|
||||
4. All Longhorn Settings (backup target configuration)
|
||||
|
||||
### **2. Test Backups Regularly**
|
||||
|
||||
```bash
|
||||
# Monthly test: Restore a non-critical volume
|
||||
# Pick a test volume, delete it, restore from backup
|
||||
kubectl delete volume -n longhorn-system <test-volume>
|
||||
kubectl apply -f <backup-file>
|
||||
kubectl get volume -n longhorn-system <test-volume> -w
|
||||
```
|
||||
|
||||
### **3. Validate Backup Files**
|
||||
|
||||
```bash
|
||||
# Check backup contains Longhorn resources
|
||||
grep "longhorn.io/v1beta2" /path/to/backup-*.volumes
|
||||
grep "kind: Volume" /path/to/backup-*.volumes
|
||||
```
|
||||
|
||||
### **4. Document Recovery Procedure**
|
||||
|
||||
- [ ] Create `docs/admin/longhorn-recovery.md` with these steps
|
||||
- [ ] Add to team runbook
|
||||
- [ ] Include in incident response training
|
||||
|
||||
---
|
||||
|
||||
## 📊 **Test Scenario: Battle Testing PVC Recovery**
|
||||
|
||||
### **Test Setup**
|
||||
|
||||
```bash
|
||||
# 1. Create a test namespace
|
||||
kubectl create ns longhorn-test
|
||||
|
||||
# 2. Create a test PVC
|
||||
kubectl apply -f - <<EOF
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: test-longhorn-recovery
|
||||
namespace: longhorn-test
|
||||
labels:
|
||||
purpose: test
|
||||
spec:
|
||||
accessModes: [ReadWriteOnce]
|
||||
resources: {requests: {storage: 1Gi}}
|
||||
storageClassName: longhorn
|
||||
EOF
|
||||
|
||||
# 3. Deploy a test pod to write data
|
||||
kubectl apply -f - <<EOF
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: test-writer
|
||||
namespace: longhorn-test
|
||||
spec:
|
||||
containers:
|
||||
- name: writer
|
||||
image: alpine
|
||||
command: [sh, -c, "echo 'test data for recovery' > /data/testfile.txt && echo 'more data' >> /data/testfile.txt && tail -f /dev/null"]
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: test-longhorn-recovery
|
||||
EOF
|
||||
|
||||
# 4. Write and verify data
|
||||
kubectl exec -n longhorn-test test-writer -- cat /data/testfile.txt
|
||||
# Should show: "test data for recovery\nmore data"
|
||||
|
||||
# 5. Backup everything
|
||||
kubectl get -A pv,pvc -o yaml > /tmp/test-backup-pv-pvc.yaml
echo '---' >> /tmp/test-backup-pv-pvc.yaml
kubectl get -A volumes.longhorn.io -o yaml >> /tmp/test-backup-pv-pvc.yaml
echo '---' >> /tmp/test-backup-pv-pvc.yaml
kubectl get -A settings.longhorn.io -o yaml >> /tmp/test-backup-pv-pvc.yaml
|
||||
```
|
||||
|
||||
### **Test Execution: Simulate Disaster**
|
||||
|
||||
```bash
|
||||
# 6. Simulate disaster - delete everything
# Capture the PV name first (spec.volumeName already carries the pvc- prefix)
PV_NAME=$(kubectl get pvc -n longhorn-test test-longhorn-recovery -o jsonpath='{.spec.volumeName}')
kubectl delete pod -n longhorn-test test-writer
kubectl delete pvc -n longhorn-test test-longhorn-recovery
kubectl delete volume -n longhorn-system $PV_NAME
|
||||
|
||||
# 7. Restore from backup
|
||||
kubectl apply -f /tmp/test-backup-pv-pvc.yaml
|
||||
|
||||
# 8. Verify recovery
|
||||
kubectl get pvc -n longhorn-test test-longhorn-recovery
|
||||
kubectl get volumes -n longhorn-system | grep $PV_NAME
|
||||
|
||||
# 9. Deploy test reader pod
|
||||
kubectl apply -f - <<EOF
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: test-reader
|
||||
namespace: longhorn-test
|
||||
spec:
|
||||
containers:
|
||||
- name: reader
|
||||
image: alpine
|
||||
command: [sh, -c, "cat /data/testfile.txt && tail -f /dev/null"]
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
volumes:
|
||||
- name: data
|
||||
persistentVolumeClaim:
|
||||
claimName: test-longhorn-recovery
|
||||
EOF
|
||||
|
||||
# 10. Check if data is recovered
|
||||
kubectl logs -n longhorn-test test-reader
|
||||
# Should show: "test data for recovery\nmore data"
|
||||
```
|
||||
|
||||
### **Expected Results**
|
||||
|
||||
| Test Step | Pass Criteria |
|
||||
|-----------|---------------|
|
||||
| Volume CRD restored | `kubectl get volumes` shows the test volume |
|
||||
| PVC bound | `kubectl get pvc` shows status=Bound |
|
||||
| Data accessible | Test reader pod shows original data |
|
||||
|
||||
### **Test Cleanup**
|
||||
|
||||
```bash
|
||||
kubectl delete ns longhorn-test
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
---
|
||||
---
|
||||
|
||||
## 🛠️ **Method D: Block-Device Injection (Automated)**
|
||||
|
||||
### **Concept**
|
||||
|
||||
Bypass Longhorn's replica reconciliation entirely. Create a fresh Volume CRD, attach it in
|
||||
maintenance mode, then inject the recovered filesystem directly into the live block device via
|
||||
`rsync`. The old replica dirs are never renamed or touched — the data is copied into the new
|
||||
Longhorn-managed volume.
|
||||
|
||||
### **Implementation**
|
||||
|
||||
See `playbooks/recover/longhorn_data.yml` — a 9-phase Ansible playbook that automates the full
|
||||
sequence for one or more volumes in a single run.
|
||||
|
||||
### **Key Steps**
|
||||
|
||||
```
|
||||
Phase 0: Auto-discover best replica dir (skip Rebuilding:true, rank by actual disk usage)
|
||||
Phase 1: Backup untouched replica dir
|
||||
Phase 2: Merge sparse snapshot+head layers → single flat image (merge-longhorn-layers.py)
|
||||
Phase 3: Create Longhorn Volume CRD, wait for replicas
|
||||
Phase 4: Scale down workload
|
||||
Phase 5: Attach via VolumeAttachment maintenance ticket
|
||||
Phase 6: mkfs.ext4 + mount + rsync from merged image
|
||||
Phase 7: Remove maintenance ticket
|
||||
Phase 8: Recreate PV (Retain, no claimRef) + PVC (volumeName pinned)
|
||||
Phase 9: Scale up, wait readyReplicas ≥ 1
|
||||
```
|
||||
|
||||
### **Usage**
|
||||
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \
|
||||
-e @playbooks/recover/longhorn_data_vars.yml
|
||||
```
|
||||
|
||||
Vars file format:
|
||||
```yaml
|
||||
longhorn_recovery_volumes:
|
||||
- pv_name: pvc-abc123
|
||||
pvc_name: myapp-data
|
||||
namespace: myapp
|
||||
size_bytes: "134217728"
|
||||
size_human: 128Mi
|
||||
access_mode: ReadWriteOnce
|
||||
workload_kind: Deployment
|
||||
workload_name: myapp
|
||||
# source_node and source_dir are auto-discovered if omitted
|
||||
verify_cmd: ""
|
||||
```
|
||||
|
||||
### **Pros**
|
||||
- ✅ Fully automated — handles all phases including PV/PVC recreation
|
||||
- ✅ Auto-discovers best replica (skips Rebuilding dirs)
|
||||
- ✅ Idempotent — safe to re-run (skips backup/merge if already done)
|
||||
- ✅ Works for RWO and RWX volumes
|
||||
|
||||
### **Cons**
|
||||
- ⚠️ Requires ~2× volume size in temporary disk space for merged image
|
||||
- ⚠️ The new volume has 3 fresh replicas (not the original topology) — Longhorn will resync
|
||||
|
||||
---
|
||||
---
|
||||
|
||||
## 🗄️ **Method E: Longhorn Google Storage Restore**
|
||||
|
||||
### **Concept**
|
||||
|
||||
Some volumes are configured with Longhorn's built-in backup feature targeting a Google Storage
|
||||
bucket. For those volumes, a Longhorn backup can be restored into a new volume without needing
|
||||
the raw replica files.
|
||||
|
||||
### **Applicable Volumes**
|
||||
|
||||
- `backups-rwx` (`pvc-efda1d2f`) — the cluster backup volume itself has a Longhorn GCS backup configured
|
||||
|
||||
### **When to use**
|
||||
|
||||
Use when:
|
||||
- The local replica dirs are missing or corrupted (Method D cannot be used)
|
||||
- A clean point-in-time restore is preferred over a raw replica merge
|
||||
|
||||
### **Status**
|
||||
|
||||
A playbook for this method (`playbooks/recover/longhorn_gcs_restore.yml`) is **planned but not
|
||||
yet implemented**. In the 2026-04-13 incident, `backups-rwx` was successfully recovered via
|
||||
Method D (local replica merge), so Method E was not needed.
|
||||
|
||||
When the playbook is implemented, it will use `kubectl apply` of a `BackupVolume` + `Backup`
|
||||
restore CR pointing to the GCS bucket configured in Longhorn settings.
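
As a rough illustration only (not the final playbook), Longhorn can also restore a backup by creating a Volume CR whose `spec.fromBackup` points at the backup URL; the bucket URL and backup name below are placeholders to copy from the Longhorn UI or the `Backup` CR:

```bash
# Hedged sketch: restore a Longhorn backup into a new volume via spec.fromBackup
kubectl apply -f - <<EOF
apiVersion: longhorn.io/v1beta2
kind: Volume
metadata:
  name: backups-rwx-restored
  namespace: longhorn-system
spec:
  numberOfReplicas: 3
  size: "53687091200"   # 50Gi
  accessMode: rwx
  fromBackup: "s3://<bucket>@<region>/?backup=<backup-name>&volume=pvc-efda1d2f"
EOF
```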
|
||||
|
||||
---
|
||||
---
|
||||
|
||||
## 📚 **References**
|
||||
|
||||
- [Longhorn Documentation: Disaster Recovery](https://longhorn.io/docs/1.6.0/deploy/uninstall/disaster-recovery/)
|
||||
- [Longhorn Volume CRD Spec](https://github.com/longhorn/longhorn/blob/master/types/types.go)
|
||||
- [Original Issue: Longhorn GitHub #4837](https://github.com/longhorn/longhorn/issues/4837) (Replica orphan handling)
|
||||
- [Related ADR: Internal DNS Architecture](./20260414-internal-dns-architecture.md)
|
||||
- [Related Incident: 2026-04-13 Power Cut](../incidents/2026-04-13-power-cut/README.md)
|
||||
|
||||
---
|
||||
---
|
||||
*Document created: 2026-04-14*
|
||||
*Last updated: 2026-04-15*
|
||||
*Status: Method D (block-device injection) implemented and battle-tested on 5 volumes (2026-04-14/15)*
|
||||
@@ -0,0 +1,420 @@
|
||||
---
|
||||
title: Power Cut - Longhorn Storage System Failure
|
||||
incident_id: 2026-04-13-001
|
||||
date: 2026-04-13
|
||||
time_start: 15:23:57 UTC
|
||||
time_end: "2026-04-15 (ongoing — Vault/ERP manual recovery deferred)"
|
||||
status: Mostly Resolved
|
||||
severity: SEV-1
|
||||
tags:
|
||||
- kubernetes
|
||||
- longhorn
|
||||
- storage
|
||||
- k3s
|
||||
- power-cut
|
||||
- csi-driver
|
||||
- block-device-recovery
|
||||
---
|
||||
|
||||
# Power Cut - Longhorn Storage System Failure
|
||||
|
||||
## Summary
|
||||
|
||||
A power cut caused a cascading failure of the Longhorn distributed storage system in the k3s cluster. The Longhorn CSI driver (`driver.longhorn.io`) lost its registration with kubelet, preventing all Persistent Volume Claims (PVCs) from mounting. This affected ~43 pods across 12 namespaces, including critical infrastructure like Traefik ingress controller, application pods, and monitoring tools.
|
||||
|
||||
The actual volume data stored in Longhorn replicas at `/mnt/arcodange/longhorn/replicas/` on each node **remains intact**. Recovery efforts are focused on restoring CSI driver registration and Longhorn manager functionality.
|
||||
|
||||
## Impact
|
||||
|
||||
### Affected Services
|
||||
- **Critical**: Longhorn storage system (all CSI components)
|
||||
- **Critical**: Traefik ingress controller (cannot mount PVC)
|
||||
- **High**: Application pods using Longhorn PVCs (cms, webapp, erp, clickhouse, etc.)
|
||||
- **High**: Tool pods (grafana, prometheus, hashicorp-vault, redis, crowdsec)
|
||||
- **Medium**: Docker storage corruption on nodes (overlay2)
|
||||
- **Low**: NFS backup mount unavailable
|
||||
|
||||
### User Impact
|
||||
- External access to services via Traefik: **DOWN**
|
||||
- Gitea registry image pulls: **FAILING**
|
||||
- Persistent data access: **DEGRADED** (data exists but inaccessible)
|
||||
- Monitoring dashboards: **DOWN**
|
||||
|
||||
### Metrics
|
||||
- **Failed Pods**: 43 pods in error state (CrashLoopBackOff, Error, ImagePullBackOff)
|
||||
- **Healthy Pods**: ~37 pods running
|
||||
- **Longhorn Pods**: 25 total, ~12 currently healthy
|
||||
- **Nodes**: 3/3 Ready (pi1 control-plane, pi2, pi3)
|
||||
|
||||
## Component Roles
|
||||
|
||||
### Longhorn Components
|
||||
|
||||
| Component | Role | Current Status | Importance |
|
||||
|-----------|------|----------------|------------|
|
||||
| **longhorn-manager** | Orchestrates Longhorn volumes, handles volume operations | 2/3 running, 1 partial | CRITICAL |
|
||||
| **longhorn-driver-deployer** | Deploys the CSI driver to nodes | Init:0/1 (BLOCKED) | CRITICAL |
|
||||
| **longhorn-csi-plugin** | CSI plugin daemonset - handles node-level CSI operations | 0/3 Error | CRITICAL |
|
||||
| **csi-attacher** | Handles volume attachment to nodes | 2/3 running, 1 Error | CRITICAL |
|
||||
| **csi-provisioner** | Creates volumes from PVC requests | 2/3 running, 1 Error | CRITICAL |
|
||||
| **csi-resizer** | Handles volume resizing | 1/3 running, 2 Error | HIGH |
|
||||
| **csi-snapshotter** | Handles volume snapshots | 2/3 running, 1 Error | MEDIUM |
|
||||
| **engine-image** | Pulls and manages engine binaries | 3/3 Running | HIGH |
|
||||
| **longhorn-ui** | Web UI for Longhorn management | 0/2 CrashLoopBackOff | Medium |
|
||||
| **rwx-nfs** | NFS server for backup volume | 0/1 ContainerCreating | Medium |
|
||||
| **share-manager** | Manages NFS shares for volumes | 0/2 Error | MEDIUM |
|
||||
|
||||
### Other Affected Components
|
||||
|
||||
| Component | Role | Dependencies | Status |
|
||||
|-----------|------|--------------|--------|
|
||||
| **Traefik** | Ingress controller, routes external traffic | Requires PVC for certs | Error (cannot mount PVC) |
|
||||
| **coredns** | Cluster DNS | Docker storage | Crashing (overlay2 corruption) |
|
||||
| **svclb-traefik** | Service load balancer for Traefik | Docker storage | Crashing (overlay2 corruption) |
|
||||
| **Application Pods** | Various services (cms, webapp, erp, etc.) | Longhorn PVCs | Error/ImagePullBackOff |
|
||||
|
||||
## Timeline
|
||||
|
||||
| Time (UTC) | Event | Owner | Notes |
|
||||
|------------|-------|-------|-------|
|
||||
| ~15:23 | Power cut occurred | - | Cluster lost power |
|
||||
| 15:23:57 | Incident detection started | Mistral Vibe | Initial assessment began |
|
||||
| 15:24:05 | Baseline documented | Mistral Vibe | 43 pods in error, Longhorn down |
|
||||
| 15:24:10 | Root cause identified | Mistral Vibe | CSI driver `driver.longhorn.io` not registered |
|
||||
| 15:24:30 | Recovery plan formulated | Mistral Vibe | HelmChart manifest touch, then pod deletion |
|
||||
| 15:24:50 | Step 1: Touch longhorn-install.yaml | Mistral Vibe | Manifest timestamp updated on pi1 |
|
||||
| 15:25:50 | Step 1 outcome: Insufficient | Mistral Vibe | Only 1 pod affected, CSI still down |
|
||||
| 15:32:15 | Step 2: Delete all longhorn-system pods | Mistral Vibe | Force deleted 24 pods — created webhook circular dependency |
|
||||
| 15:32:30 | Step 2 outcome: Partial recovery | Mistral Vibe | Managers recovering, CSI still failing |
|
||||
| 16:15:00 | Root cause 2 identified | Mistral Vibe | Webhook circular dependency — decided nuclear cleanup |
|
||||
| 16:30:00 | Backups secured | Mistral Vibe | PV/PVC and Longhorn CRDs backed up to pi1 |
|
||||
| 16:35:00 | Backup script bug fixed | Claude Code | `backup_cmd` fixed to produce valid YAML |
|
||||
| 17:00:00 | Nuclear cleanup executed | Claude Code | Removed all Longhorn CRDs, PVC finalizers, restarted k3s |
|
||||
| 17:08:00 | Longhorn namespace deleted | Claude Code | Clean slate confirmed |
|
||||
| 17:09:00 | Longhorn reinstall started | Claude Code | `playbooks/recover/longhorn.yml` run on pi1 |
|
||||
| 17:30:00 | Docker config corruption found | Claude Code | daemon.json had Python string not JSON |
|
||||
| 17:35:00 | Docker config fixed | Claude Code | Valid JSON deployed to all nodes |
|
||||
| 17:50:00 | DNS failure identified | Claude Code | CoreDNS cannot resolve external domains |
|
||||
| ~19:00 | DNS fixed | Claude Code | Pi-hole dnsmasq group + CoreDNS upstream config |
|
||||
| ~19:30 | Longhorn reinstall completed | Claude Code | All Longhorn pods Running, CSI registered |
|
||||
| 2026-04-14 00:00 | PVC recovery work started | Claude Code | Block-device recovery approach developed |
|
||||
| 2026-04-14 | Traefik recovered | Claude Code | Simple PV recreation (no data loss for certs) |
|
||||
| 2026-04-14 | url-shortener recovered | Claude Code | Method B (dir rename) + PV/PVC recreate |
|
||||
| 2026-04-14 | Block-device recovery developed | Claude Code | `merge-longhorn-layers.py` + 9-phase playbook |
|
||||
| 2026-04-14 | Clickhouse recovered | Claude Code | `longhorn_data.yml` playbook — first automated run |
|
||||
| 2026-04-15 | Automated recovery for 4 volumes | Claude Code | prometheus, alertmanager, redis, backups-rwx |
|
||||
| 2026-04-15 | Vault/ERP recovery deferred | - | Too sensitive for automated approach, manual later |
|
||||
|
||||
## Root Cause Analysis
|
||||
|
||||
### Primary Root Cause
|
||||
|
||||
**Power cut caused Longhorn CSI driver registration to be lost.**
|
||||
|
||||
The Longhorn CSI driver (`driver.longhorn.io`) is registered with the kubelet on each node. When the power cut occurred:
|
||||
|
||||
1. K3s/kubelet processes crashed
|
||||
2. Longhorn manager pods crashed
|
||||
3. CSI driver registration was lost
|
||||
4. On restart, Longhorn pods attempted to restart but:
|
||||
- The `longhorn-driver-deployer` pod has an init container (`wait-longhorn-manager`) that waits for managers to be ready
|
||||
- Longhorn managers were slow to recover (some still in CrashLoopBackOff)
|
||||
- CSI pods (attacher, provisioner, resizer, snapshotter) cannot start without the CSI socket at `/var/lib/kubelet/plugins/driver.longhorn.io/csi.sock`
|
||||
- Custom Resource Definitions (Volumes, Replicas, etc.) exist but CSI driver cannot communicate with them
|
||||
|
||||
### Secondary Issues
|
||||
|
||||
1. **Docker overlay2 corruption**: Docker storage at `/mnt/arcodange/docker/overlay2/` was corrupted on at least pi1, affecting coredns and svclb-traefik pods
|
||||
2. **NFS backup mount unavailable**: The Longhorn share-manager pod (which exports NFS) is in Error state, making `/mnt/backups/` inaccessible
|
||||
3. **Backup scripts bug**: The `backup.volumes` file at `/opt/k3s_volumes/backup.volumes` is empty due to a script formatting bug
|
||||
|
||||
### Failure Propagation
|
||||
|
||||
```mermaid
|
||||
%%{init: { 'theme': 'forest' }}%%
|
||||
graph TD
|
||||
A[Power Cut] --> B[Kubelet Crashes]
|
||||
A --> C[Docker Daemon Crashes]
|
||||
B --> D[Longhorn Manager Pods Crash]
|
||||
B --> E[CSI Driver Registration Lost]
|
||||
C --> F[Overlay2 Filesystem Corrupt]
|
||||
D --> G[Driver-Deployer Init Container Waits]
|
||||
E --> H[CSI Socket Disappears]
|
||||
G --> I[CSI Driver Not Deployed]
|
||||
H --> J[CSI Pods Cannot Start]
|
||||
I --> J
|
||||
J --> K[PVC Mounts Fail]
|
||||
K --> L[Application Pods Crash]
|
||||
F --> M[Docker Containers Fail to Start]
|
||||
M --> N[CoreDNS Crashes]
|
||||
M --> O[Service Load Balancers Crash]
|
||||
N --> P[DNS Resolution Fails]
|
||||
O --> P
|
||||
P --> L
|
||||
K --> L
|
||||
```
|
||||
|
||||
### Why Data Is Safe
|
||||
|
||||
The Longhorn volume data is stored in replicas across all three nodes at `/mnt/arcodange/longhorn/replicas/`. Checking the Longhorn volumes shows:
|
||||
|
||||
```
|
||||
All 12 volumes: state="attached", robustness="healthy"
|
||||
```
|
||||
|
||||
This confirms that:
|
||||
1. Volume metadata is intact in etcd
|
||||
2. Replica data is intact on disk
|
||||
3. Once CSI driver is restored, volumes will be accessible again
|
||||
4. **No permanent data loss has occurred**
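
A quick way to reproduce this check (a minimal sketch using the standard Longhorn Volume status fields):

```bash
# Hedged sketch: print state and robustness for every Longhorn volume
kubectl get volumes.longhorn.io -n longhorn-system \
  -o custom-columns=NAME:.metadata.name,STATE:.status.state,ROBUSTNESS:.status.robustness
```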
|
||||
|
||||
## Recovery Actions Taken
|
||||
|
||||
### Attempt 1: HelmChart Manifest Touch (15:24:50 - 15:25:50)
|
||||
**Action:** Touched `/var/lib/rancher/k3s/server/manifests/longhorn-install.yaml` on pi1
|
||||
|
||||
**Command:**
|
||||
```bash
|
||||
ssh pi@pi1 "sudo touch /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml"
|
||||
```
|
||||
|
||||
**Outcome:** Only triggered reconcile for 1 pod (longhorn-manager-w85v6). CSI driver still not registered.
|
||||
|
||||
**Decision:** Insufficient. Need more aggressive approach.
|
||||
|
||||
### Attempt 2: Force Delete All Longhorn Pods (15:32:15 - Present)
|
||||
**Action:** Force deleted all 24 pods in longhorn-system namespace
|
||||
|
||||
**Command:**
|
||||
```bash
|
||||
kubectl delete pods -n longhorn-system --all --force --grace-period=0
|
||||
```
|
||||
|
||||
**Outcome:**
|
||||
- HelmChart controller detected changes and recreated all pods
|
||||
- **Success**: 23/25 pods now in Running state (15:34:30)
|
||||
- **Blocking**: `longhorn-driver-deployer` stuck in Init:0/1
|
||||
- **Blocking**: All `longhorn-csi-plugin` pods in Error
|
||||
- **Investigation**: driver-deployer's `wait-longhorn-manager` init container waiting for manager readiness
|
||||
|
||||
### Current Investigation (15:34:30)
|
||||
**Focus:** Why driver-deployer is stuck in Init state
|
||||
|
||||
The `longhorn-driver-deployer` pod has an init container that waits for Longhorn manager to be ready before deploying the CSI driver. Despite 3 manager pods running, the wait condition is not being met.
|
||||
|
||||
**Hypotheses:**
|
||||
1. Manager pods are not fully healthy (readiness probes failing)
|
||||
2. Network connectivity between driver-deployer and managers
|
||||
3. RBAC or service account permissions issue
|
||||
4. Configuration mismatch in HelmChart values
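
A few hedged diagnostics to narrow these down (label selectors assumed from the default Longhorn chart):

```bash
# Hedged sketch: inspect the blocking init container and manager readiness
DEPLOYER=$(kubectl get pods -n longhorn-system -l app=longhorn-driver-deployer -o jsonpath='{.items[0].metadata.name}')
kubectl logs -n longhorn-system "$DEPLOYER" -c wait-longhorn-manager
kubectl get pods -n longhorn-system -l app=longhorn-manager -o wide   # readiness per node
kubectl describe pod -n longhorn-system -l app=longhorn-manager | grep -A3 Readiness
```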
|
||||
|
||||
## Current Status (2026-04-15)
|
||||
|
||||
### Longhorn System
|
||||
- **All Longhorn pods**: Running ✅ (reinstalled 2026-04-13)
|
||||
- **CSI driver**: Registered ✅
|
||||
|
||||
### Volume Recovery Status
|
||||
|
||||
| PVC | Namespace | Size | Status |
|
||||
|-----|-----------|------|--------|
|
||||
| `traefik` (kube-system) | kube-system | 128Mi | ✅ Recovered (2026-04-14) |
|
||||
| `url-shortener-data` | url-shortener | 128Mi | ✅ Recovered (2026-04-14) |
|
||||
| `clickhouse-storage-clickhouse-0` | tools | 16Gi | ✅ Recovered (2026-04-14) |
|
||||
| `prometheus-server` | tools | 8Gi | ⏳ In progress (2026-04-15) |
|
||||
| `storage-prometheus-alertmanager-0` | tools | 2Gi | ⏳ In progress (2026-04-15) |
|
||||
| `redis-storage-redis-0` | tools | 1Gi | ⏳ In progress (2026-04-15) |
|
||||
| `backups-rwx` | longhorn-system | 50Gi | ⏳ In progress (2026-04-15) |
|
||||
| `data-hashicorp-vault-0` | tools | 10Gi | 🔴 Deferred — manual recovery |
|
||||
| `audit-hashicorp-vault-0` | tools | 10Gi | 🔴 Deferred — manual recovery |
|
||||
| `erp` | erp | 50Gi | 🔴 Deferred — manual recovery |
|
||||
|
||||
## Next Steps
|
||||
|
||||
### Immediate
|
||||
1. Confirm prometheus, alertmanager, redis, backups-rwx fully recovered via `longhorn_data.yml`
|
||||
2. Verify monitoring stack (Grafana dashboards, alert routing) is functional
|
||||
|
||||
### Short-term
|
||||
3. Manual recovery of Vault (`data-hashicorp-vault-0`, `audit-hashicorp-vault-0`) — see Vault runbook
|
||||
4. Manual recovery of ERP (`erp`) — coordinate with application owner
|
||||
5. Update backup playbook to include Longhorn Volume CRDs (see ADR 20260414-longhorn-pvc-recovery)
|
||||
6. Prepare Longhorn Google Storage restore playbook for `backups-rwx` alternative recovery path
|
||||
|
||||
### Long-term
|
||||
- Implement UPS for the Raspberry Pi cluster
|
||||
- Add Longhorn volume health monitoring to Grafana
|
||||
- Regular backup restore drills
|
||||
|
||||
## Architecture Context
|
||||
|
||||
```mermaid
|
||||
%%{init: { 'theme': 'forest' }}%%
|
||||
flowchart TB
|
||||
subgraph K3s Control Plane
|
||||
A[pi1: Control Plane] -->|runs| B[kubelet]
|
||||
B --> C[k3s server]
|
||||
C --> D[HelmChart Controller]
|
||||
end
|
||||
|
||||
subgraph Storage Layer
|
||||
E[Longhorn HelmChart] --> F[Longhorn Manager Pods]
|
||||
F --> G[Driver Deployer]
|
||||
G --> H[CSI Driver Registration]
|
||||
H --> I[CSI Socket: /var/lib/kubelet/plugins/driver.longhorn.io/csi.sock]
|
||||
F --> J[Longhorn Volumes]
|
||||
J --> K[Replicas on all 3 nodes]
|
||||
end
|
||||
|
||||
subgraph CSI Components
|
||||
H --> L[csi-attacher Pods]
|
||||
H --> M[csi-provisioner Pods]
|
||||
H --> N[csi-resizer Pods]
|
||||
H --> O[csi-snapshotter Pods]
|
||||
H --> P[csi-plugin DaemonSet]
|
||||
end
|
||||
|
||||
subgraph Data Path
|
||||
I --> Q[/mnt/arcodange/longhorn/]
|
||||
Q --> R[replicas/]
|
||||
end
|
||||
|
||||
subgraph Docker Storage
|
||||
S[Docker Daemon] --> T[/mnt/arcodange/docker/]
|
||||
T --> U[overlay2/]
|
||||
end
|
||||
|
||||
L -->|mounts volumes| V[Application Pods]
|
||||
M -->|creates volumes| J
|
||||
P -->|node-level ops| I
|
||||
|
||||
classDef critical fill:#c00,color:#fff,stroke:#000
|
||||
classDef healthy fill:#0a0,color:#000,stroke:#000
|
||||
classDef degraded fill:#ff0,color:#000,stroke:#000
|
||||
|
||||
class H,L,M,N,O,P critical
|
||||
class F,G,E degraded
|
||||
class I,J,Q,R,U healthy
|
||||
```
|
||||
|
||||
## Component Details
|
||||
|
||||
### Longhorn Manager
|
||||
- **Role**: Primary controller for Longhorn, manages volumes, replicas, snapshots
|
||||
- **Image**: `longhornio/longhorn-manager:v1.9.1`
|
||||
- **Ports**: 9500 (manager), 9501 (webhook health), 9502 (metrics)
|
||||
- **Data Path**: `/mnt/arcodange/longhorn` (configured in HelmChart values)
|
||||
- **Health Check**: `https://<pod-ip>:9501/v1/healthz`
|
||||
|
||||
### Longhorn Driver Deployer
|
||||
- **Role**: Deploys the CSI driver to each node
|
||||
- **Image**: `longhornio/longhorn-manager:v1.9.1`
|
||||
- **Init Container**: `wait-longhorn-manager` - waits for manager to be ready
|
||||
- **Blocker**: Currently stuck in init, preventing CSI driver deployment
|
||||
|
||||
### CSI Driver
|
||||
- **Role**: Implements the CSI (Container Storage Interface) specification for Longhorn
|
||||
- **Socket**: `/var/lib/kubelet/plugins/driver.longhorn.io/csi.sock`
|
||||
- **Registration**: Must be registered with kubelet via CSINode
|
||||
- **Images**:
|
||||
- `longhornio/csi-attacher:v4.9.0-20250709`
|
||||
- `longhornio/csi-provisioner:v5.3.0-20250709`
|
||||
- `longhornio/csi-resizer:v1.14.0-20250709`
|
||||
- `longhornio/csi-snapshotter:v8.3.0-20250709`
|
||||
- `longhornio/csi-node-driver-registrar:v2.14.0-20250709`
|
||||
|
||||
### CSI Node Driver Registrar
|
||||
- **Role**: Registers the CSI driver with kubelet
|
||||
- **Image**: `longhornio/csi-node-driver-registrar:v2.14.0-20250709`
|
||||
- **Mechanism**: Creates a `CSINode` resource and registers via kubelet plugin registry
|
||||
|
||||
## Action Items
|
||||
|
||||
### Immediate (resolved)
|
||||
- [x] Investigate and resolve driver-deployer init container blocker
|
||||
- [x] Restore CSI driver registration
|
||||
- [x] Fix Docker overlay2 corruption / daemon.json on all nodes
|
||||
- [x] Fix DNS (CoreDNS + Pi-hole dnsmasq config)
|
||||
- [x] Longhorn reinstalled and healthy
|
||||
- [x] Traefik ingress controller functional
|
||||
- [x] Fix backup script (empty backup.volumes bug)
|
||||
|
||||
### Short-term (resolved)
|
||||
- [x] url-shortener data recovered
|
||||
- [x] Clickhouse data recovered
|
||||
- [x] Develop automated block-device recovery playbook (`playbooks/recover/longhorn_data.yml`)
|
||||
- [x] Backup restore procedure documented and tested
|
||||
|
||||
### Medium-term (in progress)
|
||||
- [ ] prometheus, alertmanager, redis, backups-rwx recovered (playbook running 2026-04-15)
|
||||
- [ ] Vault manual recovery
|
||||
- [ ] ERP manual recovery
|
||||
- [ ] Update backup playbook to include Longhorn Volume CRDs
|
||||
- [ ] Prepare Longhorn Google Storage restore playbook
|
||||
|
||||
### Long-term
|
||||
- [ ] Implement UPS for Raspberry Pi cluster
|
||||
- [ ] Add Longhorn volume health monitoring to Grafana
|
||||
- [ ] Add CSI socket health check to monitoring
|
||||
- [ ] Regular backup restore drills (monthly)
|
||||
|
||||
## Lessons Learned
|
||||
|
||||
### What Went Well
|
||||
- Quick identification of root cause (CSI driver registration)
|
||||
- Longhorn volume data remained intact (good replica design)
|
||||
- Ability to force-pod-delete triggered partial recovery
|
||||
- K3s HelmChart approach allows easy manifest-based recovery
|
||||
|
||||
### What Could Be Improved
|
||||
- Need better CSI driver health monitoring and alerting
|
||||
- Longhorn driver-deployer init container timeout may be too short
|
||||
- Docker overlay2 on external storage needs better corruption recovery
|
||||
- Backup script has bugs that prevent reliable backups
|
||||
- No UPS protection for power cuts
|
||||
|
||||
### Technical Debt Identified
|
||||
- Backup script formatting bug (extra newlines create invalid YAML)
|
||||
- No automated Longhorn health checks
|
||||
- Manual intervention required for CSI driver recovery
|
||||
|
||||
## Related Files
|
||||
|
||||
- **Ansible Playbook**: `playbooks/system/k3s_config.yml` (Longhorn HelmChart creation)
|
||||
- **HelmChart Manifest**: `/var/lib/rancher/k3s/server/manifests/longhorn-install.yaml` on pi1
|
||||
- **Backup Scripts**: `/opt/k3s_volumes/backup.sh` and `/opt/k3s_volumes/restore.sh` on pi1
|
||||
- **Inventory**: `inventory/hosts.yml` (required for all playbooks)
|
||||
|
||||
## Commands Reference
|
||||
|
||||
### Check Longhorn Status
|
||||
```bash
|
||||
kubectl get pods -n longhorn-system
|
||||
kubectl get volumes -n longhorn-system
|
||||
kubectl get replicas -n longhorn-system
|
||||
kubectl get settings -n longhorn-system
|
||||
```
|
||||
|
||||
### Force Longhorn Recovery (k3s-specific)
|
||||
```bash
|
||||
# Method 1: Touch manifest (soft reconcile)
|
||||
sudo touch /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml
|
||||
|
||||
# Method 2: Delete all pods (force recreate)
|
||||
kubectl delete pods -n longhorn-system --all --force --grace-period=0
|
||||
|
||||
# Method 3: Delete specific pod
|
||||
kubectl delete pod -n longhorn-system longhorn-driver-deployer-*
|
||||
```
|
||||
|
||||
### Check CSI Driver Registration
|
||||
```bash
|
||||
kubectl get csidriver
|
||||
kubectl get csinodes
|
||||
kubectl describe csidriver driver.longhorn.io
|
||||
```
|
||||
|
||||
### Check Longhorn StorageClass
|
||||
```bash
|
||||
kubectl describe cm -n longhorn-system longhorn-storageclass
|
||||
```
|
||||
@@ -0,0 +1,209 @@
|
||||
%%{init: { 'theme': 'forest', 'themeVariables': {
|
||||
'primaryColor': '#1e293b',
|
||||
'primaryTextColor': '#f8fafc',
|
||||
'lineColor': '#334155',
|
||||
'secondaryColor': '#475569',
|
||||
'tertiaryColor': '#94a3b8',
|
||||
'edgeLabelBackground':'#fff',
|
||||
'edgeLabelColor': '#1e293b'
|
||||
}}}%%
|
||||
|
||||
flowchart TD
|
||||
subgraph Cluster["K3s Cluster (v1.34.3+k3s1)"]
|
||||
direction TB
|
||||
|
||||
subgraph Nodes["Physical Nodes"]
|
||||
pi1["pi1: 192.168.1.201\nControl Plane"]
|
||||
pi2["pi2: 192.168.1.202\nWorker"]
|
||||
pi3["pi3: 192.168.1.203\nWorker"]
|
||||
end
|
||||
|
||||
subgraph K3sComponents["K3s Control Plane Components"]
|
||||
kubelet1["kubelet"]
|
||||
kubelet2["kubelet"]
|
||||
kubelet3["kubelet"]
|
||||
k3s_server["k3s server"]
|
||||
helm_controller["HelmChart Controller"]
|
||||
end
|
||||
|
||||
pi1 --> kubelet1
|
||||
pi2 --> kubelet2
|
||||
pi3 --> kubelet3
|
||||
pi1 --> k3s_server
|
||||
k3s_server --> helm_controller
|
||||
end
|
||||
|
||||
subgraph LonghornStorage["Longhorn Storage System"]
|
||||
direction TB
|
||||
|
||||
subgraph HelmChart["HelmChart Installation"]
|
||||
manifest[("longhorn-install.yaml")]
|
||||
end
|
||||
|
||||
subgraph Manager["Longhorn Manager layer"]
|
||||
lh_manager1["longhorn-manager-r6sd2\n2/2 Running\npi2"]
|
||||
lh_manager2["longhorn-manager-sjc56\n1/2 Running\npi3"]
|
||||
lh_manager3["longhorn-manager-t9b45\n1/2 Running\npi1"]
|
||||
webhook["Webhook Leader: pi2"]
|
||||
end
|
||||
|
||||
subgraph DriverDeployer["CSI Driver Deployer"]
|
||||
deployer["longhorn-driver-deployer\n0/1 Init:0/1\npi3"]
|
||||
wait_container["wait-longhorn-manager\nwaiting..."]
|
||||
end
|
||||
|
||||
subgraph CSIDriver["CSI Driver Components"]
|
||||
csi_socket[("/var/lib/kubelet/plugins/driver.longhorn.io/csi.sock")]
|
||||
csi_registrar["CSI Node Driver Registrar"]
|
||||
end
|
||||
|
||||
subgraph CSIContainers["CSI Containers (Sidecars)"]
|
||||
attacher1["csi-attacher-54ld9\n1/1 Running\npi2"]
|
||||
attacher2["csi-attacher-dqq9v\n1/1 Running\npi3"]
|
||||
attacher3["csi-attacher-k5jmx\n0/1 Error\npi1"]
|
||||
provisioner1["csi-provisioner-9z79d\n0/1 Error\npi2"]
|
||||
provisioner2["csi-provisioner-zjwdr\n1/1 Running\npi1"]
|
||||
provisioner3["csi-provisioner-zk5kp\n1/1 Running\npi3"]
|
||||
resizer1["csi-resizer-8mrld\n1/1 Running\npi3"]
|
||||
resizer2["csi-resizer-ddhl2\n0/1 Error\npi1"]
|
||||
resizer3["csi-resizer-qv5n9\n0/1 Error\npi2"]
|
||||
snapshotter1["csi-snapshotter-9rzf4\n1/1 Running\npi3"]
|
||||
snapshotter2["csi-snapshotter-bqdtd\n0/1 Error\npi2"]
|
||||
snapshotter3["csi-snapshotter-jv6pj\n1/1 Running\npi1"]
|
||||
end
|
||||
|
||||
subgraph CSIPlugin["CSI Plugin DaemonSet"]
|
||||
plugin1["longhorn-csi-plugin-f44jp\n0/3 Error\npi3"]
|
||||
plugin2["longhorn-csi-plugin-q2sgh\n1/3 Error\npi1"]
|
||||
plugin3["longhorn-csi-plugin-vzld8\n2/3 Error\npi2"]
|
||||
end
|
||||
|
||||
subgraph DataLayer["Longhorn Data Layer"]
|
||||
engine1["engine-image-ei-8ktd9\n1/1 Running\npi1"]
|
||||
engine2["engine-image-ei-dcjq8\n1/1 Running\npi3"]
|
||||
engine3["engine-image-ei-m76jf\n1/1 Running\npi2"]
|
||||
|
||||
volumes[("12 Longhorn Volumes")]
|
||||
replicas[("/mnt/arcodange/longhorn/replicas/")]
|
||||
end
|
||||
|
||||
subgraph UIAndTools["UI & Backup"]
|
||||
ui1["longhorn-ui-8gb4s\n0/1 CrashLoop\npi1"]
|
||||
ui2["longhorn-ui-hmxz6\n0/1 CrashLoop\npi3"]
|
||||
share_mgr1["share-manager-...70b4\n0/1 Error\npi1"]
|
||||
share_mgr2["share-manager-...7ffa\n0/1 Error\npi3"]
|
||||
nfs["rwx-nfs-4cn9h\n0/1 ContainerCreating\npi3"]
|
||||
end
|
||||
|
||||
manifest --> lh_manager1 & lh_manager2 & lh_manager3
|
||||
helm_controller --> manifest
|
||||
|
||||
lh_manager1 & lh_manager2 & lh_manager3 --> webhook
|
||||
|
||||
deployer --> wait_container
|
||||
wait_container -.->|waits for| lh_manager1 & lh_manager2 & lh_manager3
|
||||
deployer --> csi_registrar
|
||||
csi_registrar --> csi_socket
|
||||
|
||||
csi_socket --> kubelet1
|
||||
csi_socket --> kubelet2
|
||||
csi_socket --> kubelet3
|
||||
|
||||
attacher1 & attacher2 & attacher3 --> csi_socket
|
||||
provisioner1 & provisioner2 & provisioner3 --> csi_socket
|
||||
resizer1 & resizer2 & resizer3 --> csi_socket
|
||||
snapshotter1 & snapshotter2 & snapshotter3 --> csi_socket
|
||||
|
||||
plugin1 & plugin2 & plugin3 --> csi_socket
|
||||
|
||||
lh_manager1 & lh_manager2 & lh_manager3 --> volumes
|
||||
volumes --> replicas
|
||||
|
||||
replicas --> pi1_disk[("pi1: /mnt/arcodange/longhorn")]
|
||||
replicas --> pi2_disk[("pi2: /mnt/arcodange/longhorn")]
|
||||
replicas --> pi3_disk[("pi3: /mnt/arcodange/longhorn")]
|
||||
|
||||
share_mgr1 & share_mgr2 --> nfs
|
||||
nfs --> backup_pvc[("PVC: backups-rwx\n50Gi")]
|
||||
end
|
||||
|
||||
subgraph DockerStorage["Docker Storage layer"]
|
||||
docker1["Docker daemon\npi1"]
|
||||
docker2["Docker daemon\npi2"]
|
||||
docker3["Docker daemon\npi3"]
|
||||
|
||||
storage1[("/mnt/arcodange/docker/overlay2/")]
|
||||
|
||||
docker1 --> storage1
|
||||
docker2 --> storage1
|
||||
docker3 --> storage1
|
||||
end
|
||||
|
||||
subgraph ApplicationLayer["Application Pods (Affected)"]
|
||||
traefik["traefik-5c67cb6889-8b5nk\n0/1 Error\nkube-system"]
|
||||
cms["cms-arcodange-cms-...\n0/1 ImagePullBackOff\ncms"]
|
||||
webapp["webapp-6588455979-...\n0/1 ImagePullBackOff\nwebapp"]
|
||||
erp["erp-648748b4f5-bntd9\n0/1 Error\nerp"]
|
||||
grafana["grafana-5d496f9668-...\n0/3 Error\ntools"]
|
||||
vault["hashicorp-vault-0\n0/1 Error\ntools"]
|
||||
end
|
||||
|
||||
subgraph NetworkServices["Network Services"]
|
||||
coredns["coredns-67476ddb48-jrcg2\n1/1 Running\nkube-system"]
|
||||
svclb["svclb-traefik-*\n3/3 Running\nkube-system"]
|
||||
end
|
||||
|
||||
%% Connections showing failure paths
|
||||
    csi_socket --x traefik
    csi_socket --x cms
    csi_socket --x webapp
    csi_socket --x erp
    csi_socket --x grafana
    csi_socket --x vault

    docker1 --x coredns
    docker1 --x svclb
|
||||
|
||||
%% Healthy connections
|
||||
volumes -->|provides storage| traefik
|
||||
volumes -->|provides storage| cms
|
||||
volumes -->|provides storage| webapp
|
||||
volumes -->|provides storage| erp
|
||||
volumes -->|provides storage| grafana
|
||||
volumes -->|provides storage| vault
|
||||
|
||||
classDef node fill:#0ea5e9,color:#000,stroke:#06b6d4
|
||||
classDef k3s fill:#84cc16,color:#000,stroke:#65a30d
|
||||
classDef longhorn fill:#a855f7,color:#fff,stroke:#8b5cf6
|
||||
classDef csi fill:#f59e0b,color:#000,stroke:#d97706
|
||||
classDef data fill:#10b981,color:#000,stroke:#059669
|
||||
classDef app fill:#ec4899,color:#fff,stroke:#db2777
|
||||
classDef network fill:#6366f1,color:#fff,stroke:#4f46e5
|
||||
classDef error fill:#ef4444,color:#fff,stroke:#dc2626
|
||||
classDef waiting fill:#fbbf24,color:#000,stroke:#f59e0b
|
||||
|
||||
class pi1,pi2,pi3 node
|
||||
class kubelet1,kubelet2,kubelet3,k3s_server,helm_controller k3s
|
||||
class manifest,webhook longhorn
|
||||
class lh_manager1,lh_manager2,lh_manager3,engine1,engine2,engine3,volumes,replicas,share_mgr1,share_mgr2 data
|
||||
class deployer,wait_container,csi_registrar,csi_socket longhorn
|
||||
class attacher1,attacher2,attacher3,provisioner1,provisioner2,provisioner3,resizer1,resizer2,resizer3,snapshotter1,snapshotter2,snapshotter3 csi
|
||||
class plugin1,plugin2,plugin3 csi
|
||||
class traefik,cms,webapp,erp,grafana,vault app
|
||||
class coredns,svclb network
|
||||
    class docker1,docker2,docker3 data
|
||||
|
||||
class deployer,wait_container error
|
||||
class attacher3,provisioner1,resizer2,resizer3,snapshotter2 error
|
||||
class plugin1,plugin2,plugin3 error
|
||||
class ui1,ui2,share_mgr1,share_mgr2 error
|
||||
class traefik,cms,webapp,erp,grafana,vault error
|
||||
class nfs waiting
|
||||
class lh_manager2,lh_manager3 waiting
|
||||
|
||||
classDef clusterBox stroke:#334155,stroke-width:2px,color:#94a3b8
|
||||
class Cluster clusterBox
|
||||
class LonghornStorage clusterBox
|
||||
class DockerStorage clusterBox
|
||||
class ApplicationLayer clusterBox
|
||||
class NetworkServices clusterBox
|
||||
@@ -0,0 +1,200 @@
|
||||
%%{init: { 'theme': 'forest', 'themeVariables': {
|
||||
'primaryColor': '#7c3aed',
|
||||
'primaryTextColor': '#ffffff',
|
||||
'lineColor': '#6d28d9',
|
||||
'secondaryColor': '#8b5cf6',
|
||||
'tertiaryColor': '#a78bfa',
|
||||
'edgeLabelBackground':'#5b21b6',
|
||||
'edgeLabelColor': '#ffffff'
|
||||
}}}%%
|
||||
|
||||
mindmap
|
||||
root((Longhorn Storage System))
|
||||
|
||||
%% ===== CONTROL PLANE COMPONENTS =====
|
||||
ControlPlane[Control Plane]
|
||||
Manager[longhorn-manager]
|
||||
Role1["Role: Primary controller for Longhorn"]
|
||||
Responsibilities1["• Manages volumes, replicas, snapshots\n• Handles volume lifecycle\n• Coordinates with etcd\n• Exposes API (port 9500)"]
|
||||
Health1["Health Check: :9501/v1/healthz"]
|
||||
Webhook1["Webhook: :9502/metrics"]
|
||||
|
||||
DriverDeployer[longhorn-driver-deployer]
|
||||
Role2["Role: CSI driver deployment controller"]
|
||||
Responsibilities2["• Deploys CSI driver to each node\n• Runs via init container (wait-longhorn-manager)\n• Creates csi.sock on each node"]
|
||||
WaitCmd["Command: longhorn-manager wait -d <namespace>"]
|
||||
Blocking["⚠️ BLOCKED: Init container waiting for managers"]
|
||||
|
||||
%% ===== CSI COMPONENTS =====
|
||||
CSILayer[CSI Interface]
|
||||
CSISocket[("/var/lib/kubelet/plugins/driver.longhorn.io/csi.sock")]
|
||||
SocketRole["Role: Unix domain socket for CSI communication"]
|
||||
|
||||
Attacher[csi-attacher]
|
||||
AttacherRole["Role: Attaches volumes to nodes"]
|
||||
AttacherResp["• Monitors VolumeAttachment objects\n• Calls CSI ControllerPublishVolume\n• Handles detach operations"]
|
||||
AttacherStatus["Status: 2/3 Running, 1 Error"]
|
||||
|
||||
Provisioner[csi-provisioner]
|
||||
ProvisionerRole["Role: Creates volumes from PVCs"]
|
||||
ProvisionerResp["• Watches PVC objects\n• Calls CSI CreateVolume\n• Handles volume deletion"]
|
||||
ProvisionerStatus["Status: 2/3 Running, 1 Error"]
|
||||
|
||||
Resizer[csi-resizer]
|
||||
ResizerRole["Role: Handles volume resizing"]
|
||||
ResizerResp["• Watches PVC size changes\n• Calls CSI ExpandVolume"]
|
||||
ResizerStatus["Status: 1/3 Running, 2 Error"]
|
||||
|
||||
Snapshotter[csi-snapshotter]
|
||||
SnapshotterRole["Role: Manages volume snapshots"]
|
||||
SnapshotterResp["• Watches VolumeSnapshot objects\n• Calls CSI CreateSnapshot\n• Handles snapshot deletion"]
|
||||
SnapshotterStatus["Status: 2/3 Running, 1 Error"]
|
||||
|
||||
NodeRegistrar[csi-node-driver-registrar]
|
||||
RegistrarRole["Role: Registers driver with kubelet"]
|
||||
RegistrarResp["• Creates CSINode resource\n• Registers via kubelet plugin registry API"]
|
||||
|
||||
Plugin[csi-plugin]
|
||||
PluginRole["Role: Node-level CSI operations"]
|
||||
PluginResp["• Runs on each node (DaemonSet)\n• Handles NodePublish/UnpublishVolume\n• Manages mount/unmount operations"]
|
||||
PluginStatus["⚠️ BLOCKED: All 3 pods in Error (no CSI socket)"]
|
||||
|
||||
%% ===== DATA LAYER COMPONENTS =====
|
||||
DataLayer[Data Layer]
|
||||
Engine[engine-image]
|
||||
EngineRole["Role: Engine and instance manager"]
|
||||
EngineResp["• Pulls and manages engine binaries\n• Runs as sidecar in DaemonSet\n• Maintains engine processes"]
|
||||
EngineStatus["Status: ✅ 3/3 Running"]
|
||||
|
||||
Volumes[Longhorn Volumes]
|
||||
VolumeRole["Role: Logical volume representation"]
|
||||
VolumeResp["• Managed via Longhorn CRDs\n• Replicated across nodes\n• Supports RWO, RWX access modes"]
|
||||
VolumeStatus["Status: ✅ All 12 volumes attached & healthy"]
|
||||
|
||||
Replicas[Volume Replicas]
|
||||
ReplicaRole["Role: Physical data storage"]
|
||||
ReplicaResp["• 3-way replication across nodes\n• Stored at /mnt/arcodange/longhorn/replicas/\n• Data intact after power cut"]
|
||||
ReplicaPath["Path: pi1, pi2, pi3: /mnt/arcodange/longhorn/replicas/"]
|
||||
|
||||
Backups[Backup System]
|
||||
NFS[RWX NFS Share]
|
||||
NFSRole["Role: NFS export for backup volume"]
|
||||
NFSCreate["Created via: playbooks/setup/backup_nfs.yml"]
|
||||
NFSStatus["⚠️ OFFLINE: share-manager pods in Error"]
|
||||
|
||||
BackupPVC[Backup PVC]
|
||||
BackupPVCRole["Role: Persistent storage for backups"]
|
||||
BackupPVCDetails["Name: backups-rwx\nNamespace: longhorn-system\nSize: 50Gi\nClass: longhorn"]
|
||||
|
||||
ShareManager[share-manager]
|
||||
ShareRole["Role: Manages NFS exports for Longhorn volumes"]
|
||||
ShareStatus["⚠️ BLOCKED: 2 pods in Error"]
|
||||
|
||||
%% ===== UI & TOOLS =====
|
||||
UI[Web UI]
|
||||
UIRole["Role: Longhorn management dashboard"]
|
||||
UIAccess["Access: Port 9500 on manager pods"]
|
||||
UIStatus["⚠️ BLOCKED: 2 pods in CrashLoopBackOff"]
|
||||
|
||||
%% ===== INFRASTRUCTURE =====
|
||||
Infrastructure[Underlying Infrastructure]
|
||||
Nodes[Raspberry Pi Nodes]
|
||||
pi1["pi1: 192.168.1.201\nRole: Control Plane"]
|
||||
pi2["pi2: 192.168.1.202\nRole: Worker"]
|
||||
pi3["pi3: 192.168.1.203\nRole: Worker"]
|
||||
|
||||
K3s[Kubernetes (k3s v1.34.3+k3s1)]
|
||||
Kubelet["kubelet (3 instances)"]
|
||||
APIServer["API Server (on pi1)"]
|
||||
etcd["etcd (on pi1)"]
|
||||
HelmCtrl["HelmChart Controller"]
|
||||
|
||||
Docker[Docker Engine]
|
||||
DockerRole["Role: Container runtime"]
|
||||
DockerStorage["Storage: /mnt/arcodange/docker/"]
|
||||
Overlay2["⚠️ ISSUE: overlay2 filesystem corrupted"]
|
||||
|
||||
%% ===== EXTERNAL DEPENDENCIES =====
|
||||
Dependencies[External Dependencies]
|
||||
CSIRegistration[CSI Driver Registration]
|
||||
CSIRole["Role: k8s CSI registration"]
|
||||
CSIDriver["Driver: driver.longhorn.io"]
|
||||
CSIDriverStatus["⚠️ LOST: Not registered with kubelet"]
|
||||
|
||||
%% ===== CONNECTIONS =====
|
||||
root --> ControlPlane
|
||||
root --> CSILayer
|
||||
root --> DataLayer
|
||||
root --> UI
|
||||
root --> Infrastructure
|
||||
root --> Dependencies
|
||||
|
||||
ControlPlane --> Manager
|
||||
ControlPlane --> DriverDeployer
|
||||
|
||||
CSILayer --> CSISocket
|
||||
CSILayer --> Attacher
|
||||
CSILayer --> Provisioner
|
||||
CSILayer --> Resizer
|
||||
CSILayer --> Snapshotter
|
||||
CSILayer --> NodeRegistrar
|
||||
CSILayer --> Plugin
|
||||
|
||||
CSISocket --> Attacher
|
||||
CSISocket --> Provisioner
|
||||
CSISocket --> Resizer
|
||||
CSISocket --> Snapshotter
|
||||
CSISocket --> Plugin
|
||||
CSISocket --> NodeRegistrar
|
||||
|
||||
DriverDeployer --> NodeRegistrar
|
||||
NodeRegistrar --> CSISocket
|
||||
|
||||
DataLayer --> Engine
|
||||
DataLayer --> Volumes
|
||||
DataLayer --> Replicas
|
||||
DataLayer --> Backups
|
||||
|
||||
Backups --> NFS
|
||||
Backups --> BackupPVC
|
||||
Backups --> ShareManager
|
||||
|
||||
Infrastructure --> Nodes
|
||||
Infrastructure --> K3s
|
||||
Infrastructure --> Docker
|
||||
|
||||
Dependencies --> CSIRegistration
|
||||
CSIRegistration --> CSISocket
|
||||
|
||||
%% ===== YET TO BE RESTORED =====
|
||||
Dependencies --x EmptyCSI["⚠️ CSI Socket Missing"] :x
|
||||
EmptyCSI --x Attacher :x
|
||||
EmptyCSI --x Provisioner :x
|
||||
EmptyCSI --x Resizer :x
|
||||
EmptyCSI --x Snapshotter :x
|
||||
EmptyCSI --x Plugin :x
|
||||
|
||||
%% ===== STYLES =====
|
||||
classDef component fill:#8b5cf6,color:#fff,stroke:#7c3aed,stroke-width:2px
|
||||
classDef role fill:#a78bfa,color:#000,stroke:#8b5cf6
|
||||
classDef responsibility fill:#c4b5fd,color:#000,stroke:#8b5cf6
|
||||
classDef status_good fill:#10b981,color:#fff,stroke:#059669
|
||||
classDef status_bad fill:#ef4444,color:#fff,stroke:#dc2626
|
||||
classDef status_warn fill:#f59e0b,color:#000,stroke:#d97706
|
||||
classDef infinite fill:#3b82f6,color:#fff,stroke:#2563eb
|
||||
|
||||
class root infinite
|
||||
|
||||
class ControlPlane,CSILayer,DataLayer,UI,Infrastructure,Dependencies component
|
||||
class Manager,Attacher,Provisioner,Resizer,Snapshotter,NodeRegistrar,Plugin,Engine,Volumes,Replicas,NFS,BackupPVC,ShareManager,UIRole,Nodes,K3s,Docker,CSIRegistration component
|
||||
|
||||
class Role1,Role2,AttacherRole,ProvisionerRole,ResizerRole,SnapshotterRole,RegistrarRole,PluginRole,EngineRole,VolumeRole,ReplicaRole,NFSRole,ShareRole,UIRole,Kubelet,APIServer,etcd,HelmCtrl,DockerRole,CSIRole,CSIDriver component
|
||||
|
||||
class Responsibilities1,Responsibilities2,AttacherResp,ProvisionerResp,ResizerResp,SnapshotterResp,RegistrarResp,PluginResp,EngineResp,VolumeResp,ReplicaResp,NFSRole,BackupPVCDetails,ShareRole,UIAccess,ShareStatus,NFSStatus role
|
||||
|
||||
class EngineStatus,VolumeStatus,ReplicaPath status_good
|
||||
class Blocking,PluginStatus,UIStatus,ShareStatus,NFSCreate,ShareStatus,CSIDriverStatus status_bad
|
||||
class AttacherStatus,ProvisionerStatus,ResizerStatus,SnapshotterStatus status_warn
|
||||
|
||||
classDef mindmapTitle fill:#4c1d95,color:#fff,stroke:#5b21b6,font-size:20px,font-weight:bold
|
||||
class root mindmapTitle
|
||||
@@ -0,0 +1,131 @@
|
||||
%%{init: { 'theme': 'forest', 'themeVariables': {
|
||||
'primaryColor': '#059669',
|
||||
'primaryTextColor': '#fff',
|
||||
'lineColor': '#065f46',
|
||||
'secondaryColor': '#10b981',
|
||||
'edgeLabelBackground':'#064e3b',
|
||||
'edgeLabelColor': '#ffffff'
|
||||
}}}%%
|
||||
|
||||
flowchart TD
|
||||
%% ===== POWER CUT EVENT =====
|
||||
Start([Power Cut Event]) -->|Electricity Lost| Crash[Kubernetes Components Crash]
|
||||
|
||||
%% ===== IMMEDIATE IMPACT =====
|
||||
Crash --> KubeletCrash[Kubelet Processes Crash<br>on all 3 nodes]
|
||||
Crash --> DockerCrash[Docker Daemons Crash<br>on all 3 nodes]
|
||||
Crash --> K3sCrash[K3s Server Process Crash<br>on pi1]
|
||||
|
||||
%% ===== DOCKER STORAGE CORRUPTION =====
|
||||
DockerCrash --> Overlay2[ /mnt/arcodange/docker/overlay2/<br>Filesystem Corrupted]
|
||||
Overlay2 --> DockerFail[Docker containers cannot start<br>missing layer files]
|
||||
DockerFail --> CoreDNSPod[CoreDNS Pod<br>CrashLoopBackOff]
|
||||
DockerFail --> TraefikLB[svclb-traefik Pods<br>CrashLoopBackOff]
|
||||
|
||||
%% ===== LONGHORN IMPACT =====
|
||||
KubeletCrash --> CSIUnreg[CSI Driver Registration Lost<br>driver.longhorn.io unregistered]
|
||||
K3sCrash --> HelmCtrl[HelmChart Controller<br>Unresponsive]
|
||||
|
||||
CSIUnreg --> CSISocket[ /var/lib/kubelet/plugins/.../csi.sock<br>Disappears]
|
||||
|
||||
%% ===== LONGHORN MANAGER LOSS =====
|
||||
KubeletCrash --> LHManagers[Longhorn Manager Pods<br>Crash 3 pods ]
|
||||
LHManagers --> NoQuorum[No Manager Quorum<br>Cannot coordinate]
|
||||
NoQuorum --> VolumesFrozen[Existing Volumes<br>Still healthy but inaccessible]
|
||||
|
||||
CSISocket --> CSIChicago[CSI Pods Cannot Start<br>csi-attacher, provisioner, resizer, snapshotter]
|
||||
CSISocket --> CSIPlugin[CSI Plugin DaemonSet<br>Cannot register driver]
|
||||
|
||||
%% ===== VOLUME MOUNT FAILURES =====
|
||||
CSIChicago --> NoMounts[PVC Mounts Fail<br>All Longhorn PVs inaccessible]
|
||||
CSIPlugin --> NoMounts
|
||||
|
||||
%% ===== APPLICATION CASCADING FAILURES =====
|
||||
NoMounts --> TraefikDown[Traefik Pod<br>PVC mount failed<br>Error state]
|
||||
NoMounts --> AppPods1[Application Pods<br>PVC mount failed<br>Error state<br>cms, webapp, erp, clickhouse, etc.]
|
||||
|
||||
%% ===== BACKUP SYSTEM IMPACT =====
|
||||
NoQuorum --> NFSDown[NFS Share-Manager Pods<br>Error state]
|
||||
NFSDown --> BackupMount[ /mnt/backups/ NFS Mount<br>Unavailable]
|
||||
|
||||
%% ===== DISCOVERY & RECOVERY =====
|
||||
Discovery[15:23:57<br>Incident Discovered] --> Assessment[15:24:05<br>Assessment Complete]
|
||||
Assessment --> Identify[15:24:10<br>Root Cause: CSI Driver Unregistered]
|
||||
Identify --> CheckData[15:24:15<br>Verify Volume Health]
|
||||
CheckData --> DataIntact[All 12 volumes:<br>state=attached<br>robustness=healthy]
|
||||
|
||||
%% ===== RECOVERY ATTEMPTS =====
|
||||
Identify --> Attempt1[15:24:50<br>Attempt 1: Touch HelmChart Manifest]
|
||||
Attempt1 --> Partial1[Only 1 manager pod affected]
|
||||
Partial1 --> NeedMore[Insufficient recovery]
|
||||
|
||||
NeedMore --> Attempt2[15:32:15<br>Attempt 2: Delete All Longhorn Pods]
|
||||
Attempt2 --> HelmReconcile[HelmChart Controller<br>Recreates All 24 Pods]
|
||||
|
||||
HelmReconcile --> Progress[15+ Pods Running<br>Managers, Engine-Image, Some CSI]
|
||||
Progress --> Blocked[Driver-Deployer<br>Stuck in Init:0/1]
|
||||
|
||||
Blocked --> Investigate[15:34:30<br>Investigate wait-longhorn-manager]
|
||||
Investigate --> WaitLoop[Init container runs:<br>longhorn-manager wait -d longhorn-system]
|
||||
WaitLoop --> WaitingManagers[Waiting for all managers<br>to pass readiness probes]
|
||||
|
||||
%% ===== CURRENT STATE (15:35:30) =====
|
||||
WaitingManagers --> CurrentState
|
||||
|
||||
subgraph CurrentState["Current State<br>15:35:30 UTC"]
|
||||
direction TB
|
||||
|
||||
Resolved[Resolved ✅] --> ManagersOk[Manager Pods:<br>2/2, 1/2, 2/2 Running<br>pi1, pi2, pi3]
|
||||
Resolved --> EngineOk[Engine Image:<br>3/3 Running]
|
||||
Resolved --> CSIPartial[CSI Sidecars:<br>~50% Running]
|
||||
Resolved --> VolumeData[Volume Data:<br>All intact]
|
||||
|
||||
BlockedNow[Blocked ❌] --> DriverDeployer[Driver Deployer:<br>Init:0/1 8+ min<br>waiting for managers]
|
||||
BlockedNow --> CSIPluginAll[CSI Plugin:<br>0/3 Error all ]
|
||||
BlockedNow --> UI[Longhorn UI:<br>0/2 CrashLoop]
|
||||
BlockedNow --> ShareMgr[Share Manager:<br>0/2 Error]
|
||||
BlockedNow --> NFSPod[RWX NFS:<br>ContainerCreating]
|
||||
|
||||
BlockedNow --> AppImpact[Application Impact:<br>~30 pods still failed<br>down from 43]
|
||||
end
|
||||
|
||||
%% ===== RECOVERY PATH =====
|
||||
CurrentState --> NextStep[Next: Resolve driver-deployer<br>wait-longhorn-manager blockage]
|
||||
|
||||
NextStep --> CheckHealth[Check manager health endpoints<br>https://<ip>:9501/v1/healthz]
|
||||
CheckHealth -->|If healthy| WaitContainerIssue[Wait container bug/timeout]
|
||||
CheckHealth -->|If unhealthy| FixManagers[Investigate manager readiness]
|
||||
|
||||
WaitContainerIssue --> Option1[Option 1: Delete driver-deployer pod]
|
||||
WaitContainerIssue --> Option2[Option 2: Touch manifest again]
|
||||
|
||||
FixManagers --> CheckLogs[Check manager container logs]
|
||||
CheckLogs --> ResolveManagers[Fix manager readiness]
|
||||
|
||||
Option1 --> CSIDriver[CSI Driver deployed]
|
||||
Option2 --> CSIDriver
|
||||
ResolveManagers --> CSIDriver
|
||||
|
||||
CSIDriver --> CSISocketRestored[CSI Socket Restored]
|
||||
CSISocketRestored --> PodsRecover[All Longhorn pods recover]
|
||||
PodsRecover --> PVCMounts[PVC Mounts resume]
|
||||
PVCMounts --> AppRecovery[Application pods auto-recover]
|
||||
AppRecovery --> ResolvedState[Resolved ✅]
|
||||
|
||||
%% ===== STYLES =====
|
||||
classDef event fill:#10b981,color:#fff,stroke:#059669
|
||||
classDef impact fill:#d97706,color:#000,stroke:#b45309
|
||||
classDef action fill:#3b82f6,color:#fff,stroke:#2563eb
|
||||
classDef resolved fill:#10b981,color:#fff,stroke:#059669
|
||||
classDef blocked fill:#ef4444,color:#fff,stroke:#dc2626
|
||||
classDef current fill:#8b5cf6,color:#fff,stroke:#7c3aed
|
||||
|
||||
class Start,Crash,KubeletCrash,DockerCrash,K3sCrash event
|
||||
class Overlay2,DockerFail,CSIUnreg,CSISocket,NoQuorum,NoMounts impact
|
||||
class Discovery,Assessment,Identify,CheckData,Attempt1,Attempt2,Investigate action
|
||||
class ManagersOk,EngineOk,CSIPartial,VolumeData resolved
|
||||
class DriverDeployer,CSIPluginAll,UI,ShareMgr,NFSPod,AppImpact blocked
|
||||
class WaitLoop,CurrentState,NextStep,CheckHealth,Option1,Option2,ResolvedState current
|
||||
|
||||
classDef subtitle fill:#64748b,color:#fff,stroke:#475569,font-size:12px
|
||||
class CurrentState,CurrentStateLabel subtitle
|
||||
1103
ansible/arcodange/factory/docs/incidents/2026-04-13-power-cut/log.md
Normal file
File diff suppressed because it is too large
@@ -0,0 +1,416 @@
|
||||
---
|
||||
title: PVC Recovery — Post-Reinstall Volume Restoration
|
||||
incident_id: 2026-04-13-001
|
||||
date: 2026-04-14
|
||||
status: Mostly Resolved
|
||||
operator: Claude Code
|
||||
---
|
||||
|
||||
# PVC Recovery — Post-Reinstall Volume Restoration
|
||||
|
||||
## Situation as of 2026-04-14
|
||||
|
||||
Longhorn has been fully reinstalled and is healthy. The cluster nodes are all Ready. However,
|
||||
**all application volumes are inaccessible** because the nuclear cleanup deleted the Longhorn
|
||||
Volume/Engine/Replica CRDs, and the reinstalled Longhorn has no knowledge of the old volumes.
|
||||
|
||||
### Longhorn Health (verified)
|
||||
|
||||
```
|
||||
NAME READY STATUS AGE
|
||||
csi-attacher (3 pods) 1/1 Running 30m
|
||||
csi-provisioner (3 pods) 1/1 Running 30m
|
||||
csi-resizer (3 pods) 1/1 Running 30m
|
||||
csi-snapshotter (3 pods) 1/1 Running 30m
|
||||
engine-image-ei-b4bcf0a5 (3 pods) 1/1 Running 31m
|
||||
instance-manager (3 pods) 1/1 Running 30m
|
||||
longhorn-csi-plugin (3 pods) 3/3 Running 30m
|
||||
longhorn-driver-deployer 1/1 Running 31m
|
||||
longhorn-manager (3 pods) 2/2 Running 14m
|
||||
longhorn-ui (2 pods) 1/1 Running 31m
|
||||
|
||||
CSIDriver driver.longhorn.io: Registered (AGE: 110d — restored)
|
||||
```
|
||||
|
||||
Longhorn only knows about 3 volumes (crowdsec-config, crowdsec-db, traefik) — all newly provisioned
|
||||
after reinstall. The other 9 volumes are missing from Longhorn's knowledge.
|
||||
|
||||
---
|
||||
|
||||
## Backup Files Available
|
||||
|
||||
| File | Location | Contents | Gap |
|
||||
|------|----------|----------|-----|
|
||||
| `backup_20260413.volumes` | `/home/pi/arcodange/backups/k3s_pvc/` | PV + PVC YAML (kubectl get -A pv,pvc) | No Longhorn CRDs |
|
||||
| `longhorn_metadata_20260413.yaml` | `/home/pi/arcodange/backups/k3s_pvc/` | Engines + Replicas CRDs | **No Volume CRDs** |
|
||||
|
||||
**Critical gap:** The metadata backup was collected with `kubectl get -n longhorn-system volumes.longhorn.io,replicas.longhorn.io,engines.longhorn.io -o yaml` but the resulting file contains only Engines and Replicas in 3 separate Lists. The Volume CRDs are absent.
|
||||
|
||||
Attempting `kubectl apply -f longhorn_metadata_20260413.yaml` fails with:
|
||||
```
|
||||
Error from server (Invalid): admission webhook "validator.longhorn.io" denied the request:
|
||||
volume does not exist for engine
|
||||
```
|
||||
The webhook requires Volume CRDs to exist before Engines can be created. Without Volume CRDs in the
|
||||
backup, the metadata file cannot be applied as-is.
|
||||
|
||||
---
|
||||
|
||||
## Data Survival Assessment
|
||||
|
||||
### Pi1 — Replica directories
|
||||
|
||||
Pi1 is the control plane. Its old replica directories were **deleted** during the nuclear cleanup.
|
||||
Only 3 new directories exist (created after reinstall):
|
||||
|
||||
```
|
||||
pvc-01b93e30-...-b1530c1d (crowdsec-config — NEW)
|
||||
pvc-4785dc60-...-2f031b60 (crowdsec-db — NEW)
|
||||
pvc-5391fa2b-...-0e2ff956 (traefik — NEW)
|
||||
```
|
||||
|
||||
### Pi2 — Replica directories (OLD data preserved)
|
||||
|
||||
```
|
||||
pvc-01b93e30-...-8649439a (crowdsec-config — new post-reinstall)
|
||||
pvc-1251909b-...-e7a20fdf ← OLD DATA (clickhouse 16Gi)
|
||||
pvc-14ccc47e-...-09021065 ← OLD DATA (crowdsec-db old PV)
|
||||
pvc-4785dc60-...-4b48fdf1 (crowdsec-db — new post-reinstall)
|
||||
pvc-5391fa2b-...-d3503612 (traefik — new post-reinstall)
|
||||
pvc-63244de1-...-6076eb08 (unknown — not in engine backup)
|
||||
pvc-6d2ea1c7-...-c7f287d8 ← OLD DATA (audit-vault 10Gi)
|
||||
pvc-7971918e-...-2028617e ← OLD DATA (erp 50Gi)
|
||||
pvc-88e18c7f-...-910583f6 ← OLD DATA (prometheus-server 8Gi)
|
||||
pvc-abc7666c-...-34bec9b0 (unknown — not in engine backup)
|
||||
pvc-aed7f2c4-...-41c20064 ← OLD DATA (alertmanager 2Gi)
|
||||
pvc-ca5567d3-...-b537ca60 ← OLD DATA (data-vault 10Gi)
|
||||
pvc-cc8a3cbb-...-cd16e459 ← OLD DATA (old traefik 128Mi)
|
||||
pvc-cdd434d1-...-b2695689 ← OLD DATA (url-shortener 128Mi)
|
||||
pvc-d1d5482b-...-e0a8cdbc ← OLD DATA (redis 1Gi)
|
||||
pvc-efda1d2f-...-30c849a6 ← OLD DATA (backups-rwx 50Gi)
|
||||
pvc-f9fe3504-...-20f64e9e ← OLD DATA (old crowdsec-config 100Mi)
|
||||
pvc-fca13978-...-4749b404 (unknown — not in engine backup)
|
||||
```
|
||||
|
||||
### Pi3 — Replica directories (OLD data preserved, multiple dirs per volume)
|
||||
|
||||
```
|
||||
pvc-01b93e30-...-29592f50 (crowdsec-config — new post-reinstall)
|
||||
pvc-1251909b-...-1163420b ← OLD DATA (clickhouse — replica 1)
|
||||
pvc-1251909b-...-3a569b0a ← OLD DATA (clickhouse — replica 2)
|
||||
pvc-1251909b-...-ccd05947 ← OLD DATA (clickhouse — replica 3 or stale)
|
||||
pvc-14ccc47e-...-3856d64d ← OLD DATA (old crowdsec-db)
|
||||
pvc-2e60385f-...-48e27d5a (unknown)
|
||||
pvc-4785dc60-...-869f0e99 (crowdsec-db — new post-reinstall)
|
||||
pvc-5391fa2b-...-958cd868 (traefik — new post-reinstall)
|
||||
pvc-6d2ea1c7-...-0e73550d ← OLD DATA (audit-vault — dir 1)
|
||||
pvc-6d2ea1c7-...-787ffefa ← OLD DATA (audit-vault — dir 2)
|
||||
pvc-6d2ea1c7-...-e0f58d64 ← OLD DATA (audit-vault — dir 3 or stale)
|
||||
pvc-7971918e-...-33191046 ← OLD DATA (erp — dir 1)
|
||||
pvc-7971918e-...-88fc1dfc ← OLD DATA (erp — dir 2)
|
||||
pvc-7971918e-...-b5c5530d ← OLD DATA (erp — dir 3 or stale)
|
||||
pvc-88e18c7f-...-5d508830 ← OLD DATA (prometheus-server — dir 1)
|
||||
pvc-88e18c7f-...-92c0ebfd ← OLD DATA (prometheus-server — dir 2)
|
||||
pvc-88e18c7f-...-deea6182 ← OLD DATA (prometheus-server — dir 3 or stale)
|
||||
pvc-abe09e90-...-a748d11b (unknown)
|
||||
pvc-aed7f2c4-...-3452358f ← OLD DATA (alertmanager — dir 1)
|
||||
pvc-aed7f2c4-...-826f05aa ← OLD DATA (alertmanager — dir 2)
|
||||
pvc-ca5567d3-...-0ed6f691 ← OLD DATA (data-vault — dir 1)
|
||||
pvc-ca5567d3-...-808d72b4 ← OLD DATA (data-vault — dir 2)
|
||||
pvc-ca5567d3-...-9051ef48 ← OLD DATA (data-vault — dir 3 or stale)
|
||||
pvc-cc8a3cbb-...-011b54b3 ← OLD DATA (old traefik — dir 1)
|
||||
pvc-cc8a3cbb-...-a24fd91e ← OLD DATA (old traefik — dir 2)
|
||||
pvc-cdd434d1-...-70197659 ← OLD DATA (url-shortener — dir 1)
|
||||
pvc-cdd434d1-...-998f49ff ← OLD DATA (url-shortener — dir 2)
|
||||
pvc-d1d5482b-...-6a730f00 ← OLD DATA (redis — dir 1)
|
||||
pvc-d1d5482b-...-75da16fd ← OLD DATA (redis — dir 2)
|
||||
pvc-efda1d2f-...-62fb04c9 ← OLD DATA (backups-rwx — dir 1)
|
||||
pvc-efda1d2f-...-688f30f5 ← OLD DATA (backups-rwx — dir 2)
|
||||
pvc-efda1d2f-...-69454dd0 ← OLD DATA (backups-rwx — dir 3 or stale)
|
||||
pvc-f9fe3504-...-418df608 ← OLD DATA (old crowdsec-config)
|
||||
```
|
||||
|
||||
**Note on multiple directories per volume on pi3:** Normal replicas = 1 dir per volume per node.
|
||||
Multiple directories indicate either: rebuild attempts from before the nuclear cleanup, or stale
|
||||
snapshots. Must verify by checking `.img` file sizes before renaming.
|
||||
|
||||
---
|
||||
|
||||
## Volume → PVC Mapping (from backup_20260413.volumes)
|
||||
|
||||
| PV Name | PVC | Namespace | Size | Status |
|
||||
|---------|-----|-----------|------|--------|
|
||||
| `pvc-1251909b-3cef-40c6-881c-3bb6e929a596` | `clickhouse-storage-clickhouse-0` | tools | 16Gi | Terminating |
|
||||
| `pvc-6d2ea1c7-9327-4992-a02c-93ae604eda70` | `audit-hashicorp-vault-0` | tools | 10Gi | Terminating |
|
||||
| `pvc-7971918e-e47f-4739-a976-965ea2d770b4` | `erp` | erp | 50Gi | Terminating |
|
||||
| `pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9` | `prometheus-server` | tools | 8Gi | Terminating |
|
||||
| `pvc-aed7f2c4-1948-487a-8d10-d8a1372289b4` | `storage-prometheus-alertmanager-0` | tools | 2Gi | Terminating |
|
||||
| `pvc-ca5567d3-a682-4cee-8ff1-2b8e23260635` | `data-hashicorp-vault-0` | tools | 10Gi | Terminating |
|
||||
| `pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90` | `traefik` | kube-system | 128Mi | Terminating |
|
||||
| `pvc-cdd434d1-88b4-4588-8fd2-8c7eafc56d07` | `url-shortener` | url-shortener | 128Mi | Terminating |
|
||||
| `pvc-d1d5482b-81c8-4d7c-a528-7a57ef47a5ce` | `redis-storage-redis-0` | tools | 1Gi | Terminating |
|
||||
| `pvc-efda1d2f-1db8-46dd-9a97-3d11f1807ffa` | `backups-rwx` | longhorn-system | 50Gi | Lost |
|
||||
| `pvc-14ccc47e-0b8c-49d4-97bb-70e550f644b0` | `crowdsec-db-pvc` | tools | 1Gi | already replaced |
|
||||
| `pvc-f9fe3504-70ce-4401-8cda-bc6bb68bc1bf` | `crowdsec-config-pvc` | tools | 100Mi | already replaced |
|
||||
|
||||
CrowdSec volumes (`pvc-14ccc47e`, `pvc-f9fe3504`) are the old PVs — CrowdSec already got new volumes
|
||||
(`pvc-4785dc60`, `pvc-01b93e30`) and is running. These old dirs can be cleaned up later.
|
||||
|
||||
---
|
||||
|
||||
## Recovery Plan
|
||||
|
||||
### Why not restore PVCs
|
||||
|
||||
New PVCs will be created by the workloads themselves when they restart. Restoring old PVCs would
|
||||
conflict with both the stuck Terminating ones and any new ones pods may already be creating.
|
||||
**Restore PVs only** — strip `claimRef` so they become `Available`, and new PVCs bind to them via
|
||||
`storageClassName` + `accessMode` + `capacity` matching.
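
For orientation, this is roughly what one restored PV should look like after stripping (clickhouse as the example). It is a minimal sketch, not the exact backup contents — `storageClassName` and `fsType` here are assumptions, so copy whatever the backup actually contains:

```bash
cat <<'EOF' | kubectl apply -f -
apiVersion: v1
kind: PersistentVolume
metadata:
  name: pvc-1251909b-3cef-40c6-881c-3bb6e929a596
spec:
  capacity:
    storage: 16Gi
  accessModes:
    - ReadWriteOnce
  persistentVolumeReclaimPolicy: Retain   # keep data even if the PVC goes away
  storageClassName: longhorn              # assumption — copy from the backup
  csi:
    driver: driver.longhorn.io
    volumeHandle: pvc-1251909b-3cef-40c6-881c-3bb6e929a596
    fsType: ext4                          # assumption — copy from the backup
  # no claimRef: the PV stays Available until a matching PVC binds to it
EOF
```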
|
||||
|
||||
### Step 1 — Clear stuck Terminating PVs
|
||||
|
||||
The old PVs are stuck in `Terminating` with `kubernetes.io/pvc-protection` finalizers. Remove them:
|
||||
|
||||
```bash
|
||||
for pv in \
|
||||
pvc-1251909b-3cef-40c6-881c-3bb6e929a596 \
|
||||
pvc-6d2ea1c7-9327-4992-a02c-93ae604eda70 \
|
||||
pvc-7971918e-e47f-4739-a976-965ea2d770b4 \
|
||||
pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9 \
|
||||
pvc-aed7f2c4-1948-487a-8d10-d8a1372289b4 \
|
||||
pvc-ca5567d3-a682-4cee-8ff1-2b8e23260635 \
|
||||
pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 \
|
||||
pvc-cdd434d1-88b4-4588-8fd2-8c7eafc56d07 \
|
||||
pvc-d1d5482b-81c8-4d7c-a528-7a57ef47a5ce \
|
||||
pvc-efda1d2f-1db8-46dd-9a97-3d11f1807ffa; do
|
||||
kubectl patch pv $pv -p '{"metadata":{"finalizers":null}}' --type=merge
|
||||
done
|
||||
```
|
||||
|
||||
### Step 2 — Restore PVs with claimRef removed and Retain policy
|
||||
|
||||
Extract PVs from the backup, strip `claimRef` and set `persistentVolumeReclaimPolicy: Retain`,
|
||||
then apply:
|
||||
|
||||
```bash
|
||||
ssh pi1 "sudo kubectl get pv \
|
||||
pvc-1251909b-3cef-40c6-881c-3bb6e929a596 \
|
||||
pvc-6d2ea1c7-9327-4992-a02c-93ae604eda70 \
|
||||
pvc-7971918e-e47f-4739-a976-965ea2d770b4 \
|
||||
pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9 \
|
||||
pvc-aed7f2c4-1948-487a-8d10-d8a1372289b4 \
|
||||
pvc-ca5567d3-a682-4cee-8ff1-2b8e23260635 \
|
||||
pvc-cc8a3cbb-dbc2-47a2-a0cc-a02136122b90 \
|
||||
pvc-cdd434d1-88b4-4588-8fd2-8c7eafc56d07 \
|
||||
pvc-d1d5482b-81c8-4d7c-a528-7a57ef47a5ce \
|
||||
pvc-efda1d2f-1db8-46dd-9a97-3d11f1807ffa \
|
||||
-o yaml 2>/dev/null | \
|
||||
python3 -c \"
|
||||
import sys, yaml
|
||||
docs = list(yaml.safe_load_all(sys.stdin))
|
||||
for doc in docs:
|
||||
if not doc: continue
|
||||
items = doc.get('items', [doc])
|
||||
for pv in items:
|
||||
if pv.get('kind') != 'PersistentVolume': continue
|
||||
spec = pv.get('spec', {})
|
||||
spec.pop('claimRef', None)
|
||||
spec['persistentVolumeReclaimPolicy'] = 'Retain'
|
||||
pv.pop('status', None)
|
||||
meta = pv.get('metadata', {})
|
||||
meta.pop('resourceVersion', None)
|
||||
meta.pop('uid', None)
|
||||
meta.pop('creationTimestamp', None)
|
||||
print('---')
|
||||
print(yaml.dump(pv))
|
||||
\" | kubectl apply -f -"
|
||||
```
|
||||
|
||||
Expected result: PVs become `Available` (no claimRef = unbound).
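
A quick sanity check at this point:

```bash
# Every restored PV should now report Available with a Retain reclaim policy
kubectl get pv -o wide | grep -E 'pvc-(1251909b|6d2ea1c7|7971918e|88e18c7f|aed7f2c4|ca5567d3|cc8a3cbb|cdd434d1|d1d5482b|efda1d2f)'
```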
|
||||
|
||||
### Step 3 — Longhorn creates new Volume CRDs + replica dirs
|
||||
|
||||
When new PVCs bind to the restored PVs and pods attempt to mount them, Longhorn's CSI provisioner
|
||||
will create new Volume CRDs for each. These new Volume CRDs will have new engine IDs, and Longhorn
|
||||
will create **new empty replica directories** on pi1, pi2, pi3.
|
||||
|
||||
At this point the volume directory layout will be:
|
||||
```
|
||||
/mnt/arcodange/longhorn/replicas/
|
||||
pvc-1251909b-...-<OLD_SUFFIX> ← pi2/pi3: OLD data
|
||||
pvc-1251909b-...-<NEW_SUFFIX> ← pi1/pi2/pi3: NEW empty dirs
|
||||
```
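
To watch the provisioner do this (optional, purely observational):

```bash
# New Volume CRDs appear as PVCs bind to the restored PVs
kubectl get volumes.longhorn.io -n longhorn-system -w

# In another terminal, the new (empty) replica dirs show up on the nodes
ssh pi2 "ls -lt /mnt/arcodange/longhorn/replicas/ | head"
```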
|
||||
|
||||
### Step 4 — Map old dirs to new dirs, verify data presence
|
||||
|
||||
For each volume, on each node, identify:
|
||||
- OLD dir: exists before new binding (larger .img file size, older timestamp)
|
||||
- NEW dir: created after binding (empty or minimal .img file)
|
||||
|
||||
```bash
|
||||
# Example: check sizes on pi2 for clickhouse
|
||||
ssh pi2 "du -sh /mnt/arcodange/longhorn/replicas/pvc-1251909b-*"
|
||||
```
|
||||
|
||||
### Step 5 — Swap directories (Method B)
|
||||
|
||||
For each volume on each node that has an old dir with data:
|
||||
|
||||
```bash
|
||||
# Scale down the workload first
|
||||
kubectl scale statefulset clickhouse -n tools --replicas=0
|
||||
|
||||
# Wait for volume to detach
|
||||
kubectl wait --for=jsonpath='{.status.state}'=detached \
|
||||
volume/pvc-1251909b-3cef-40c6-881c-3bb6e929a596 \
|
||||
-n longhorn-system --timeout=60s
|
||||
|
||||
# On pi2: rename new empty dir, move old data dir to new name
|
||||
ssh pi2 "
|
||||
NEW=\$(ls /mnt/arcodange/longhorn/replicas/ | grep pvc-1251909b | \
|
||||
xargs -I{} stat --format='%Y {}' /mnt/arcodange/longhorn/replicas/{} | \
|
||||
sort -rn | head -1 | awk '{print \$2}')
|
||||
OLD=\$(ls /mnt/arcodange/longhorn/replicas/ | grep pvc-1251909b | \
|
||||
xargs -I{} stat --format='%Y {}' /mnt/arcodange/longhorn/replicas/{} | \
|
||||
sort -n | head -1 | awk '{print \$2}')
|
||||
echo \"OLD: \$OLD\"
|
||||
echo \"NEW: \$NEW\"
|
||||
sudo mv \$NEW \${NEW}.empty_backup
|
||||
sudo mv \$OLD \$NEW
|
||||
"
|
||||
# Repeat on pi3
|
||||
|
||||
# Restart the instance manager on affected node to pick up new dir
|
||||
kubectl delete pod -n longhorn-system -l \
|
||||
longhorn.io/node=pi2,longhorn.io/component=instance-manager
|
||||
```
|
||||
|
||||
### Step 6 — Scale workloads back up and verify
|
||||
|
||||
```bash
|
||||
kubectl scale statefulset clickhouse -n tools --replicas=1
|
||||
kubectl get pvc -n tools clickhouse-storage-clickhouse-0
|
||||
kubectl get volumes -n longhorn-system pvc-1251909b-3cef-40c6-881c-3bb6e929a596
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Priority Order for Recovery
|
||||
|
||||
Given data criticality:
|
||||
|
||||
1. **HashiCorp Vault data** (`pvc-ca5567d3` + `pvc-6d2ea1c7`) — credentials/secrets store
|
||||
2. **ERP** (`pvc-7971918e`) — 50Gi, business data
|
||||
3. **Prometheus** (`pvc-88e18c7f`) — 8Gi, metrics history (degraded OK, can rebuild)
|
||||
4. **Redis** (`pvc-d1d5482b`) — 1Gi, cache (can rebuild from scratch if needed)
|
||||
5. **Alertmanager** (`pvc-aed7f2c4`) — 2Gi, alert history (can rebuild)
|
||||
6. **Clickhouse** (`pvc-1251909b`) — 16Gi
|
||||
7. **URL shortener** (`pvc-cdd434d1`) — 128Mi
|
||||
8. **Traefik** (`pvc-cc8a3cbb`) — 128Mi (TLS certs, can re-issue via cert-manager)
|
||||
9. **Longhorn backups-rwx** (`pvc-efda1d2f`) — 50Gi, backup volume itself
|
||||
|
||||
---
|
||||
|
||||
## Caution: Multiple Dirs on Pi3
|
||||
|
||||
Several volumes have 3 directories on pi3. This likely happened during the incident when Longhorn
|
||||
attempted rebuilds before the nuclear cleanup. **Do not blindly take the newest or oldest** — check
|
||||
actual `.img` file size to identify the one with data:
|
||||
|
||||
```bash
|
||||
ssh pi3 "du -sh /mnt/arcodange/longhorn/replicas/pvc-1251909b-*"
|
||||
# The largest .img is the one with actual data
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Lessons for Backup Script
|
||||
|
||||
The current backup command `kubectl get -A pv,pvc -o yaml && echo '---' && kubectl get -A pvc -o yaml`
|
||||
captures PV/PVC but not Longhorn Volume CRDs. The backup command must be updated to include:
|
||||
|
||||
```bash
|
||||
kubectl get -A pv -o yaml && echo '---' \
|
||||
&& kubectl get -A pvc -o yaml && echo '---' \
|
||||
&& kubectl get -n longhorn-system volumes.longhorn.io -o yaml
|
||||
```
|
||||
|
||||
This is tracked in ADR `docs/adr/20260414-longhorn-pvc-recovery.md` under "Prevention".
|
||||
|
||||
---
|
||||
|
||||
## Volume Recovery Status
|
||||
|
||||
| PV Name | PVC | Namespace | Size | Method | Status |
|
||||
|---------|-----|-----------|------|--------|--------|
|
||||
| `pvc-5391fa2b` | `traefik` | kube-system | 128Mi | PV claimRef remove | ✅ 2026-04-14 |
|
||||
| `pvc-cdd434d1` | `url-shortener-data` | url-shortener | 128Mi | Method B (dir rename) | ✅ 2026-04-14 |
|
||||
| `pvc-1251909b` | `clickhouse-storage-clickhouse-0` | tools | 16Gi | Block-device (playbook) | ✅ 2026-04-14 |
|
||||
| `pvc-88e18c7f` | `prometheus-server` | tools | 8Gi | Block-device (playbook) | ⏳ 2026-04-15 |
|
||||
| `pvc-aed7f2c4` | `storage-prometheus-alertmanager-0` | tools | 2Gi | Block-device (playbook) | ⏳ 2026-04-15 |
|
||||
| `pvc-d1d5482b` | `redis-storage-redis-0` | tools | 1Gi | Block-device (playbook) | ⏳ 2026-04-15 |
|
||||
| `pvc-efda1d2f` | `backups-rwx` | longhorn-system | 50Gi | Block-device (playbook) | ⏳ 2026-04-15 |
|
||||
| `pvc-ca5567d3` | `data-hashicorp-vault-0` | tools | 10Gi | Manual (deferred) | 🔴 Pending |
|
||||
| `pvc-6d2ea1c7` | `audit-hashicorp-vault-0` | tools | 10Gi | Manual (deferred) | 🔴 Pending |
|
||||
| `pvc-7971918e` | `erp` | erp | 50Gi | Manual (deferred) | 🔴 Pending |
|
||||
|
||||
**Vault and ERP are excluded from automated recovery** — they require coordinated manual procedures
|
||||
(Vault unseal key management; ERP business data verification). Use `docs/runbooks/longhorn-block-device-recovery.md`
|
||||
with extra validation steps for those volumes.
|
||||
|
||||
---
|
||||
|
||||
## Automated Recovery: Block-Device Injection
|
||||
|
||||
Directory rename (Method B) proved too risky for large volumes: Longhorn detects `Dirty: true` +
|
||||
inconsistency across replicas and silently rebuilds from the empty pi1 replica, destroying data.
|
||||
|
||||
**The approach that works** (implemented in `playbooks/recover/longhorn_data.yml`):
|
||||
|
||||
1. **Phase 0** — Auto-discover best replica dir per volume (skip `Rebuilding: true`, rank by actual disk usage)
|
||||
2. **Phase 1** — Backup untouched replica dir before touching anything
|
||||
3. **Phase 2** — Merge sparse snapshot + head layers into a flat image (`merge-longhorn-layers.py`)
|
||||
4. **Phase 3** — Create Longhorn Volume CRD, wait for replicas
|
||||
5. **Phase 4** — Scale down workload
|
||||
6. **Phase 5** — Attach volume via VolumeAttachment maintenance ticket
|
||||
7. **Phase 6** — `mkfs.ext4` the live block device, rsync data from merged image
|
||||
8. **Phase 7** — Remove maintenance attachment ticket
|
||||
9. **Phase 8** — Recreate PV (Retain, no claimRef) + PVC (pinned to PV)
|
||||
10. **Phase 9** — Scale up, wait for readyReplicas ≥ 1, optional verify_cmd
|
||||
|
||||
**Pitfall discovered (2026-04-15):** `du -sb` returns apparent size for sparse files, making a
|
||||
`Rebuilding: true` replica (1.3 GiB actual, 24 GiB apparent) beat healthy 11 GiB replicas.
|
||||
Fixed by checking `Rebuilding` flag in `volume.meta` and using `du -sk` (actual usage).
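
A shell sketch of the corrected selection logic (the authoritative implementation is Phase 0 of `playbooks/recover/longhorn_data.yml`; this only illustrates the idea):

```bash
# Run as root on the node holding the replicas (pi2 / pi3).
for d in /mnt/arcodange/longhorn/replicas/pvc-1251909b-*; do
  # volume.meta is JSON; a half-rebuilt replica carries "Rebuilding": true
  if grep -q '"Rebuilding": *true' "$d/volume.meta"; then
    echo "skip (rebuilding): $d" >&2
    continue
  fi
  # du -sk counts allocated blocks, not the sparse apparent size (the du -sb trap)
  echo "$(du -sk "$d" | cut -f1) KiB  $d"
done | sort -rn   # best candidate on top
```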
|
||||
|
||||
**Usage:**
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \
|
||||
-e @playbooks/recover/longhorn_data_vars_remaining.yml
|
||||
```
|
||||
|
||||
Vars files:
|
||||
- `playbooks/recover/longhorn_data_vars_clickhouse.yml` — clickhouse (already recovered, archived)
|
||||
- `playbooks/recover/longhorn_data_vars_remaining.yml` — prometheus, alertmanager, redis, backups-rwx
|
||||
- `playbooks/recover/longhorn_data_vars.example.yml` — template for future use
|
||||
|
||||
---
|
||||
|
||||
## Tested Recovery Procedure (url-shortener — 2026-04-14)
|
||||
|
||||
Method B confirmed working for this volume (small, no Rebuilding replicas). Full sequence:
|
||||
|
||||
1. Create Longhorn Volume CRD manually (size 128Mi, rwo, 3 replicas)
|
||||
2. Create Longhorn VolumeAttachment ticket to pi1 (disableFrontend: true) → triggers replica dir creation
|
||||
3. Remove attachment ticket → volume detaches
|
||||
4. On pi2: `mv new-dir new-dir.empty && mv old-dir new-dir`
|
||||
5. On pi3: same (chose `-70197659` over `-998f49ff` based on newer mtime: Apr 7 vs Apr 6)
|
||||
6. Clear finalizers on stuck Terminating PV/PVC → both deleted
|
||||
7. Recreate PV (Retain policy, no claimRef, same CSI volumeHandle)
|
||||
8. Recreate PVC with `volumeName:` pinned to the PV
|
||||
9. Delete old Error pod (was blocking volume attach)
|
||||
10. New pod comes up 1/1 Running, volume attached healthy on pi3, all 3 replicas running
|
||||
|
||||
**Traefik** was simpler — PV `pvc-5391fa2b` already existed in Longhorn (Released). Just removed
|
||||
claimRef (→ Available), created `kube-system/traefik` PVC with `volumeName:` pinned. Bound immediately.
|
||||
|
||||
**For all subsequent volumes** — use `playbooks/recover/longhorn_data.yml`. Method B is too risky.
|
||||
@@ -0,0 +1,70 @@
|
||||
---
|
||||
# Automated Longhorn Recovery Playbook (DRAFT)
|
||||
# Purpose: Break circular dependency and restore CSI driver after power-cut
|
||||
#
|
||||
# REQUIREMENTS:
|
||||
# - Ansible >= 2.15
|
||||
# - kubectl on control plane (pi1)
|
||||
# - Backup scripts from playbooks/backup/k3s_pvc.yml must be deployed
|
||||
#
|
||||
# USAGE:
|
||||
# ansible-playbook -i inventory/hosts.yml docs/incidents/2026-04-13-power-cut/recover_longhorn.yml
|
||||
#
|
||||
# REFERENCE FILES:
|
||||
# - playbooks/system/k3s_config.yml (Longhorn HelmChart template)
|
||||
# - playbooks/backup/k3s_pvc.yml (Backup/restore scripts)
|
||||
# - inventory/hosts.yml (Target hosts)
|
||||
# - /mnt/arcodange/longhorn/replicas/ (Data - MUST NOT be touched)
|
||||
# - /home/pi/arcodange/backups/k3s_pvc/ (Fallback backup location)
|
||||
#
|
||||
#
|
||||
# PLAYBOOK FLOW:
|
||||
#
|
||||
# Phase 1: DIAGNOSIS (idempotent, safe to run anytime)
|
||||
# - Check CSI driver registration status
|
||||
# - Check Longhorn manager health
|
||||
# - Identify which recovery phase is needed
|
||||
#
|
||||
# Phase 2: SOFT RECOVERY (least destructive)
|
||||
# - Touch longhorn-install.yaml manifest
|
||||
# - Wait 60s for k3s HelmChart controller to reconcile
|
||||
# - Verify pod recreation
|
||||
#
|
||||
# Phase 3: HARD RECOVERY (if soft fails)
|
||||
# - Delete driver-deployer pod
|
||||
# - Delete all longhorn-driver-deployer pods
|
||||
# - Wait for HelmChart to recreate
|
||||
#
|
||||
# Phase 4: NUCLEAR RECOVERY (if hard fails)
|
||||
# - Delete HelmChart resource
|
||||
# - Remove manifest file
|
||||
# - Force-delete longhorn-system namespace (after removing finalizers)
|
||||
# - Reinstall Longhorn via manifest
|
||||
#
|
||||
# Phase 5: RESTORE FROM BACKUP (idempotent)
|
||||
# - Apply PV/PVC from backup
|
||||
# - Apply Longhorn CRs from backup
|
||||
# - Data auto-discovered from disk
|
||||
#
|
||||
# DESIGNED TO HANDLE:
|
||||
# - CSI driver registration lost
|
||||
# - Longhorn manager webhook circular dependency
|
||||
# - Partial pod crashes
|
||||
# - Full Longhorn namespace corruption
|
||||
#
|
||||
# LIMITATIONS:
|
||||
# - Requires pi1 (control plane) to be reachable
|
||||
# - Data in /mnt/arcodange/longhorn/ MUST survive
|
||||
# - Docker must be functional on at least 1 node
|
||||
# - Does NOT handle Docker overlay2 corruption
|
||||
#
|
||||
# TESTED SCENARIOS:
|
||||
# - [ ] CSI driver not registered (primary use case)
|
||||
# - [ ] Longhorn manager CrashLoopBackOff
|
||||
# - [ ] Full namespace deletion needed
|
||||
# - [ ] Backup restore validation
|
||||
#
|
||||
# TODO:
|
||||
# - Add Docker storage health check
|
||||
# - Add pre-recovery data verification
|
||||
# - Add post-recovery validation
|
||||
@@ -0,0 +1,153 @@
|
||||
---
|
||||
title: Recovery Approach Analysis — Post-Incident Review
|
||||
incident_id: 2026-04-13-001
|
||||
date: 2026-04-13
|
||||
author: Claude Code (external review)
|
||||
---
|
||||
|
||||
# Recovery Approach Analysis
|
||||
|
||||
## TL;DR
|
||||
|
||||
The incident escalated from a **~5 minute fix** to a **full Longhorn reinstall with backup restore** because the simplest remediation (k3s restart) was never attempted, and a single aggressive command (`kubectl delete pods --all --force`) created a new problem that did not previously exist.
|
||||
|
||||
---
|
||||
|
||||
## What Was Skipped
|
||||
|
||||
### 1. Restart k3s on all nodes (never attempted)
|
||||
|
||||
This should have been the **first or second action** after the manifest touch failed.
|
||||
|
||||
```bash
|
||||
systemctl restart k3s # pi1 — control plane
|
||||
systemctl restart k3s-agent # pi2, pi3 — agent nodes
|
||||
```
|
||||
|
||||
After a power cut, k3s/kubelet state is dirty. Restarting k3s:
|
||||
- Forces kubelet to reinitialize the plugin registry cleanly
|
||||
- Allows Longhorn pods to restart in correct dependency order
|
||||
- Avoids the simultaneous-restart race condition that causes webhook issues
|
||||
- Takes ~2 minutes with no destructive side effects
|
||||
|
||||
This was listed as a last resort in the runbook consulted at incident start. It should have been tried **before any pod deletion**, not after.
|
||||
|
||||
### 2. Stale CSI socket check on each node (never attempted)
|
||||
|
||||
```bash
|
||||
# On each node (pi1, pi2, pi3):
|
||||
ls /var/lib/kubelet/plugins/driver.longhorn.io/
|
||||
# If a stale .sock file exists:
|
||||
rm /var/lib/kubelet/plugins/driver.longhorn.io/csi.sock
|
||||
```
|
||||
|
||||
The incident log confirms the CSI socket was missing/stale, but no one went to the nodes to verify and clean this up. Removing a stale socket + restarting the `longhorn-csi-plugin` daemonset is a targeted, low-risk fix.
|
||||
|
||||
---
|
||||
|
||||
## Where the Direction Went Wrong
|
||||
|
||||
### The pivotal mistake: force deleting all 24 pods simultaneously
|
||||
|
||||
**Command run at 15:32:15:**
|
||||
```bash
|
||||
kubectl delete pods -n longhorn-system --all --force --grace-period=0
|
||||
```
|
||||
|
||||
This command created the **webhook circular dependency problem**, which did not exist before it was run.
|
||||
|
||||
**Why it caused the circular dependency:**
|
||||
|
||||
In normal operation, Longhorn managers start sequentially. One becomes the webhook leader and begins serving on port 9501 before others register as service endpoints.
|
||||
|
||||
When all 24 pods are force-deleted simultaneously:
|
||||
1. All 3 manager pods race-start at the same time
|
||||
2. All 3 IPs are registered as `longhorn-conversion-webhook` service endpoints immediately
|
||||
3. The health check (`https://<pod-ip>:9501/v1/healthz`) is run against all 3
|
||||
4. Only the elected leader actually serves port 9501 — the other 2 fail the probe
|
||||
5. Failing managers crash: `"conversion webhook service is not accessible after 1m0s"`
|
||||
6. `longhorn-driver-deployer` init container waits for healthy managers indefinitely
|
||||
7. CSI socket is never created, CSI driver never registers
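
A quick way to observe the asymmetry by hand (sketch — run it somewhere that can reach pod IPs, e.g. on pi1):

```bash
# Which manager IPs are registered behind the conversion webhook service?
kubectl -n longhorn-system get endpoints longhorn-conversion-webhook \
  -o jsonpath='{.subsets[*].addresses[*].ip}{"\n"}'

# Probe each registered IP — only the elected leader answers on 9501
for ip in $(kubectl -n longhorn-system get endpoints longhorn-conversion-webhook \
              -o jsonpath='{.subsets[*].addresses[*].ip}'); do
  printf '%s: ' "$ip"
  curl -sk -o /dev/null -w '%{http_code}\n' "https://$ip:9501/v1/healthz"
done
```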
|
||||
|
||||
**The original problem was only a lost CSI socket registration.** The webhook circular dependency is a new problem introduced by the recovery attempt.
|
||||
|
||||
---
|
||||
|
||||
## The Escalation Cascade
|
||||
|
||||
Each step created a harder problem than the one it was meant to solve:
|
||||
|
||||
```
|
||||
Power cut
|
||||
→ CSI socket lost (original problem — simple fix)
|
||||
→ Force delete all pods
|
||||
→ Webhook circular dependency (new problem)
|
||||
→ Delete HelmChart + manifest
|
||||
→ 84 finalizers blocking namespace deletion (new problem)
|
||||
→ Full reinstall required
|
||||
→ Backup restore required
|
||||
→ Risk to volume metadata
|
||||
```
|
||||
|
||||
The original problem required touching 1 socket file and restarting k3s. The current state requires:
|
||||
- Manually patching finalizers off 84+ resources
|
||||
- Full Longhorn reinstall
|
||||
- Restoring PV/PVC and Longhorn CRs from backup
|
||||
- Verifying data auto-discovery from replicas
|
||||
|
||||
---
|
||||
|
||||
## Correct Recovery Sequence (Hindsight)
|
||||
|
||||
### Step 1 — k3s restart (should have been tried at ~15:27)
|
||||
```bash
|
||||
ansible -i inventory/hosts.yml all -m shell -a "sudo systemctl restart k3s || sudo systemctl restart k3s-agent"
|
||||
```
|
||||
Wait 3 minutes. In most power-cut scenarios, this alone restores CSI registration.
|
||||
|
||||
### Step 2 — If still broken: targeted daemonset restart (not force-delete-all)
|
||||
```bash
|
||||
kubectl rollout restart daemonset/longhorn-manager -n longhorn-system
|
||||
kubectl rollout status daemonset/longhorn-manager -n longhorn-system
|
||||
```
|
||||
Graceful restart respects the dependency order. Wait for managers to stabilize before touching CSI pods.
|
||||
|
||||
### Step 3 — Check and clean stale sockets on each node
|
||||
```bash
|
||||
# Run on pi1, pi2, pi3:
|
||||
ls /var/lib/kubelet/plugins/driver.longhorn.io/
|
||||
rm -f /var/lib/kubelet/plugins/driver.longhorn.io/csi.sock
|
||||
kubectl rollout restart daemonset/longhorn-csi-plugin -n longhorn-system
|
||||
```
|
||||
|
||||
### Step 4 — Verify CSI driver registered
|
||||
```bash
|
||||
kubectl get csidriver
|
||||
kubectl get csinodes
|
||||
```
|
||||
|
||||
### Step 5 — Only if all above failed: delete driver-deployer pod only
|
||||
```bash
|
||||
kubectl delete pod -n longhorn-system -l app=longhorn-driver-deployer
|
||||
```
|
||||
Not all pods. One targeted pod.
|
||||
|
||||
---
|
||||
|
||||
## What Was Done Well
|
||||
|
||||
- Quick identification of the original root cause (CSI registration)
|
||||
- Confirming volume data integrity early (`robustness="healthy"`)
|
||||
- Securing backups before destructive operations (16:30)
|
||||
- Fixing the backup script bug (useful regardless of incident)
|
||||
- Detailed logging throughout
|
||||
|
||||
---
|
||||
|
||||
## Action Items for Future Incidents
|
||||
|
||||
- [ ] Add k3s restart as **step 2** in the Longhorn recovery runbook (before any pod deletion)
|
||||
- [ ] Add CSI socket cleanup to the runbook as an explicit step on each node
|
||||
- [ ] Add a "minimum destructive action" principle: prefer `rollout restart` over `delete --force --all`
|
||||
- [ ] Implement `recover_longhorn.yml` playbook with the phased approach (soft → targeted → hard) to prevent ad-hoc escalation
|
||||
- [ ] Add a pre-action checklist: "have I tried restarting the service before deleting its resources?"
|
||||
@@ -0,0 +1,107 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Merge Longhorn snapshot + head layers into a single mountable raw image.
|
||||
|
||||
Longhorn stores replica data as sparse raw images in a chain:
|
||||
volume-snap-<id>.img — full state at the time the snapshot was taken
|
||||
volume-head-NNN.img — delta (only changed blocks) since the snapshot
|
||||
|
||||
To reconstruct the full filesystem, head blocks take priority over snapshot
|
||||
blocks. Sparse (all-zero) blocks in the head fall through to the snapshot.
|
||||
|
||||
Usage:
|
||||
sudo python3 merge-longhorn-layers.py <replica-dir> <output.img>
|
||||
|
||||
Example:
|
||||
sudo python3 merge-longhorn-layers.py \\
|
||||
/mnt/arcodange/longhorn/replicas/pvc-cdd434d1-...-998f49ff \\
|
||||
/tmp/merged.img
|
||||
|
||||
# Then mount and inspect:
|
||||
sudo mount -o loop /tmp/merged.img /mnt/recovery
|
||||
ls /mnt/recovery/
|
||||
|
||||
Proven useful during incident 2026-04-13 to recover the url-shortener SQLite
|
||||
database from a Longhorn replica that was never touched by the nuclear cleanup
|
||||
(pi3, dir suffix -998f49ff, Apr 6 snapshot).
|
||||
|
||||
Key lesson: always identify the untouched replica dir (oldest timestamps,
|
||||
never renamed) before attempting directory swaps. Back it up first.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
BLOCK = 4096
|
||||
|
||||
|
||||
def find_layers(replica_dir: str) -> tuple[str | None, str | None]:
|
||||
"""
|
||||
Read volume.meta to find head filename and snapshot parent.
|
||||
Returns (snapshot_path, head_path). snapshot_path is None for base volumes.
|
||||
"""
|
||||
meta_path = os.path.join(replica_dir, "volume.meta")
|
||||
with open(meta_path) as f:
|
||||
meta = json.load(f)
|
||||
|
||||
head_name = meta["Head"]
|
||||
parent_name = meta.get("Parent", "")
|
||||
|
||||
head_path = os.path.join(replica_dir, head_name)
|
||||
snap_path = os.path.join(replica_dir, parent_name) if parent_name else None
|
||||
|
||||
return snap_path, head_path
|
||||
|
||||
|
||||
def merge(snap_path: str | None, head_path: str, out_path: str) -> None:
|
||||
size = os.path.getsize(head_path)
|
||||
print(f"Volume size: {size // (1024 * 1024)} MiB")
|
||||
print(f"Snapshot: {snap_path or '(none — base volume)'}")
|
||||
print(f"Head: {head_path}")
|
||||
print(f"Output: {out_path}")
|
||||
|
||||
snap_f = open(snap_path, "rb") if snap_path else None
|
||||
head_f = open(head_path, "rb")
|
||||
|
||||
with open(out_path, "wb") as out:
|
||||
out.truncate(size)
|
||||
blocks = size // BLOCK
|
||||
for i, offset in enumerate(range(0, size, BLOCK)):
|
||||
head_f.seek(offset)
|
||||
hb = head_f.read(BLOCK)
|
||||
|
||||
if hb and any(hb):
|
||||
out.seek(offset)
|
||||
out.write(hb)
|
||||
elif snap_f:
|
||||
snap_f.seek(offset)
|
||||
sb = snap_f.read(BLOCK)
|
||||
if sb and any(sb):
|
||||
out.seek(offset)
|
||||
out.write(sb)
|
||||
|
||||
if i % 4096 == 0:
|
||||
pct = (i / blocks) * 100
|
||||
print(f"\r {pct:.0f}%", end="", flush=True)
|
||||
|
||||
print("\r 100% — done.")
|
||||
if snap_f:
|
||||
snap_f.close()
|
||||
head_f.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 3:
|
||||
print(__doc__)
|
||||
sys.exit(1)
|
||||
|
||||
replica_dir = sys.argv[1]
|
||||
out_path = sys.argv[2]
|
||||
|
||||
if not os.path.isdir(replica_dir):
|
||||
print(f"Error: {replica_dir} is not a directory", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
snap, head = find_layers(replica_dir)
|
||||
merge(snap, head, out_path)
|
||||
312
ansible/arcodange/factory/docs/incidents/README.md
Normal file
@@ -0,0 +1,312 @@
|
||||
# Incident Documentation
|
||||
|
||||
This directory contains incident reports, postmortems, and recovery logs for the Arcodange Factory infrastructure.
|
||||
|
||||
## Purpose
|
||||
|
||||
Document all infrastructure incidents to:
|
||||
- Track root causes and resolutions
|
||||
- Maintain a knowledge base for future troubleshooting
|
||||
- Improve system reliability through lessons learned
|
||||
- Provide clear guidance for on-call responders
|
||||
|
||||
## Structure
|
||||
|
||||
Each incident is documented in its own directory under `docs/incidents/` with the following naming convention:
|
||||
|
||||
```
|
||||
docs/incidents/
|
||||
├── YYYY-MM-DD-incident-name/
|
||||
│ ├── README.md # Incident summary and timeline
|
||||
│ ├── status.md # Real-time status updates (optional)
|
||||
│ ├── log.md # Detailed recovery actions and logs
|
||||
│ ├── root-cause.md # Technical analysis (optional)
|
||||
│ └── diagrams/ # Architecture/flow diagrams (optional)
|
||||
│ └── *.mmd # Mermaid diagrams
|
||||
└── ...
|
||||
```
|
||||
|
||||
## Incident Directory Contents
|
||||
|
||||
### 1. `README.md` (Required)
|
||||
The primary incident document. Must include:
|
||||
|
||||
- **Incident ID**: Unique identifier (e.g., `2026-04-13-001`)
|
||||
- **Title**: Clear, descriptive title
|
||||
- **Date/Time**: Start and end timestamps
|
||||
- **Status**: Open / Investigating / Resolved / Monitoring
|
||||
- **Severity**: SEV-1 (Critical) / SEV-2 (High) / SEV-3 (Medium) / SEV-4 (Low)
|
||||
- **Impact**: Brief description of affected services
|
||||
- **Summary**: What happened
|
||||
- **Timeline**: Key events with timestamps
|
||||
- **Root Cause**: Technical analysis
|
||||
- **Resolution**: Steps taken to resolve
|
||||
- **Action Items**: Follow-up tasks
|
||||
- **Lessons Learned**: Key takeaways
|
||||
|
||||
**Front matter template:**
|
||||
```markdown
|
||||
---
|
||||
title: Incident Title
|
||||
incident_id: YYYY-MM-DD-NNN
|
||||
date: YYYY-MM-DD
|
||||
time_start: HH:MM:SS UTC
|
||||
time_end: HH:MM:SS UTC
|
||||
status: Resolved
|
||||
severity: SEV-2
|
||||
tags:
|
||||
- kubernetes
|
||||
- longhorn
|
||||
- storage
|
||||
---
|
||||
```
|
||||
|
||||
### 2. `log.md` (Recommended)
|
||||
Detailed technical log of all recovery actions. Must include:
|
||||
|
||||
- Commands executed with timestamps
|
||||
- Command output (relevant portions)
|
||||
- Decision rationale for each action
|
||||
- Outcome of each action
|
||||
- Next steps identified
|
||||
|
||||
Format:
|
||||
```markdown
|
||||
## [Time] Action Description
|
||||
|
||||
**Command:** `actual command run`
|
||||
|
||||
**Output:**
|
||||
```
|
||||
relevant output
|
||||
```
|
||||
|
||||
**Decision:** Why this action was taken
|
||||
|
||||
**Outcome:** What happened
|
||||
|
||||
**Next:** What to do next
|
||||
```
|
||||
|
||||
### 3. Mermaid Diagrams
|
||||
|
||||
Include at least one Mermaid diagram in each incident to visualize:
|
||||
- Architecture/flow before incident
|
||||
- Failure propagation
|
||||
- Recovery process
|
||||
- New architecture after fixes
|
||||
|
||||
**Example theme usage:**
|
||||
```mermaid
|
||||
%%{init: { 'theme': 'forest', 'themeVariables': { 'primaryColor': '#ffdfd3', 'edgeLabelBackground':'#fff' }}}%%
|
||||
```
|
||||
|
||||
Available themes: `default`, `base`, `forest`, `dark`, `neutral`
|
||||
|
||||
**Recommended diagrams:**
|
||||
- `incident-flow.mmd`: Timeline/flow of the incident
|
||||
- `architecture.mmd`: Affected components architecture
|
||||
- `recovery-flow.mmd`: Recovery steps visualization
|
||||
- `dependency-tree.mmd`: Component dependencies showing failure path
|
||||
|
||||
## Incident Severity Definitions
|
||||
|
||||
| Severity | Description | Response Time | Impact |
|
||||
|----------|-------------|---------------|--------|
|
||||
| SEV-1 | Critical system-wide outage | Immediate (24/7) | Multiple services down, potential data loss |
|
||||
| SEV-2 | Major service degradation | < 1 hour | Single critical service down |
|
||||
| SEV-3 | Partial service degradation | < 4 hours | Non-critical service affected |
|
||||
| SEV-4 | Minor issue | Next business day | Cosmetic or non-impacting |
|
||||
|
||||
## Available Ansible Playbooks for Recovery
|
||||
|
||||
This collection provides comprehensive infrastructure management via Ansible.
|
||||
Always use `-i inventory/hosts.yml` when running playbooks.
|
||||
|
||||
### Master Playbooks (Run in order for full recovery)
|
||||
|
||||
| Playbook | Purpose | Targets |
|
||||
|----------|---------|---------|
|
||||
| `playbooks/01_system.yml` | System setup (hostnames, iSCSI, Docker, Longhorn, DNS) | raspberries |
|
||||
| `playbooks/02_setup.yml` | Infrastructure setup (NFS backup, PostgreSQL, Gitea) | localhost, postgres, gitea |
|
||||
| `playbooks/03_cicd.yml` | CI/CD pipeline (Gitea tokens, Docker Compose, ArgoCD) | localhost, gitea |
|
||||
| `playbooks/04_tools.yml` | Tool deployment (Hashicorp Vault, Crowdsec) | tools group |
|
||||
| `playbooks/05_backup.yml` | Backup configuration | localhost |
|
||||
|
||||
### Component-Specific Playbooks
|
||||
|
||||
#### System
|
||||
| Playbook | Purpose | Notes |
|
||||
|----------|---------|-------|
|
||||
| `playbooks/system/rpi.yml` | Raspberry Pi hostname setup | |
|
||||
| `playbooks/system/dns.yml` | DNS/pi-hole configuration | |
|
||||
| `playbooks/system/ssl.yml` | SSL certificate setup with step-ca | |
|
||||
| `playbooks/system/prepare_disks.yml` | Disk partitioning and formatting | |
|
||||
| `playbooks/system/system_docker.yml` | Docker installation with custom storage | Storage at `/mnt/arcodange/docker` |
|
||||
| `playbooks/system/k3s_config.yml` | K3s configuration (Traefik, Longhorn HelmCharts) | **Key for k3s** |
|
||||
| `playbooks/system/system_k3s.yml` | K3s cluster deployment | Uses k3s-ansible collection |
|
||||
| `playbooks/system/iscsi_longhorn.yml` | iSCSI client for Longhorn | Prerequisite for Longhorn |
|
||||
| `playbooks/system/k3s_dns.yml` | K3s DNS configuration | |
|
||||
| `playbooks/system/k3s_ssl.yml` | K3s SSL/traefik certificates | |
|
||||
|
||||
#### Storage
|
||||
| Playbook | Purpose | Notes |
|
||||
|----------|---------|-------|
|
||||
| `playbooks/setup/backup_nfs.yml` | Longhorn RWX NFS backup volume | Creates 50Gi PVC + recurring backups |
|
||||
| `playbooks/backup/k3s_pvc.yml` | PVC backup scripts | Creates `/opt/k3s_volumes/backup.sh` and `restore.sh` |
|
||||
|
||||
#### Backup
|
||||
| Playbook | Purpose | Notes |
|
||||
|----------|---------|-------|
|
||||
| `playbooks/backup/backup.yml` | Main backup orchestration | Calls postgres, gitea, k3s_pvc |
|
||||
| `playbooks/backup/postgres.yml` | PostgreSQL database backup | Docker exec pg_dumpall |
|
||||
| `playbooks/backup/gitea.yml` | Gitea backup | Uses gitea dump command |
|
||||
| `playbooks/backup/cron_report.yml` | Mail utility for cron reports | |
|
||||
| `playbooks/backup/cron_report_mailutility.yml` | MTA configuration | |
|
||||
|
||||
### Inventory File
|
||||
|
||||
**File:** `inventory/hosts.yml`
|
||||
|
||||
**Groups:**
|
||||
- `raspberries`: pi1, pi2, pi3 (Raspberry Pi nodes)
|
||||
- `local`: localhost, pi1, pi2, pi3
|
||||
- `postgres`: pi2 (PostgreSQL host)
|
||||
- `gitea`: pi2 (Gitea host, inherits postgres)
|
||||
- `pihole`: pi1, pi3 (DNS hosts)
|
||||
- `step_ca`: pi1, pi2, pi3 (Certificate authority)
|
||||
- `all`: All above groups
|
||||
|
||||
**Important:** All playbooks MUST be run with `-i inventory/hosts.yml` flag:
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/01_system.yml
|
||||
```
|
||||
|
||||
### Handy Commands for Incident Response
|
||||
|
||||
```bash
|
||||
# Check all pods
|
||||
kubectl get pods -A
|
||||
|
||||
# Check Longhorn specifically
|
||||
kubectl get pods -n longhorn-system
|
||||
kubectl get volumes -n longhorn-system
|
||||
kubectl get replicas -n longhorn-system
|
||||
|
||||
# Check storage
|
||||
kubectl get pv -A
|
||||
kubectl get pvc -A
|
||||
kubectl get csidriver
|
||||
|
||||
# Check nodes
|
||||
kubectl get nodes -o wide
|
||||
kubectl describe node <nodename>
|
||||
|
||||
# Force Longhorn HelmChart reconcile (k3s-specific)
|
||||
sudo touch /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml
|
||||
|
||||
# Restart Longhorn
|
||||
kubectl delete pods -n longhorn-system --all --force --grace-period=0
|
||||
|
||||
# Check Longhorn data on disk
|
||||
ls /mnt/arcodange/longhorn/replicas/
|
||||
|
||||
# Check Docker storage
|
||||
ls /mnt/arcodange/docker/overlay2/ | head
|
||||
|
||||
# Run ansible playbook (dry-run first)
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/01_system.yml --check --diff
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/01_system.yml --limit pi1
|
||||
```
|
||||
|
||||
### K3s-Specific Recovery Notes
|
||||
|
||||
Longhorn is installed via **HelmChart manifest** (k3s native):
|
||||
- File: `/var/lib/rancher/k3s/server/manifests/longhorn-install.yaml`
|
||||
- To trigger reconcile: `touch` the file (k3s watches for changes)
|
||||
- DO NOT use `helm install` directly - it may conflict with k3s HelmChart controller
|
||||
|
||||
Traefik is also installed via HelmChart manifest:
|
||||
- File: `/var/lib/rancher/k3s/server/manifests/traefik-v3.yaml`
|
||||
|
||||
## Incident Templates
|
||||
|
||||
### Quick Start Template
|
||||
|
||||
```markdown
|
||||
---
|
||||
title: [Short Description]
|
||||
incident_id: YYYY-MM-DD-NNN
|
||||
date: $(date +%Y-%m-%d)
|
||||
time_start: $(date +%H:%M:%S)
|
||||
status: Investigating
|
||||
severity: SEV-2
|
||||
tags:
|
||||
- tag1
|
||||
- tag2
|
||||
---
|
||||
|
||||
## Summary
|
||||
|
||||
[1-2 sentences describing the issue]
|
||||
|
||||
## Impact
|
||||
|
||||
[What services/users are affected]
|
||||
|
||||
## Timeline
|
||||
|
||||
| Time | Event | Owner |
|
||||
|------|-------|-------|
|
||||
| HH:MM | Initial detection | @user |
| HH:MM | Investigation started | @user |
| HH:MM | Root cause identified | @user |
| HH:MM | Resolution applied | @user |
| HH:MM | Service restored | @user |
|
||||
|
||||
## Root Cause
|
||||
|
||||
[Technical analysis]
|
||||
|
||||
## Resolution
|
||||
|
||||
[Step-by-step what was done]
|
||||
|
||||
## Mermaid Diagram
|
||||
|
||||
```mermaid
%%{init: { 'theme': 'forest' }}%%
|
||||
graph TD
|
||||
A[Component A] -->|depends on| B[Component B]
|
||||
B -->|failed due to| C[Component C]
|
||||
C -->|power cut| D[Root Cause]
|
||||
```
|
||||
|
||||
*remember to always do this for labels:*
|
||||
- have a space before a filepath
|
||||
- no parenthesis '()'
|
||||
- use <br> instead of \n for new lines
|
||||
|
||||
## Action Items
|
||||
|
||||
- [ ] Task 1
|
||||
- [ ] Task 2
|
||||
|
||||
## Lessons Learned
|
||||
|
||||
- Lesson 1
|
||||
- Lesson 2
|
||||
```
|
||||
|
||||
## Contributing to Incident Documentation
|
||||
|
||||
1. **During Incident**: Focus on resolution, log commands and outputs in `log.md`
|
||||
2. **After Resolution**: Create/complete the `README.md` with full incident details
|
||||
3. **Add Diagrams**: Include at least one Mermaid diagram to visualize the issue
|
||||
4. **Peer Review**: Have another team member review before closing
|
||||
5. **Update Templates**: Improve templates based on what was missing
|
||||
|
||||
## Directory Index
|
||||
|
||||
| Incident | Date | Severity | Status |
|
||||
|----------|------|----------|--------|
|
||||
| [2026-04-13-power-cut](./2026-04-13-power-cut/README.md) | 2026-04-13 | SEV-1 | In Progress |
|
||||
@@ -0,0 +1,244 @@
|
||||
# Cluster Recovery Agent Instructions
|
||||
|
||||
You are recovering the Arcodange homelab k3s cluster after an outage (power cut, node failure, or
|
||||
Longhorn reinstall). Your job is to assess damage, run the appropriate Ansible playbooks and
|
||||
kubectl commands, and bring the cluster back to a fully healthy state.
|
||||
|
||||
You do NOT need to modify any code. All recovery tooling already exists.
|
||||
|
||||
---
|
||||
|
||||
## Cluster Overview
|
||||
|
||||
| Component | Details |
|
||||
|-----------|---------|
|
||||
| Nodes | pi1, pi2, pi3 (Raspberry Pi, SSH via `pi<N>.home`) |
|
||||
| k8s distribution | k3s |
|
||||
| Storage | Longhorn (`/mnt/arcodange/longhorn/`) |
|
||||
| GitOps | ArgoCD (apps auto-sync from `gitea.arcodange.lab/arcodange-org/`) |
|
||||
| Secrets | HashiCorp Vault (`tools` namespace, manual unseal) |
|
||||
| Ingress | Traefik + CrowdSec bouncer |
|
||||
| Working dir | `/Users/gabrielradureau/Work/Arcodange/factory/ansible/arcodange/factory/` |
|
||||
| Inventory | `inventory/hosts.yml` |
|
||||
|
||||
**Critical dependency:** ERP (Dolibarr) uses Vault-rotated DB credentials written to its PVC.
|
||||
**Always recover and unseal Vault before scaling ERP up.**
|
||||
|
||||
---
|
||||
|
||||
## Step 0 — Assess Damage
|
||||
|
||||
Run these first to understand what is broken:
|
||||
|
||||
```bash
|
||||
# Overall pod health
|
||||
kubectl get pods -A | grep -v Running | grep -v Completed
|
||||
|
||||
# PVC health (anything not Bound is a problem)
|
||||
kubectl get pvc -A | grep -v Bound
|
||||
|
||||
# Longhorn volume states
|
||||
kubectl get volumes.longhorn.io -n longhorn-system
|
||||
|
||||
# Longhorn manager health (prerequisite for all recovery)
|
||||
kubectl get pods -n longhorn-system -l app=longhorn-manager
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 1 — Longhorn Volume Recovery
|
||||
|
||||
### Path A — Fast path (backup file exists, Volume CRDs were backed up)
|
||||
|
||||
Check if a recent backup exists on pi1:
|
||||
```bash
|
||||
ssh pi1.home "ls -lt /mnt/backups/k3s_pvc/backup_*.volumes | head -5"
|
||||
```
|
||||
|
||||
If a backup file exists and is recent (from before the incident):
|
||||
```bash
|
||||
ssh pi1.home "kubectl apply -f /mnt/backups/k3s_pvc/backup_<YYYYMMDD>.volumes"
|
||||
```
|
||||
|
||||
Then verify PVCs bound and skip to Step 2.
|
||||
|
||||
### Path B — Block-device injection (no usable backup, raw replica files intact)
|
||||
|
||||
Use this when PVCs are `Lost`/`Terminating` and no Volume CRD backup is available.
|
||||
|
||||
**Check which volumes need recovery:**
|
||||
```bash
|
||||
# Volumes with no PVC or Lost/Terminating PVC
|
||||
kubectl get pvc -A | grep -v Bound
|
||||
```
|
||||
|
||||
**For each failed volume, create a vars file** following the pattern in:
|
||||
`playbooks/recover/longhorn_data_vars.example.yml`
|
||||
|
||||
Existing vars files from the 2026-04-13 incident (reusable as references):
|
||||
- `playbooks/recover/longhorn_data_vars_remaining.yml` — prometheus, alertmanager, redis, backups-rwx
|
||||
- `playbooks/recover/longhorn_data_vars_erp_vault.yml` — erp, hashicorp-vault (audit + data)
|
||||
- `playbooks/recover/longhorn_data_vars_clickhouse.yml` — clickhouse
|
||||
|
||||
**Key rules for the vars file:**
|
||||
- `source_node`/`source_dir` can be omitted — Phase 0 auto-discovers the largest non-Rebuilding replica
|
||||
- Set `workload_name: ""` for ERP — it must not scale up until Vault is unsealed
|
||||
- For StatefulSets with multiple PVCs (e.g. Vault), set `workload_name: ""` on all but the last entry
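
The authoritative schema is `playbooks/recover/longhorn_data_vars.example.yml`; the sketch below only illustrates the documented knobs (`source_node`/`source_dir`, `workload_name`, `verify_cmd`) — every other field name in it is hypothetical, so copy the real structure from the example file:

```bash
# Sketch only — key names other than source_node/source_dir, workload_name and
# verify_cmd are hypothetical placeholders.
cat > playbooks/recover/longhorn_data_vars_myapp.yml <<'EOF'
volumes:                                                 # hypothetical key name
  - pv_name: pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9    # hypothetical key name
    namespace: tools                                     # hypothetical key name
    # source_node / source_dir omitted → Phase 0 auto-discovers the best replica
    workload_name: prometheus-server        # set "" to skip the scale-up phase
    verify_cmd: "kubectl -n tools get pvc prometheus-server"
EOF
```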
|
||||
|
||||
**Run the recovery playbook:**
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \
|
||||
-e @playbooks/recover/longhorn_data_vars_<NAME>.yml
|
||||
```
|
||||
|
||||
The playbook is **idempotent** — safe to re-run if it fails midway.
|
||||
|
||||
**Playbook phases (for context when troubleshooting):**
|
||||
| Phase | What it does |
|
||||
|-------|-------------|
|
||||
| 0 | Auto-discovers best replica dir (skips `Rebuilding: true`) |
|
||||
| 1 | Backs up untouched replica dir to `/home/pi/arcodange/backups/longhorn-recovery/` |
|
||||
| 2 | Merges snapshot+head layers into a single `.img` via `merge-longhorn-layers.py` |
|
||||
| 3 | **Scales down workloads first**, then clears stuck Terminating PVCs, creates Volume CRD |
|
||||
| 4 | Scale down (second pass, idempotent) |
|
||||
| 5 | Attaches volume via maintenance ticket to source node |
|
||||
| 6 | `mkfs.ext4` (if unformatted) + `rsync` from merged image into live block device |
|
||||
| 7 | Removes maintenance ticket (volume detaches) |
|
||||
| 8 | Creates PV (Retain, no claimRef) + PVC pinned to PV |
|
||||
| 9 | Scales up workloads, waits for readyReplicas ≥ 1 (failures here are `ignore_errors: yes`) |
|
||||
|
||||
**Common Phase 8 failure — StatefulSet re-creates PVCs before they can be pinned:**
|
||||
The playbook handles this automatically (scales down before finalizer removal). If you still hit it:
|
||||
```bash
|
||||
kubectl scale statefulset <name> -n <namespace> --replicas=0
|
||||
kubectl patch pvc <pvc-name> -n <namespace> --type=merge -p '{"metadata":{"finalizers":null}}'
|
||||
kubectl delete pvc <pvc-name> -n <namespace>
|
||||
# Then re-run the playbook
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 2 — Unseal HashiCorp Vault
|
||||
|
||||
After Vault's PVCs are recovered, the pod boots **sealed**. Check:
|
||||
```bash
|
||||
kubectl get pod hashicorp-vault-0 -n tools
|
||||
kubectl exec hashicorp-vault-0 -n tools -- vault status 2>/dev/null | grep Sealed
|
||||
```
|
||||
|
||||
If sealed, run the unseal playbook (requires interactive terminal for the Gitea password prompt):
|
||||
```bash
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/tools/hashicorp_vault.yml
|
||||
```
|
||||
|
||||
Unseal keys are at `~/.arcodange/cluster-keys.json` on the local machine. The playbook reads them automatically.
|
||||
|
||||
After the playbook completes, verify:
|
||||
```bash
|
||||
kubectl get pod hashicorp-vault-0 -n tools # must be 1/1 Ready
|
||||
kubectl exec hashicorp-vault-0 -n tools -- vault status | grep Sealed # must be false
|
||||
```
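
If the playbook cannot be used (e.g. no interactive terminal for the Gitea prompt), a manual fallback is possible — assuming `cluster-keys.json` follows the standard `vault operator init -format=json` layout with an `unseal_keys_b64` array:

```bash
# Requires jq locally; the unseal threshold is typically 3 of 5 keys
for key in $(jq -r '.unseal_keys_b64[]' ~/.arcodange/cluster-keys.json | head -3); do
  kubectl exec hashicorp-vault-0 -n tools -- vault operator unseal "$key"
done
kubectl exec hashicorp-vault-0 -n tools -- vault status | grep Sealed   # expect: false
```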
|
||||
|
||||
---
|
||||
|
||||
## Step 3 — Scale Up ERP
|
||||
|
||||
Only after Vault is unsealed and Ready:
|
||||
```bash
|
||||
kubectl scale deployment erp -n erp --replicas=1
|
||||
kubectl rollout status deployment/erp -n erp
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Step 4 — Reconfigure Tools (CrowdSec, etc.)
|
||||
|
||||
Run if CrowdSec bouncer or Traefik middleware needs reconfiguring:
|
||||
```bash
|
||||
# Standard run (bouncer key + Traefik middleware + restart)
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/tools/crowdsec.yml
|
||||
|
||||
# Include captcha HTML injection (use when captcha page is broken)
|
||||
ansible-playbook -i inventory/hosts.yml playbooks/tools/crowdsec.yml --tags never,all
|
||||
```
|
||||
|
||||
If crowdsec-agent or crowdsec-appsec pods are stuck in `Error` after a long outage,
|
||||
the playbook handles restarting them automatically.
|
||||
|
||||
---
|
||||
|
||||
## Step 5 — Re-enable ArgoCD selfHeal
|
||||
|
||||
Check if `selfHeal` was disabled during recovery (look for `selfHeal: false` in the tools app):
|
||||
```bash
|
||||
grep -A5 "tools:" /Users/gabrielradureau/Work/Arcodange/factory/argocd/values.yaml
|
||||
```
|
||||
|
||||
If disabled, re-enable it by editing `argocd/values.yaml` and setting `selfHeal: true`,
|
||||
then syncing the ArgoCD app:
|
||||
```bash
|
||||
kubectl get app tools -n argocd
|
||||
```

---

## Step 6 — Final Verification

```bash
# All pods running
kubectl get pods -A | grep -v Running | grep -v Completed | grep -v "^NAME"

# All PVCs bound
kubectl get pvc -A | grep -v Bound

# All Longhorn volumes healthy
kubectl get volumes.longhorn.io -n longhorn-system

# Run a fresh backup to capture the recovered state
ansible-playbook -i inventory/hosts.yml playbooks/backup/backup.yml \
  -e backup_root_dir=/mnt/backups
```

---

## Key Files Reference

| File | Purpose |
|------|---------|
| `playbooks/recover/longhorn_data.yml` | Main block-device recovery playbook |
| `playbooks/recover/longhorn.yml` | Recovery when Volume CRDs still exist |
| `playbooks/recover/longhorn_data_vars.example.yml` | Template for recovery vars |
| `playbooks/recover/longhorn_data_vars_erp_vault.yml` | Vars for erp + vault (2026-04-13 incident) |
| `playbooks/recover/longhorn_data_vars_remaining.yml` | Vars for other volumes (2026-04-13 incident) |
| `playbooks/backup/backup.yml` | Full backup (postgres + gitea + k3s PVCs + Longhorn CRDs) |
| `playbooks/backup/k3s_pvc.yml` | PV/PVC/Longhorn Volume CRD backup |
| `playbooks/tools/hashicorp_vault.yml` | Vault unseal + OIDC reconfiguration |
| `playbooks/tools/crowdsec.yml` | CrowdSec bouncer + Traefik middleware setup |
| `docs/adr/20260414-longhorn-pvc-recovery.md` | Full incident ADR with all recovery methods |
| `~/.arcodange/cluster-keys.json` | Vault unseal keys (local machine only) |

---

## Decision Tree

```
Cluster down after outage
 │
 ├─ kubectl works? ──No──▶ Check k3s: `systemctl status k3s` on pi1/pi2/pi3
 │
 └─ Yes
     │
     ├─ PVCs all Bound? ──Yes──▶ Skip to Step 2 (check Vault)
     │
     └─ No
         │
         ├─ Recent .volumes backup on pi1? ──Yes──▶ Path A (kubectl apply backup)
         │
         └─ No
             │
             ├─ Longhorn Volume CRDs exist? ──Yes──▶ playbooks/recover/longhorn.yml
             │
             └─ No ──▶ Path B (longhorn_data.yml block-device injection)
                        Check replica dirs exist first:
                        ssh pi{1,2,3}.home "sudo du -sh /mnt/arcodange/longhorn/replicas/pvc-*"
```
@@ -0,0 +1,360 @@
# Runbook: Longhorn Block-Device Data Recovery

**When to use:** Longhorn has been fully reinstalled (nuclear cleanup). Volume CRDs are gone.
Application PVCs are stuck `Terminating` or `Lost`. The raw replica `.img` files still exist
on disk across the nodes. kubectl/k8s objects cannot help — we must work directly with the
Longhorn replica directories and block devices.

**Automated version:** `playbooks/recover/longhorn_data.yml`

---

## Mental Model

Longhorn stores each replica as a chain of sparse raw image files inside a directory named
`<pv-name>-<random-hex>` under `<longhorn_data_path>/replicas/`. Each directory contains:

```
volume.meta                 — engine state (Head filename, Parent snapshot, Dirty flag)
volume-head-NNN.img         — active write log (sparse, only changed blocks)
volume-head-NNN.img.meta    — head metadata
volume-snap-<uuid>.img      — snapshot at a point in time (sparse, full state)
volume-snap-<uuid>.img.meta — snapshot metadata
revision.counter            — monotonically increasing write counter
```

After a nuclear cleanup + reinstall, Longhorn creates **new empty replica directories** with
new random hex suffixes. The old directories (with data) are left on disk but orphaned.

**Why directory-swap fails:** the old `volume.meta` has a different engine generation and
`Dirty: true`. Longhorn detects the inconsistency across replicas and rebuilds from the
"cleanest" source (the new empty pi1 replica), overwriting the old data.

**What works:** extract the filesystem from the untouched replica directory directly, then
inject the data files into the live Longhorn block device while the volume is temporarily
attached in maintenance mode.

---

## Decision Tree

```
Are Volume CRDs present in Longhorn?
├── YES → normal PV/PVC restore is enough, use playbooks/recover/longhorn.yml
└── NO
    └── Are replica directories present on disk?
        ├── NO → data is lost, provision fresh volumes
        └── YES
            └── Is there an untouched replica dir (timestamps from before the incident)?
                ├── NO → data likely unrecoverable (all dirs were zeroed during reconciliation)
                └── YES → follow this runbook
```

---

## Step 0 — Pre-flight: Inventory Surviving Replica Directories

On each node, list replica dirs and their sizes. Dirs with actual data are large (>16K).
New empty dirs created by Longhorn are always exactly 16K.

```bash
for node in pi1 pi2 pi3; do
  echo "=== $node ==="
  ssh $node "sudo du -sh /mnt/arcodange/longhorn/replicas/pvc-<VOLUME>-* 2>/dev/null"
done
```

**Key rule:** identify the replica dir that was **never touched** by the reinstall — it has
old timestamps (from before the incident) and its size matches the original volume usage.
This is your recovery source. **Back it up before touching anything.**

```bash
# On the node that has the untouched dir:
sudo mkdir -p /home/pi/arcodange/backups/longhorn-recovery/<pvc-name>/
sudo cp -a /mnt/arcodange/longhorn/replicas/<pv-name>-<old-hex>/ \
  /home/pi/arcodange/backups/longhorn-recovery/<pvc-name>/
```

---

## Step 1 — Reconstruct the Filesystem

The replica directory contains a snapshot chain. Each layer is a sparse raw image — unchanged
blocks appear as zeroed sparse regions, only written blocks contain data. To reconstruct the
full filesystem, layers must be merged: head takes priority, then snapshot.
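
Conceptually the merge starts from the snapshot image and lays the written blocks of the head on top. The following is only a rough sketch of that idea using coreutils; it treats every all-zero 4 KiB block of the head as "not written", which is an approximation, so use the repo's tool below for the real recovery:

```bash
# Rough sketch only. <pvc-name>, <pv-name>-<old-hex>, <uuid>, NNN are placeholders as elsewhere.
cd /home/pi/arcodange/backups/longhorn-recovery/<pvc-name>/<pv-name>-<old-hex>/

# 1. Base layer: copy the snapshot, preserving sparseness
sudo cp --sparse=always volume-snap-<uuid>.img /tmp/<pvc-name>-merged.img

# 2. Overlay the head: write its non-zero 4K blocks, seek over all-zero blocks
#    (conv=sparse seeks instead of writing zero blocks, so snapshot data underneath survives)
sudo dd if=volume-head-NNN.img of=/tmp/<pvc-name>-merged.img \
  bs=4K conv=notrunc,sparse status=progress
```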

Use `docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py`:

```bash
# On the node holding the backup:
sudo python3 merge-longhorn-layers.py \
  /home/pi/arcodange/backups/longhorn-recovery/<pvc-name>/<pv-name>-<old-hex>/ \
  /tmp/<pvc-name>-merged.img

# Verify the filesystem mounts
sudo mkdir -p /mnt/recovery-<pvc-name>
sudo mount -o loop /tmp/<pvc-name>-merged.img /mnt/recovery-<pvc-name>
sudo ls -lah /mnt/recovery-<pvc-name>/
sudo umount /mnt/recovery-<pvc-name>
```

If mount fails with "wrong fs type" or "bad superblock":
- The snapshot `.img` is all-zero (was overwritten by a prior Longhorn reconciliation)
- Try the next oldest replica dir from another node
- Check with `sudo od -A x -t x1z -v snap.img | grep -v ' 00 00...' | head -5`
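
If the image was taken across an unclean shutdown, the ext4 journal may also be dirty; per the pitfalls table below, a read-only mount that skips journal replay can still let you inspect the data:

```bash
# Read-only mount without journal replay (ext4), useful on power-cut images
sudo mount -o loop,ro,noload /tmp/<pvc-name>-merged.img /mnt/recovery-<pvc-name>
```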

---

## Step 2 — Create the Longhorn Volume CRD

Longhorn needs to know about the volume before its block device can be used.

```bash
kubectl apply -f - <<EOF
apiVersion: longhorn.io/v1beta2
kind: Volume
metadata:
  name: <pv-name>
  namespace: longhorn-system
spec:
  accessMode: rwo          # or rwx
  dataEngine: v1
  frontend: blockdev
  numberOfReplicas: 3
  size: "<size-in-bytes>"  # e.g. "134217728" for 128Mi
EOF
```

Wait for replicas to appear:
```bash
kubectl get replicas.longhorn.io -n longhorn-system | grep <pv-name>
# Expect 3 replicas in "stopped" state
```

---

## Step 3 — Attach the Volume in Maintenance Mode

Longhorn only creates the block device (`/dev/longhorn/<pv-name>`) when the volume is
attached to a node. Use a `VolumeAttachment` ticket to attach without a pod.

Choose `<target-node>` = the same node where the backup/merged image is stored (avoids
copying large files across the network).

```bash
kubectl apply -f - <<EOF
apiVersion: longhorn.io/v1beta2
kind: VolumeAttachment
metadata:
  name: <pv-name>
  namespace: longhorn-system
spec:
  attachmentTickets:
    recovery:
      generation: 0
      id: recovery
      nodeID: <target-node>
      parameters:
        disableFrontend: "false"
      type: longhorn-api
  volume: <pv-name>
EOF

kubectl wait --for=jsonpath='{.status.state}'=attached \
  volumes.longhorn.io/<pv-name> -n longhorn-system --timeout=120s
```
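
Once the volume reports `attached`, a quick sanity check that the block device actually exists on the chosen node (same check as in the key-commands reference at the end of this runbook):

```bash
ssh <target-node> "ls -l /dev/longhorn/<pv-name>"
```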

---

## Step 4 — Scale Down the Workload

Always stop the workload before touching the data to prevent concurrent writes and filesystem
corruption.

```bash
# For a Deployment:
kubectl scale deployment <name> -n <namespace> --replicas=0

# For a StatefulSet:
kubectl scale statefulset <name> -n <namespace> --replicas=0
```

---

## Step 5 — Inject Data Files via Block Device

```bash
ssh <target-node> bash <<'SHELL'
# Mount the live block device
sudo mkdir -p /mnt/recovery-live
sudo mount /dev/longhorn/<pv-name> /mnt/recovery-live

# Mount the reconstructed image (if not already mounted)
sudo mkdir -p /mnt/recovery-src
sudo mount -o loop /tmp/<pvc-name>-merged.img /mnt/recovery-src

# Sync: only the application data files, not lost+found
sudo rsync -av --exclude='lost+found' /mnt/recovery-src/ /mnt/recovery-live/

# Verify
sudo ls -lah /mnt/recovery-live/

# Unmount both
sudo umount /mnt/recovery-src
sudo umount /mnt/recovery-live
SHELL
```
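
Two caveats from the pitfalls table apply here: a freshly created Longhorn volume is unformatted (mkfs is needed before the first mount), and rsync may exit with code 23 on partitions damaged by the power cut. A hedged variant of the critical lines:

```bash
# Format only if the block device has no filesystem yet (new volume)
sudo blkid /dev/longhorn/<pv-name> || sudo mkfs.ext4 -F /dev/longhorn/<pv-name>

# Tolerate unreadable source blocks; rc=23 means partial transfer, not total failure
sudo rsync -av --ignore-errors --exclude='lost+found' /mnt/recovery-src/ /mnt/recovery-live/
```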

---

## Step 6 — Detach the Volume

```bash
kubectl patch volumeattachments.longhorn.io <pv-name> \
  -n longhorn-system --type json \
  -p '[{"op":"remove","path":"/spec/attachmentTickets/recovery"}]'

kubectl wait --for=jsonpath='{.status.state}'=detached \
  volumes.longhorn.io/<pv-name> -n longhorn-system --timeout=60s
```
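
As noted in the pitfalls table, the wait for `detached` can time out if a workload immediately grabs its own attachment ticket. Waiting for the `recovery` ticket to disappear is the more robust check; one possible form:

```bash
# Succeeds as soon as the recovery ticket is gone, even if another ticket attaches the volume
until [ -z "$(kubectl get volumeattachments.longhorn.io <pv-name> -n longhorn-system \
  -o jsonpath='{.spec.attachmentTickets.recovery}')" ]; do sleep 2; done
```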

---

## Step 7 — Restore PV and PVC

Clear stuck Terminating PV/PVC finalizers first if they exist:
```bash
kubectl patch pv <pv-name> --type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null
kubectl patch pvc <pvc-name> -n <namespace> --type=merge \
  -p '{"metadata":{"finalizers":null}}' 2>/dev/null
# Wait a moment for them to delete
```

Recreate the PV with `Retain` policy and no `claimRef`:
```bash
kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolume
metadata:
  name: <pv-name>
  annotations:
    pv.kubernetes.io/provisioned-by: driver.longhorn.io
spec:
  accessModes: [ReadWriteOnce]   # match original
  capacity:
    storage: <size>              # e.g. 128Mi
  csi:
    driver: driver.longhorn.io
    fsType: ext4
    volumeHandle: <pv-name>
    volumeAttributes:
      dataEngine: v1
      dataLocality: disabled
      disableRevisionCounter: "true"
      numberOfReplicas: "3"
      staleReplicaTimeout: "30"
  persistentVolumeReclaimPolicy: Retain
  storageClassName: longhorn
  volumeMode: Filesystem
EOF
```

Recreate the PVC pinned to this PV:
```bash
kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: <pvc-name>
  namespace: <namespace>
spec:
  accessModes: [ReadWriteOnce]
  resources:
    requests:
      storage: <size>
  storageClassName: longhorn
  volumeMode: Filesystem
  volumeName: <pv-name>
EOF
```

---

## Step 8 — Scale Up and Verify

```bash
kubectl scale deployment <name> -n <namespace> --replicas=1
kubectl wait --for=condition=Ready pod -l app=<name> -n <namespace> --timeout=120s
```

---

## Pitfalls Learned During 2026-04-13 Recovery

| Pitfall | What happened | Prevention |
|---------|--------------|------------|
| **Directory swap corrupts data** | Longhorn found old `Dirty: true` volume.meta + empty pi1 replica → rebuilt from empty source | Never swap dirs. Use merge tool + block device injection instead |
| **Snapshot is zeroed after swap** | Longhorn reconciliation overwrote snapshot images when rebuilding from empty replica | Back up the untouched dir FIRST before any rename |
| **Multiple dirs per volume on pi3** | Rebuild attempts during the incident created extra dirs | Identify the untouched dir by timestamp AND verify non-zero content with `od` |
| **`Rebuilding: true` replica → all-zeros merged image** | Phase 0 picked a replica mid-rebuild (1.3 GiB actual data, sparse files look large) — merge tool produced an all-zeros image | Check `volume.meta` and skip any dir with `"Rebuilding": true` before merging |
| **`du -sb` gives misleading apparent sizes** | Sparse replica files (8 GiB file, 1.3 GiB actual) appeared larger than healthy 11 GiB replicas | Use `du -sk` (actual disk blocks) not `du -sb` (apparent/logical size) to rank replicas |
| **Dirty journal prevents ro mount** | `mount -o loop,ro` fails with "bad superblock" on an ext4 with unclean shutdown | Use `mount -o loop,ro,noload` to skip journal replay for read-only access |
| **New volume is unformatted** | `mount /dev/longhorn/<pv>` fails with "wrong fs type" on a freshly created volume | Run `mkfs.ext4 -F` before mounting; guard with `blkid` to skip if already formatted |
| **rsync rc=23 on power-cut partitions** | Some filesystem blocks were unreadable ("Structure needs cleaning") → rsync exits 23 | Use `rsync --ignore-errors`; rc=23 is a partial transfer, not a total failure |
| **pod blocks volume re-attach** | Old Error-state pod held a volume attachment claim | Delete old Error pods before scaling up new ones |
| **`kubectl cp` needs `tar`** | Distroless container had no `tar` binary | Mount block device directly on the node instead |
| **VolumeAttachment ticket removal** | Deleting a VolumeAttachment object causes Longhorn to immediately recreate it | Patch the `recovery` key out of `spec.attachmentTickets` instead of deleting the object |
| **Phase 7 wait for `detached` times out** | After removing the recovery ticket, a workload may immediately create its own ticket | Wait for the `recovery` ticket to disappear from `spec.attachmentTickets`, not for full detach |
| **StatefulSet pods not found by label** | `kubectl get pod -l app=<name>` returns nothing for StatefulSet pods | Wait on `readyReplicas ≥ 1` on the StatefulSet object, not on pod labels |
| **`set_fact` overridden by `-e @file`** | Ansible extra vars have highest precedence — `set_fact: longhorn_recovery_volumes` was silently ignored | Use a different variable name (`_volumes`) for the resolved list, never reassign the extra var name |

---

## Identifying the Right Replica Directory

When multiple old dirs exist for the same volume on a node, pick the one to use for recovery:

1. **Skip `Rebuilding: true`:** check `volume.meta` first — a dir that was being rebuilt when
   the incident happened has incomplete data (sparse files are allocated but mostly zeroed):
   ```bash
   python3 -c "import json; d=json.load(open('volume.meta')); print('Rebuilding:', d['Rebuilding'])"
   ```
   Only consider dirs where `Rebuilding: false`.

2. **Actual size:** `sudo du -sk <dir>` (actual disk usage in KB — not `du -sb` which returns
   apparent/logical size and is misleading for sparse files). Pick the largest actual size.

3. **Timestamps:** prefer the most recently modified before the incident date.

4. **Snapshot chain:** if Rebuilding is false on multiple dirs, check `volume.meta` for
   `"Dirty": false` (clean shutdown) vs `"Dirty": true`. Prefer clean if available.

5. **Content check:** verify the snapshot is not all zeros:
   ```bash
   sudo od -A x -t x1z -v volume-snap-*.img | grep -v ' 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00' | head -3
   ```
   If the output is empty (all zeros), the snapshot was overwritten. Try another node.

**Summary rule:** `Rebuilding: false` → largest `du -sk` → non-zero snapshot content.
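
The selection procedure above can be scripted. This is a small helper sketch that applies the same checks to every candidate directory on one node; it reuses the snippets above and is illustrative only, not part of the playbooks:

```bash
#!/usr/bin/env bash
# Rank candidate replica dirs for one volume: skip Rebuilding dirs, show actual size, flag non-zero snapshots
for dir in /mnt/arcodange/longhorn/replicas/<pv-prefix>-*/; do
  rebuilding=$(sudo python3 -c "import json,sys; print(json.load(open(sys.argv[1]))['Rebuilding'])" "$dir/volume.meta")
  [ "$rebuilding" = "True" ] && { echo "SKIP (Rebuilding) $dir"; continue; }

  size_kb=$(sudo du -sk "$dir" | cut -f1)

  snap=$(sudo ls -t "$dir"volume-snap-*.img 2>/dev/null | head -1)
  [ -n "$snap" ] || { echo "SKIP (no snapshot) $dir"; continue; }
  # Non-empty output means at least one non-zero 16-byte line in the newest snapshot
  nonzero=$(sudo od -A x -t x1z -v "$snap" | grep -v ' 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00' | head -1)

  printf '%10s KB  %s  %s\n' "$size_kb" "${nonzero:+DATA}" "$dir"
done | sort -rn
```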

---

## Reference: Key Commands

```bash
# List all replica dirs for a volume across all nodes
for n in pi1 pi2 pi3; do echo "==$n=="; ssh $n "sudo ls /mnt/arcodange/longhorn/replicas/ | grep <pv-prefix>"; done

# Check Longhorn volume state
kubectl get volumes.longhorn.io -n longhorn-system <pv-name>

# Check VolumeAttachment tickets
kubectl get volumeattachments.longhorn.io -n longhorn-system <pv-name> \
  -o jsonpath='{.spec.attachmentTickets}'

# Check Longhorn block device existence on a node
ssh <node> "ls /dev/longhorn/<pv-name>"

# Verify filesystem content without starting the app
ssh <node> "sudo mount /dev/longhorn/<pv-name> /mnt/check && sudo ls /mnt/check && sudo umount /mnt/check"
```
ansible/arcodange/factory/inventory/group_vars/all/gitea.yml (new file, 11 lines)
@@ -0,0 +1,11 @@
|
||||
---
|
||||
# Gitea ownership configuration consumed by playbooks running on `localhost`
|
||||
# (e.g. tools/hashicorp_vault.yml). Role-level defaults (gitea_username,
|
||||
# gitea_organization) live in roles/gitea_secret/defaults/main.yml ; this file
|
||||
# is for fact lists that the inventory should declare.
|
||||
|
||||
# Users (Gitea owner_type=user) to which org-level Gitea Action secrets must
|
||||
# also be propagated. Repos owned by these users cannot read org-level secrets,
|
||||
# so the secret propagation playbook iterates over this list.
|
||||
gitea_secret_propagation_users:
|
||||
- arcodange
|
||||
@@ -1,4 +1,4 @@
|
||||
gitea_version: 1.24.3
|
||||
gitea_version: 1.25.5
|
||||
|
||||
gitea_database:
|
||||
db_name: gitea
|
||||
@@ -35,11 +35,10 @@ gitea:
|
||||
GITEA__mailer__PASSWD: '{{ gitea_vault.GITEA__mailer__PASSWD }}'
|
||||
GITEA__server__SSH_PORT: 2222
|
||||
GITEA__server__SSH_DOMAIN: "{{ hostvars[groups.gitea[0]]['preferred_ip'] }}"
|
||||
# GITEA__server__SSH_DOMAIN: "{{ lookup('dig', groups.gitea[0]) }}" # might work again if deactivate rpi wifi
|
||||
GITEA__server__SSH_LISTEN_PORT: 22
|
||||
GITEA_server__DOMAIN: localhost
|
||||
GITEA_server__HTTP_PORT: 3000
|
||||
GITEA_server__ROOT_URL: https://gitea.arcodange.duckdns.org/
|
||||
GITEA_server__ROOT_URL: https://gitea.arcodange.lab/
|
||||
GITEA_server__START_SSH_SERVER: true
|
||||
GITEA_server__OFFLINE_MODE: true
|
||||
GITEA_service__DISABLE_REGISTRATION: true
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
step_ca_primary: pi1
|
||||
|
||||
step_ca_fqdn: ssl-ca.arcodange.lab
|
||||
|
||||
step_ca_user: step
|
||||
step_ca_home: /home/step
|
||||
step_ca_dir: /home/step/.step
|
||||
|
||||
step_ca_listen_address: ":8443"
|
||||
@@ -0,0 +1,13 @@
|
||||
$ANSIBLE_VAULT;1.1;AES256
|
||||
35633437343661363030323466313735373033373566643530653539633133623462333337393037
|
||||
6336653635366439363031616637313339373465666433320a653936396438373132623264386665
|
||||
66623330343439613636353963373139363531613761613864623262623661666565373137306461
|
||||
3062646337353331300a636164643462343163303931646538653537323831623736393634343137
|
||||
39376139306165356138383664373334353364316435303265643965386135356561316130316239
|
||||
64393436363436393339393130383764353231333361313565333934313136666234356433626437
|
||||
35656666386538653963653334393262366562656631376636353538383661386661366438366133
|
||||
64346338666666323562313363363836613439633931306437393132616134666230613936623634
|
||||
34383366663031336236316566626666303764323631363239636461396366323733393731376563
|
||||
65356630326536333133393335383766616631323732333262396464326165366532383066363761
|
||||
37303033316135616661623431623836313965373930376361656334323336656561643336616265
|
||||
36666235623564383132
|
||||
@@ -30,6 +30,7 @@ local:
|
||||
hosts:
|
||||
localhost:
|
||||
ansible_connection: local
|
||||
ansible_python_interpreter: "{{ ansible_playbook_python }}"
|
||||
pi1:
|
||||
pi2:
|
||||
pi3:
|
||||
@@ -42,6 +43,17 @@ gitea:
|
||||
children:
|
||||
postgres:
|
||||
|
||||
pihole:
|
||||
hosts:
|
||||
pi1:
|
||||
pi3:
|
||||
|
||||
step_ca:
|
||||
hosts:
|
||||
pi1:
|
||||
pi2:
|
||||
pi3:
|
||||
|
||||
all:
|
||||
children:
|
||||
raspberries:
|
||||
@@ -1,422 +1,2 @@
|
||||
---
|
||||
|
||||
- name: Prepare disks for longhorn
|
||||
ansible.builtin.import_playbook: ./prepare_disks.yml
|
||||
|
||||
- name: System Docker
|
||||
hosts: raspberries:&local
|
||||
gather_facts: yes
|
||||
tags: never
|
||||
become: yes
|
||||
|
||||
pre_tasks:
|
||||
|
||||
- name: set hostname
|
||||
ansible.builtin.hostname:
|
||||
name: "{{ inventory_hostname }}"
|
||||
become: yes
|
||||
when: inventory_hostname != ansible_hostname
|
||||
|
||||
- name: Prevent apt source conflict
|
||||
ansible.builtin.file:
|
||||
state: absent
|
||||
path: /etc/apt/sources.list.d/docker.list
|
||||
become: yes
|
||||
|
||||
- name: Install role geerlingguy.docker
|
||||
community.general.ansible_galaxy_install:
|
||||
type: role
|
||||
name: geerlingguy.docker
|
||||
run_once: true
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
|
||||
- ansible.builtin.debug:
|
||||
var: ansible_facts.machine
|
||||
|
||||
tasks:
|
||||
|
||||
- include_role:
|
||||
name: geerlingguy.docker
|
||||
|
||||
post_tasks:
|
||||
- name: adding existing user '{{ ansible_user }}' to group docker
|
||||
user:
|
||||
name: '{{ ansible_user }}'
|
||||
groups: docker
|
||||
append: yes
|
||||
become: yes
|
||||
|
||||
#---
|
||||
|
||||
- name: Install iSCSI client for Longhorn on Raspberry Pi
|
||||
hosts: raspberries:&local
|
||||
become: yes
|
||||
tasks:
|
||||
- name: Install open-iscsi
|
||||
ansible.builtin.apt:
|
||||
name: open-iscsi
|
||||
state: present
|
||||
update_cache: yes
|
||||
|
||||
- name: Enable and start iSCSI service
|
||||
ansible.builtin.service:
|
||||
name: iscsid
|
||||
state: started
|
||||
enabled: yes
|
||||
|
||||
- name: Install cryptsetup
|
||||
ansible.builtin.apt:
|
||||
name: cryptsetup
|
||||
state: present
|
||||
update_cache: yes
|
||||
|
||||
- name: Load the dm_crypt kernel module
|
||||
ansible.builtin.modprobe:
|
||||
name: dm_crypt
|
||||
state: present
|
||||
|
||||
- name: Ensure the dm_crypt module is loaded at boot
|
||||
ansible.builtin.lineinfile:
|
||||
path: /etc/modules
|
||||
line: dm_crypt
|
||||
state: present
|
||||
|
||||
- name: Create the longhorn directory
|
||||
ansible.builtin.file:
|
||||
path: /mnt/arcodange/longhorn
|
||||
state: directory
|
||||
owner: pi
|
||||
group: docker
|
||||
mode: '0774'
|
||||
ignore_errors: true
|
||||
|
||||
#---
|
||||
|
||||
- name: System K3S
|
||||
hosts: raspberries:&local
|
||||
tags: never
|
||||
|
||||
tasks:
|
||||
- name: prepare inventory for k3s external playbook
|
||||
tags: always
|
||||
ansible.builtin.add_host:
|
||||
hostname: "{{ item }}"
|
||||
groups:
|
||||
- k3s_cluster
|
||||
- "{{ ansible_loop.first | ternary('server', 'agent') }}"
|
||||
loop: "{{ groups.raspberries | intersect(groups.local) | sort }}"
|
||||
loop_control:
|
||||
extended: true
|
||||
extended_allitems: false
|
||||
|
||||
- name: Install collection k3s.orchestration
|
||||
local_action:
|
||||
module: community.general.ansible_galaxy_install
|
||||
type: collection
|
||||
name: git+https://github.com/k3s-io/k3s-ansible
|
||||
run_once: true
|
||||
|
||||
- name: k3s
|
||||
tags: never,k3s
|
||||
ansible.builtin.import_playbook: k3s.orchestration.site
|
||||
# ansible.builtin.import_playbook: k3s.orchestration.upgrade
|
||||
# ansible.builtin.import_playbook: k3s.orchestration.reset
|
||||
vars:
|
||||
k3s_version: v1.34.1+k3s1
|
||||
extra_server_args: "--docker --disable traefik"
|
||||
extra_agent_args: "--docker"
|
||||
api_endpoint: "{{ hostvars[groups['server'][0]]['ansible_host'] | default(groups['server'][0]) }}"
|
||||
|
||||
- name: how to reach k3s
|
||||
hosts: server
|
||||
tasks:
|
||||
- name: copy /etc/rancher/k3s/k3s.yaml to ~/.kube/config from the k3s server and replace 127.0.0.1 with the server ip or hostname
|
||||
run_once: true
|
||||
block:
|
||||
- ansible.builtin.fetch:
|
||||
src: /etc/rancher/k3s/k3s.yaml
|
||||
dest: ~/.kube/config
|
||||
flat: true
|
||||
become: true
|
||||
run_once: true
|
||||
- local_action:
|
||||
module: ansible.builtin.replace
|
||||
path: ~/.kube/config
|
||||
regexp: 'server: https://127.0.0.1:6443'
|
||||
replace: 'server: https://{{ ansible_default_ipv4.address }}:6443'
|
||||
|
||||
# - name: setup hard disk
|
||||
# tags: never
|
||||
# ansible.builtin.import_playbook: ./setup/hard_disk_v2.yml
|
||||
# # vars:
|
||||
# # hard_disk__partitions:
|
||||
# # nfs: []
|
||||
|
||||
- name: setup longhorn for volumes https://docs.k3s.io/helm
|
||||
become: true
|
||||
ansible.builtin.copy:
|
||||
dest: /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml
|
||||
content: |-
|
||||
apiVersion: helm.cattle.io/v1
|
||||
kind: HelmChart
|
||||
metadata:
|
||||
annotations:
|
||||
helmcharts.cattle.io/managed-by: helm-controller
|
||||
finalizers:
|
||||
- wrangler.cattle.io/on-helm-chart-remove
|
||||
generation: 1
|
||||
name: longhorn-install
|
||||
namespace: kube-system
|
||||
spec:
|
||||
version: v1.9.1
|
||||
chart: longhorn
|
||||
repo: https://charts.longhorn.io
|
||||
failurePolicy: abort
|
||||
targetNamespace: longhorn-system
|
||||
createNamespace: true
|
||||
valuesContent: |-
|
||||
defaultSettings:
|
||||
defaultDataPath: /mnt/arcodange/longhorn
|
||||
vars:
|
||||
longhorn_helm_values: {} # https://github.com/longhorn/longhorn/blob/master/chart/values.yaml
|
||||
|
||||
- name: customize k3s traefik configuration https://docs.k3s.io/helm
|
||||
block:
|
||||
- name: Get my public IP
|
||||
community.general.ipify_facts:
|
||||
- become: true
|
||||
ansible.builtin.copy:
|
||||
dest: /var/lib/rancher/k3s/server/manifests/traefik-v3.yaml
|
||||
content: |-
|
||||
apiVersion: v1
|
||||
data:
|
||||
dynamic.yaml: |-
|
||||
{{ traefik_config_yaml | to_nice_yaml | indent( width=4 ) }}
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: traefik-configmap
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: helm.cattle.io/v1
|
||||
kind: HelmChart
|
||||
metadata:
|
||||
name: traefik
|
||||
namespace: kube-system
|
||||
spec:
|
||||
repo: https://traefik.github.io/charts
|
||||
chart: traefik
|
||||
version: v37.4.0
|
||||
targetNamespace: kube-system
|
||||
valuesContent: |-
|
||||
{{ traefik_helm_values | to_nice_yaml | indent( width=4 ) }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: gitea-external
|
||||
namespace: kube-system
|
||||
spec:
|
||||
type: ExternalName
|
||||
externalName: {{ hostvars[groups.gitea[0]]['preferred_ip'] }}
|
||||
ports:
|
||||
- port: 3000
|
||||
targetPort: 3000
|
||||
vars:
|
||||
traefik_config_yaml:
|
||||
http:
|
||||
services:
|
||||
gitea:
|
||||
loadBalancer:
|
||||
servers:
|
||||
- url: "http://{{ hostvars[groups.gitea[0]]['preferred_ip'] }}:3000"
|
||||
# - url: "http://{{ lookup('dig', groups.gitea[0]) }}:3000" # might work again if deactivate rpi wifi
|
||||
routers:
|
||||
dashboard:
|
||||
rule: Host(`traefik.arcodange.duckdns.org`)
|
||||
service: api@internal
|
||||
middlewares:
|
||||
- localIp
|
||||
tls:
|
||||
certResolver: letsencrypt
|
||||
domains:
|
||||
- main: "arcodange.duckdns.org"
|
||||
sans:
|
||||
- "traefik.arcodange.duckdns.org"
|
||||
entryPoints:
|
||||
- websecure
|
||||
- web
|
||||
acme-challenge:
|
||||
rule: Host(`arcodange.duckdns.org`) && PathPrefix(`/.well-known/acme-challenge`)
|
||||
service: acme-http@internal
|
||||
tls:
|
||||
certResolver: letsencrypt
|
||||
domains:
|
||||
- main: "arcodange.duckdns.org"
|
||||
sans:
|
||||
- "*.arcodange.duckdns.org"
|
||||
entryPoints:
|
||||
- websecure
|
||||
- web
|
||||
gitea:
|
||||
rule: Host(`gitea.arcodange.duckdns.org`)
|
||||
service: gitea
|
||||
middlewares:
|
||||
- localIp
|
||||
tls:
|
||||
certResolver: letsencrypt
|
||||
domains:
|
||||
- main: "arcodange.duckdns.org"
|
||||
sans:
|
||||
- "gitea.arcodange.duckdns.org"
|
||||
entrypoints:
|
||||
- websecure
|
||||
middlewares:
|
||||
localIp:
|
||||
ipAllowList:
|
||||
sourceRange:
|
||||
- "172.16.0.0/12"
|
||||
- "10.42.0.0/16"
|
||||
- "192.168.1.0/24"
|
||||
- "{{ ipify_public_ip }}/32"
|
||||
# - "0.0.0.0/0"
|
||||
# ipStrategy:
|
||||
# depth: 1
|
||||
traefik_helm_values:
|
||||
deployment:
|
||||
kind: "Deployment"
|
||||
initContainers:
|
||||
- name: volume-permissions
|
||||
image: busybox:latest
|
||||
command: ["sh", "-c", "touch /data/acme.json; chmod -v 600 /data/acme.json"]
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
# default is https://github.com/traefik/traefik-helm-chart/blob/v25.0.0/traefik/values.yaml <- for v25 (`kubectl describe deployments.apps traefik -n kube-system | grep helm.sh/chart`)
|
||||
# current is https://github.com/traefik/traefik-helm-chart/blob/v37.4.0/traefik/values.yaml
|
||||
nodeSelector:
|
||||
node-role.kubernetes.io/control-plane: 'true' # make a predictable choice of node to direct https traffic to, and avoid NAT/loss of client IP
|
||||
service:
|
||||
spec:
|
||||
externalTrafficPolicy: Local
|
||||
ports:
|
||||
traefik:
|
||||
expose:
|
||||
default: true
|
||||
web:
|
||||
forwardedHeaders:
|
||||
trustedIPs: ["10.42.0.0/16"] #default k3s cidr
|
||||
ingressRoute:
|
||||
dashboard:
|
||||
enabled: true
|
||||
globalArguments: [] # deactivate --global.sendanonymoususage
|
||||
env:
|
||||
- name: POD_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: POD_NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.namespace
|
||||
- name: LEGO_DISABLE_CNAME_SUPPORT
|
||||
value: 'true'
|
||||
logs:
|
||||
general:
|
||||
level: INFO
|
||||
# format: json
|
||||
access:
|
||||
enabled: true
|
||||
# format: json
|
||||
podSecurityContext:
|
||||
runAsGroup: 65532
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65532
|
||||
fsGroup: 65532 # else the persistent volume might be owned by root and be unwriteable
|
||||
persistence:
|
||||
# -- Enable persistence using Persistent Volume Claims
|
||||
# ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
|
||||
# It can be used to store TLS certificates, see `storage` in certResolvers
|
||||
enabled: true
|
||||
name: data
|
||||
# existingClaim: ""
|
||||
accessMode: ReadWriteOnce
|
||||
size: 128Mi
|
||||
storageClass: "longhorn"
|
||||
# volumeName: ""
|
||||
path: /data
|
||||
annotations: {}
|
||||
volumes:
|
||||
- name: traefik-configmap
|
||||
mountPath: /config
|
||||
type: configMap
|
||||
experimental:
|
||||
plugins:
|
||||
crowdsec-bouncer:
|
||||
moduleName: github.com/maxlerebourg/crowdsec-bouncer-traefik-plugin #https://plugins.traefik.io/plugins/6335346ca4caa9ddeffda116/crowdsec-bouncer-traefik-plugin
|
||||
version: v1.3.3
|
||||
additionalArguments:
|
||||
- '--providers.file.filename=/config/dynamic.yaml'
|
||||
- '--providers.kubernetesingress.ingressendpoint.publishedservice=kube-system/traefik'
|
||||
- "--providers.kubernetescrd.allowcrossnamespace=true"
|
||||
- "--providers.kubernetescrd.allowExternalNameServices=true"
|
||||
certificatesResolvers:
|
||||
letsencrypt:
|
||||
acme:
|
||||
# for challenge options cf. https://doc.traefik.io/traefik/https/acme/
|
||||
email: arcodange@gmail.com
|
||||
tlsChallenge: true
|
||||
dnsChallenge:
|
||||
# requires env variable DUCKDNS_TOKEN
|
||||
provider: duckdns
|
||||
propagation:
|
||||
delayBeforeChecks: 120
|
||||
disableChecks: true
|
||||
resolvers:
|
||||
- "1.1.1.1:53"
|
||||
- "8.8.8.8:53"
|
||||
httpChallenge:
|
||||
entryPoint: "web"
|
||||
# It has to match the path with a persistent volume
|
||||
storage: /data/acme.json
|
||||
envFrom:
|
||||
- secretRef:
|
||||
name: traefik-duckdns-token
|
||||
# MY_TOKEN=<my token (see https://www.duckdns.org/domains)>
|
||||
# kubectl create secret generic traefik-duckdns-token --from-literal="DUCKDNS_TOKEN=$MY_TOKEN" -n kube-system
|
||||
- name: touch manifests/traefik.yaml to trigger update
|
||||
ansible.builtin.file:
|
||||
path: /var/lib/rancher/k3s/server/manifests/traefik-v3.yaml
|
||||
state: touch
|
||||
become: true
|
||||
|
||||
|
||||
# ---
|
||||
|
||||
- name: redeploy traefik
|
||||
hosts: localhost
|
||||
tasks:
|
||||
- name: delete old traefik deployment
|
||||
kubernetes.core.k8s:
|
||||
api_version: v1
|
||||
name: traefik
|
||||
kind: Deployment
|
||||
namespace: kube-system
|
||||
state: "absent"
|
||||
- name: delete old deployment job so the k3s helm controller redeploy with our new configuration
|
||||
kubernetes.core.k8s:
|
||||
api_version: batch/v1
|
||||
name: helm-install-traefik
|
||||
kind: Job
|
||||
namespace: kube-system
|
||||
state: "absent"
|
||||
- name: get traefik deployment
|
||||
kubernetes.core.k8s_info:
|
||||
api_version: v1
|
||||
name: traefik
|
||||
kind: Deployment
|
||||
namespace: kube-system
|
||||
wait: true
|
||||
register: traefik_deployment
|
||||
- ansible.builtin.debug:
|
||||
var: traefik_deployment
|
||||
- name: system
|
||||
ansible.builtin.import_playbook: ./system/system.yml
|
||||
@@ -27,19 +27,29 @@
|
||||
container_name: gitea_action
|
||||
restart: always
|
||||
environment:
|
||||
CONFIG_FILE: /config.yaml
|
||||
GITEA_INSTANCE_URL: >-
|
||||
http://{{ hostvars[groups.gitea[0]].ansible_host }}:3000
|
||||
GITEA_RUNNER_REGISTRATION_TOKEN: "{{ gitea_runner_token_cmd.stdout }}"
|
||||
GITEA_RUNNER_NAME: arcodange_global_runner_{{ inventory_hostname }}
|
||||
# GITEA_RUNNER_LABELS: host={{ansible_host}},env=any
|
||||
GITEA_RUNNER_LABELS: ubuntu-latest:docker://gitea.arcodange.lab/arcodange-org/runner-images:ubuntu-latest-ca,ubuntu-latest-ca:docker://gitea.arcodange.lab/arcodange-org/runner-images:ubuntu-latest-ca
|
||||
ports:
|
||||
- "43707:43707"
|
||||
networks:
|
||||
- gitea_action_network
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
- /etc/timezone:/etc/timezone:ro
|
||||
- /etc/localtime:/etc/localtime:ro
|
||||
extra_hosts:
|
||||
gitea.arcodange.duckdns.org: '{{ lookup("dig", "gitea.arcodange.duckdns.org") }}'
|
||||
- /etc/ssl/certs:/etc/ssl/certs:ro
|
||||
- /usr/local/share/ca-certificates/:/usr/local/share/ca-certificates/:ro
|
||||
- /mnt/arcodange/gitea-runner-cache:/home/git/.cache/actcache
|
||||
- /mnt/arcodange/gitea-runner-act:/root/.cache/act
|
||||
configs:
|
||||
- config.yaml
|
||||
networks:
|
||||
gitea_action_network:
|
||||
name: gitea_action_network
|
||||
configs:
|
||||
config.yaml:
|
||||
content: |
|
||||
@@ -78,23 +88,22 @@
|
||||
# If it's empty when registering, it will ask for inputting labels.
|
||||
# If it's empty when execute `daemon`, will use labels in `.runner` file.
|
||||
labels:
|
||||
- "ubuntu-latest:docker://gitea/runner-images:ubuntu-latest"
|
||||
- "ubuntu-22.04:docker://gitea/runner-images:ubuntu-22.04"
|
||||
- "ubuntu-20.04:docker://gitea/runner-images:ubuntu-20.04"
|
||||
- "ubuntu-latest:docker://gitea.arcodange.lab/arcodange-org/runner-images:ubuntu-latest-ca"
|
||||
- "ubuntu-latest-ca:docker://gitea.arcodange.lab/arcodange-org/runner-images:ubuntu-latest-ca"
|
||||
|
||||
cache:
|
||||
# Enable cache server to use actions/cache.
|
||||
enabled: true
|
||||
# The directory to store the cache data.
|
||||
# If it's empty, the cache data will be stored in $HOME/.cache/actcache.
|
||||
dir: ""
|
||||
dir: "/home/git/.cache/actcache"
|
||||
# The host of the cache server.
|
||||
# It's not for the address to listen, but the address to connect from job containers.
|
||||
# So 0.0.0.0 is a bad choice, leave it empty to detect automatically.
|
||||
host: "{{ ansible_default_ipv4.address }}"
|
||||
# The port of the cache server.
|
||||
# 0 means to use a random available port.
|
||||
port: 0
|
||||
port: 43707
|
||||
# The external cache server URL. Valid only when enable is true.
|
||||
# If it's specified, act_runner will use this URL as the ACTIONS_CACHE_URL rather than start a server by itself.
|
||||
# The URL should generally end with "/".
|
||||
@@ -131,7 +140,7 @@
|
||||
# If it's not empty or "-", the specified docker host will be used. An error will be returned if it doesn't work.
|
||||
docker_host: ""
|
||||
# Pull docker image(s) even if already present
|
||||
force_pull: true
|
||||
force_pull: false
|
||||
# Rebuild docker image(s) even if already present
|
||||
force_rebuild: false
|
||||
|
||||
@@ -143,193 +152,8 @@
|
||||
community.docker.docker_compose_v2:
|
||||
project_src: "/home/pi/arcodange/docker_composes/arcodange_factory_gitea_action"
|
||||
pull: missing
|
||||
state: present
|
||||
state: "{{ docker_compose_down_then_up }}"
|
||||
register: deploy_result
|
||||
|
||||
- name: Set PACKAGES_TOKEN secret to upload packages from CI
|
||||
run_once: True
|
||||
block:
|
||||
- name: Generate cicd PACKAGES_TOKEN
|
||||
include_role:
|
||||
name: arcodange.factory.gitea_token
|
||||
vars:
|
||||
gitea_token_name: PACKAGES_TOKEN
|
||||
gitea_token_fact_name: cicd_PACKAGES_TOKEN
|
||||
gitea_token_scopes: write:package
|
||||
gitea_token_replace: true
|
||||
|
||||
- name: Register cicd PACKAGES_TOKEN secrets
|
||||
include_role:
|
||||
name: arcodange.factory.gitea_secret
|
||||
vars:
|
||||
gitea_secret_name: PACKAGES_TOKEN
|
||||
gitea_secret_value: "{{ cicd_PACKAGES_TOKEN }}"
|
||||
loop: ["organization", "user"]
|
||||
loop_control:
|
||||
loop_var: gitea_owner_type # can be "user" or "organization"
|
||||
|
||||
post_tasks:
|
||||
- include_role:
|
||||
name: arcodange.factory.gitea_token
|
||||
vars:
|
||||
gitea_token_delete: true
|
||||
|
||||
|
||||
- name: Deploy Argo CD
|
||||
hosts: localhost
|
||||
roles:
|
||||
- role: arcodange.factory.gitea_token # generate gitea_api_token used to replace generated token with set name if required
|
||||
tags:
|
||||
- gitea_sync
|
||||
tasks:
|
||||
- name: Set factory repo
|
||||
include_role:
|
||||
name: arcodange.factory.gitea_repo
|
||||
vars:
|
||||
gitea_repo_name: factory
|
||||
- name: Sync other repos
|
||||
tags: gitea_sync
|
||||
include_role:
|
||||
name: arcodange.factory.gitea_sync
|
||||
apply:
|
||||
tags: gitea_sync
|
||||
- name: Generate Argo CD token
|
||||
include_role:
|
||||
name: arcodange.factory.gitea_token
|
||||
vars:
|
||||
gitea_token_name: ARGOCD_TOKEN
|
||||
gitea_token_fact_name: argocd_token
|
||||
gitea_token_scopes: read:repository,read:package
|
||||
gitea_token_replace: true
|
||||
- name: Figure out k3s master node
|
||||
shell:
|
||||
kubectl get nodes -l node-role.kubernetes.io/control-plane=true -o name | sed s'#node/##'
|
||||
register: get_k3s_master_node
|
||||
changed_when: false
|
||||
- name: Get kubernetes server internal url
|
||||
command: >-
|
||||
echo https://kubernetes.default.svc
|
||||
# {%raw%}
|
||||
# kubectl get svc/kubernetes -o template="{{.spec.clusterIP}}:{{(index .spec.ports 0).port}}"
|
||||
# {%endraw%}
|
||||
register: get_k3s_internal_server_url
|
||||
changed_when: false
|
||||
- set_fact:
|
||||
k3s_master_node: "{{ get_k3s_master_node.stdout }}"
|
||||
k3s_internal_server_url: "{{ get_k3s_internal_server_url.stdout }}"
|
||||
- name: Install Argo CD
|
||||
become: true
|
||||
delegate_to: "{{ k3s_master_node }}"
|
||||
vars:
|
||||
gitea_credentials:
|
||||
username: arcodange
|
||||
password: "{{ argocd_token }}"
|
||||
argocd_helm_values: # https://github.com/argoproj/argo-helm/blob/main/charts/argo-cd/values.yaml
|
||||
global:
|
||||
domain: argocd.arcodange.duckdns.org
|
||||
configs:
|
||||
params:
|
||||
server.insecure: true # let k3s traefik do TLS termination
|
||||
ansible.builtin.copy:
|
||||
dest: /var/lib/rancher/k3s/server/manifests/argocd.yaml
|
||||
content: |-
|
||||
apiVersion: v1
|
||||
kind: Namespace
|
||||
metadata:
|
||||
name: argocd
|
||||
---
|
||||
apiVersion: helm.cattle.io/v1
|
||||
kind: HelmChart
|
||||
metadata:
|
||||
name: argocd
|
||||
namespace: kube-system
|
||||
spec:
|
||||
repo: https://argoproj.github.io/argo-helm
|
||||
chart: argo-cd
|
||||
targetNamespace: argocd
|
||||
valuesContent: |-
|
||||
{{ argocd_helm_values | to_nice_yaml | indent( width=4 ) }}
|
||||
---
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: Ingress
|
||||
metadata:
|
||||
name: argocd-server-ingress
|
||||
namespace: argocd
|
||||
annotations:
|
||||
# For Traefik v2.x
|
||||
traefik.ingress.kubernetes.io/router.entrypoints: websecure
|
||||
traefik.ingress.kubernetes.io/router.tls: "true"
|
||||
traefik.ingress.kubernetes.io/router.tls.certresolver: letsencrypt
|
||||
traefik.ingress.kubernetes.io/router.tls.domains.0.main: arcodange.duckdns.org
|
||||
traefik.ingress.kubernetes.io/router.tls.domains.0.sans: argocd.arcodange.duckdns.org
|
||||
traefik.ingress.kubernetes.io/router.middlewares: localIp@file
|
||||
spec:
|
||||
rules:
|
||||
- host: argocd.arcodange.duckdns.org
|
||||
http:
|
||||
paths:
|
||||
- path: /
|
||||
pathType: Prefix
|
||||
backend:
|
||||
service:
|
||||
name: argocd-server
|
||||
port:
|
||||
number: 80 #TLS is terminated at Traefik
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: gitea-arcodangeorg-factory-repo
|
||||
namespace: argocd
|
||||
labels:
|
||||
argocd.argoproj.io/secret-type: repository
|
||||
stringData:
|
||||
type: git
|
||||
url: https://gitea.arcodange.duckdns.org/arcodange-org/factory
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: gitea-arcodangeorg-repo-creds
|
||||
namespace: argocd
|
||||
labels:
|
||||
argocd.argoproj.io/secret-type: repo-creds
|
||||
stringData:
|
||||
type: git
|
||||
url: https://gitea.arcodange.duckdns.org/arcodange-org
|
||||
password: {{ gitea_credentials.password }}
|
||||
username: {{ gitea_credentials.username }}
|
||||
---
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
metadata:
|
||||
name: factory
|
||||
namespace: argocd
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: https://gitea.arcodange.duckdns.org/arcodange-org/factory
|
||||
targetRevision: HEAD
|
||||
path: argocd
|
||||
destination:
|
||||
server: {{ k3s_internal_server_url }}
|
||||
namespace: argocd
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
- name: touch manifests/argocd.yaml to trigger update
|
||||
delegate_to: "{{ k3s_master_node }}"
|
||||
ansible.builtin.file:
|
||||
path: /var/lib/rancher/k3s/server/manifests/argocd.yaml
|
||||
state: touch
|
||||
become: true
|
||||
post_tasks:
|
||||
- include_role:
|
||||
name: arcodange.factory.gitea_token
|
||||
apply:
|
||||
tags: gitea_sync
|
||||
tags:
|
||||
- gitea_sync
|
||||
vars:
|
||||
gitea_token_delete: true
|
||||
loop: ["absent", "present"]
|
||||
loop_control:
|
||||
loop_var: docker_compose_down_then_up
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
---
|
||||
- name: setup cron report
|
||||
ansible.builtin.import_playbook: cron_report.yml
|
||||
# - name: setup cron report
|
||||
# ansible.builtin.import_playbook: cron_report.yml
|
||||
|
||||
- name: postgres
|
||||
ansible.builtin.import_playbook: postgres.yml
|
||||
@@ -12,4 +12,10 @@
|
||||
ansible.builtin.import_playbook: gitea.yml
|
||||
vars:
|
||||
backup_root_dir: "/mnt/backups"
|
||||
backup_dirname: "gitea"
|
||||
backup_dirname: "gitea"
|
||||
|
||||
- name: k3s_pvc
|
||||
ansible.builtin.import_playbook: k3s_pvc.yml
|
||||
vars:
|
||||
backup_root_dir: "/mnt/backups"
|
||||
backup_dirname: "k3s_pvc"
|
||||
@@ -24,7 +24,7 @@
|
||||
name:
|
||||
- postfix
|
||||
- msmtp
|
||||
- msmtp-mta
|
||||
# - msmtp-mta # conflicts with recent pi setup - may be required by pi2 with old setup
|
||||
- mailutils
|
||||
state: present
|
||||
update_cache: yes
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
gitea_user: "git"
|
||||
backup_dir: "{{ backup_root_dir }}/{{ backup_dirname }}"
|
||||
scripts_dir: "/home/pi/arcodange/docker_composes/gitea/scripts"
|
||||
keep_days: 15
|
||||
keep_days: 3
|
||||
|
||||
tasks:
|
||||
- name: Ensure the backup directory exists
|
||||
@@ -22,7 +22,7 @@
|
||||
set_fact:
|
||||
backup_cmd: >-
|
||||
docker exec -u {{ gitea_user }} {{ gitea_container_name }}
|
||||
gitea dump --skip-log --skip-db --type tar.gz -c /data/gitea/conf/app.ini -C /data/gitea/ -f -
|
||||
gitea dump --skip-log --skip-db --skip-package-data --type tar.gz -c /data/gitea/conf/app.ini -C /data/gitea/ -f -
|
||||
|
||||
- name: test backup_cmd
|
||||
ansible.builtin.shell: |
|
||||
|
||||
ansible/arcodange/factory/playbooks/backup/k3s_pvc.yml (new file, 101 lines)
@@ -0,0 +1,101 @@
|
||||
---
|
||||
- name: Backup K3S Persistent Volumes
|
||||
hosts: pi1
|
||||
gather_facts: yes
|
||||
become: yes
|
||||
|
||||
vars:
|
||||
backup_dir: "{{ backup_root_dir }}/{{ backup_dirname }}"
|
||||
scripts_dir: "/opt/k3s_volumes"
|
||||
keep_days: 3
|
||||
|
||||
tasks:
|
||||
- name: Ensure the backup directory exists
|
||||
file:
|
||||
path: "{{ backup_dir }}"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Ensure the scripts directory exists
|
||||
file:
|
||||
path: "{{ scripts_dir }}"
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: define backup command
|
||||
set_fact:
|
||||
# PVs + PVCs + Longhorn Volume CRDs (critical for fast recovery — without Volume CRDs,
|
||||
# Longhorn cannot re-associate orphaned replica dirs after a reinstall and forces
|
||||
# full block-device injection recovery. See docs/adr/20260414-longhorn-pvc-recovery.md)
|
||||
backup_cmd: >-
|
||||
kubectl get -A pv,pvc -o yaml
|
||||
&& echo '---'
|
||||
&& kubectl get -A volumes.longhorn.io -o yaml
|
||||
&& echo '---'
|
||||
&& kubectl get -A settings.longhorn.io -o yaml
|
||||
|
||||
- name: test backup_cmd
|
||||
ansible.builtin.shell: |
|
||||
{{ backup_cmd }} > /dev/null
|
||||
|
||||
- name: Create the backup script
|
||||
copy:
|
||||
dest: "{{ scripts_dir }}/backup.sh"
|
||||
mode: '0755'
|
||||
content: |
|
||||
#!/bin/bash
|
||||
set -e
|
||||
mkdir -p {{ backup_dir }}
|
||||
{{ backup_cmd }} > {{ backup_dir }}/backup_$(date +\%Y\%m\%d).volumes
|
||||
find {{ backup_dir }} -type f -name 'backup_*.volumes' -mtime +{{ keep_days }} -delete
|
||||
|
||||
SCRIPTS_DIR="$(dirname "$(realpath "${BASH_SOURCE[0]}")")"
|
||||
{{ backup_cmd }} > $SCRIPTS_DIR/backup.volumes
|
||||
|
||||
- name: Add a cron job to back up k3s volumes every day at 4 AM
|
||||
cron:
|
||||
name: "Backup K3S Volumes"
|
||||
minute: "0"
|
||||
hour: "4"
|
||||
user: root
|
||||
job: "{{ scripts_dir }}/backup.sh"
|
||||
|
||||
- name: Create the restore script
|
||||
copy:
|
||||
dest: "{{ scripts_dir }}/restore.sh"
|
||||
mode: '0755'
|
||||
content: |
|
||||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
PRIMARY_BACKUP_DIR="{{ backup_dir }}"
|
||||
FALLBACK_BACKUP_DIR="/home/pi/arcodange/backups/k3s_pvc"
|
||||
|
||||
# Check if fallback directory exists and has backups
|
||||
if [ -d "$FALLBACK_BACKUP_DIR" ] && ls "$FALLBACK_BACKUP_DIR"/*.volumes 1>/dev/null 2>&1; then
|
||||
BACKUP_DIR="$FALLBACK_BACKUP_DIR"
|
||||
echo "Using fallback backup directory: $BACKUP_DIR"
|
||||
elif [ -d "$PRIMARY_BACKUP_DIR" ] && ls "$PRIMARY_BACKUP_DIR"/*.volumes 1>/dev/null 2>&1; then
|
||||
BACKUP_DIR="$PRIMARY_BACKUP_DIR"
|
||||
else
|
||||
echo "No backup directory found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
FILE=$(ls -1t "$BACKUP_DIR"/backup_*.volumes | head -n 1)
|
||||
echo "No date provided, restoring latest dump: $FILE"
|
||||
else
|
||||
FILE="$BACKUP_DIR/backup_$1.volumes"
|
||||
if [ ! -f "$FILE" ]; then
|
||||
echo "File $FILE not found"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
kubectl apply -f "$FILE"
|
||||
|
||||
echo "K3S volumes restoration complete."
|
||||
echo "NOTE: file includes PVs, PVCs, and Longhorn Volume CRDs."
|
||||
echo "If Longhorn replica dirs are still orphaned after this restore,"
|
||||
echo "fall back to: ansible-playbook playbooks/recover/longhorn_data.yml"
|
||||
@@ -9,7 +9,7 @@
|
||||
postgres_user: "{{ postgres.dockercompose.services.postgres.environment.POSTGRES_USER }}"
|
||||
backup_dir: "{{ backup_root_dir }}/{{ backup_dirname }}"
|
||||
scripts_dir: "/home/pi/arcodange/docker_composes/postgres/scripts"
|
||||
keep_days: 15
|
||||
keep_days: 3
|
||||
|
||||
tasks:
|
||||
- name: Ensure the backup directory exists
|
||||
|
||||
ansible/arcodange/factory/playbooks/dns/dns.yml (new file, 2 lines)
@@ -0,0 +1,2 @@
|
||||
- name: pihole
|
||||
ansible.builtin.import_playbook: pihole.yml
|
||||
ansible/arcodange/factory/playbooks/dns/pihole.yml (new file, 11 lines)
@@ -0,0 +1,11 @@
|
||||
---
|
||||
- name: Install and configure Pi-hole on pi1
|
||||
hosts: raspberries:&local
|
||||
become: yes
|
||||
vars:
|
||||
|
||||
pihole_custom_dns:
|
||||
".arcodange.duckdns.org": "{{ hostvars['pi1'].preferred_ip }}"
|
||||
".arcodange.lab": "{{ hostvars['pi1'].preferred_ip }}"
|
||||
roles:
|
||||
- pihole
|
||||
@@ -0,0 +1,8 @@
|
||||
pihole_primary: pi1
|
||||
pihole_user_gravity: pihole_gravity
|
||||
pihole_gravity_home: /var/lib/pihole_gravity
|
||||
pihole_dns_domain: lab
|
||||
pihole_ports: '8081o,443os,[::]:8081o,[::]:443os' # web interface
|
||||
pihole_gravity_conf: /etc/gravity-sync/gravity-sync.conf # should not be changed
|
||||
pihole_custom_dns: {}
|
||||
pihole_upstream_dns: ["8.8.8.8", "1.1.1.1", "8.8.4.4"] # Explicit upstream DNS servers
|
||||
@@ -0,0 +1,5 @@
|
||||
---
|
||||
- name: Restart Pi-hole
|
||||
service:
|
||||
name: pihole-FTL
|
||||
state: restarted
|
||||
@@ -0,0 +1,75 @@
|
||||
---
|
||||
- name: Build DNS server list (exclude self)
|
||||
set_fact:
|
||||
pihole_dns_servers: >-
|
||||
{{
|
||||
groups['pihole']
|
||||
| reject('equalto', inventory_hostname)
|
||||
| map('extract', hostvars, 'preferred_ip')
|
||||
| list
|
||||
}}
|
||||
|
||||
# 1️⃣ Remove any old Pi-hole nameserver entries
|
||||
- name: Remove existing Pi-hole nameservers
|
||||
lineinfile:
|
||||
path: /etc/resolv.conf
|
||||
regexp: '^nameserver ({{ pihole_dns_servers | join("|") }})$'
|
||||
state: absent
|
||||
when: pihole_dns_servers | length > 0
|
||||
|
||||
# 2️⃣ Insert the Pi-hole servers right after the search line
|
||||
- name: Insert Pi-hole nameservers with priority
|
||||
lineinfile:
|
||||
path: /etc/resolv.conf
|
||||
insertafter: '^search'
|
||||
line: "nameserver {{ item }}"
|
||||
state: present
|
||||
loop: "{{ pihole_dns_servers }}"
|
||||
|
||||
|
||||
# 3️⃣ Define DNS priorities per interface
|
||||
- name: Set DNS priority mapping
|
||||
set_fact:
|
||||
interface_dns_priority:
|
||||
eth0: 50
|
||||
wlan0: 100
|
||||
|
||||
# 5️⃣ Configure Pi-hole DNS on all active interfaces
|
||||
|
||||
- name: Get active connections
|
||||
command: nmcli -t -f NAME,DEVICE connection show --active
|
||||
register: active_connections
|
||||
changed_when: false
|
||||
|
||||
- name: Get current DNS for each active interface
|
||||
vars:
|
||||
iface_name: "{{ item.split(':')[1] }}"
|
||||
conn_name: "{{ item.split(':')[0] }}"
|
||||
loop: "{{ active_connections.stdout_lines }}"
|
||||
when: item.split(':')[1] in interface_dns_priority
|
||||
command: nmcli -g IP4.DNS connection show "{{ conn_name }}"
|
||||
register: current_dns
|
||||
changed_when: false
|
||||
|
||||
- name: Apply Pi-hole DNS if different
|
||||
vars:
|
||||
iface_name: "{{ item.split(':')[1] }}"
|
||||
conn_name: "{{ item.split(':')[0] }}"
|
||||
loop: "{{ active_connections.stdout_lines }}"
|
||||
when: item.split(':')[1] in interface_dns_priority
|
||||
command: >
|
||||
nmcli connection modify "{{ conn_name }}"
|
||||
ipv4.dns "{{ pihole_dns_servers | join(' ') }}"
|
||||
ipv4.ignore-auto-dns yes
|
||||
ipv4.dns-priority "{{ interface_dns_priority[iface_name] }}"
|
||||
register: dns_changed
|
||||
changed_when: dns_changed is defined and dns_changed.stdout != ""
|
||||
|
||||
- name: Reactivate interface if DNS changed
|
||||
vars:
|
||||
iface_name: "{{ item.split(':')[1] }}"
|
||||
conn_name: "{{ item.split(':')[0] }}"
|
||||
loop: "{{ active_connections.stdout_lines }}"
|
||||
when: item.split(':')[1] in interface_dns_priority
|
||||
command: nmcli connection up "{{ conn_name }}"
|
||||
when: dns_changed is defined and dns_changed.changed
|
||||
@@ -0,0 +1,153 @@
|
||||
---
|
||||
# -------------------------------------------------------------------
|
||||
# Gravity Sync HA setup – final version with SSH key rotation
|
||||
# -------------------------------------------------------------------
|
||||
|
||||
- name: Determine primary Pi-hole
|
||||
set_fact:
|
||||
pihole_primary: "{{ groups['pihole'] | first }}"
|
||||
|
||||
- name: Set secondary Pi-hole hosts
|
||||
set_fact:
|
||||
pihole_secondaries: "{{ groups['pihole'] | difference([pihole_primary]) }}"
|
||||
|
||||
#################################################################
|
||||
# 1️⃣ Ensure gravity user exists on all Pi-hole nodes
|
||||
#################################################################
|
||||
|
||||
- name: Ensure gravity user exists
|
||||
user:
|
||||
name: "{{ pihole_user_gravity }}"
|
||||
home: "{{ pihole_gravity_home }}"
|
||||
shell: /bin/bash
|
||||
system: yes
|
||||
create_home: yes
|
||||
|
||||
- name: Create .ssh directory for gravity user
|
||||
file:
|
||||
path: "{{ pihole_gravity_home }}/.ssh"
|
||||
state: directory
|
||||
owner: "{{ pihole_user_gravity }}"
|
||||
group: "{{ pihole_user_gravity }}"
|
||||
mode: '0700'
|
||||
|
||||
#################################################################
|
||||
# 2️⃣ Generate SSH key for each host (rotation at each run)
|
||||
#################################################################
|
||||
|
||||
- name: Generate SSH keypair for gravity user
|
||||
openssh_keypair:
|
||||
path: "{{ pihole_gravity_home }}/.ssh/id_ed25519"
|
||||
type: ed25519
|
||||
owner: "{{ pihole_user_gravity }}"
|
||||
group: "{{ pihole_user_gravity }}"
|
||||
mode: '0600'
|
||||
register: gravity_key
|
||||
no_log: true

- name: Set gravity key in hostvars
set_fact:
gravity_pubkey: "{{ gravity_key.public_key }}"

- name: Clean authorized_keys for gravity user
file:
path: "{{ pihole_gravity_home }}/.ssh/authorized_keys"
state: absent

- name: Authorize SSH keys from other Pi-hole hosts
authorized_key:
user: "{{ pihole_user_gravity }}"
key: "{{ hostvars[item].gravity_pubkey }}"
state: present
loop: "{{ groups['pihole'] }}"
when: inventory_hostname != item

- name: Add all Pi-hole hosts to known_hosts
known_hosts:
path: "{{ pihole_gravity_home }}/.ssh/known_hosts"
name: "{{ item }}"
key: "{{ lookup('pipe', 'ssh-keyscan -t ed25519 ' ~ item) }}"
state: present
loop: "{{ groups['pihole'] }}"
when: inventory_hostname != item
become: yes
become_user: "{{ pihole_user_gravity }}"

#################################################################
# Install Gravity Sync binary if absent
#################################################################

- name: Check if Gravity Sync binary exists
stat:
path: /usr/local/bin/gravity-sync
register: gravity_sync_bin

- name: Download installer
get_url:
url: https://raw.githubusercontent.com/vmstan/gs-install/main/gs-install.sh
dest: /tmp/gs-install.sh
mode: '0755'
when: not gravity_sync_bin.stat.exists

- name: Give full sudo to gravity user
copy:
dest: /etc/sudoers.d/gravity-sync
mode: '0440'
content: "{{ pihole_user_gravity }} ALL=(ALL) NOPASSWD: ALL"
when: not gravity_sync_bin.stat.exists

- name: Execute Gravity Sync installer non-interactively
command: bash /tmp/gs-install.sh
become: yes
become_user: "{{ pihole_user_gravity }}"
environment:
HOME: "{{ pihole_gravity_home }}"
when: not gravity_sync_bin.stat.exists

#################################################################
# Generate gravity-sync.conf for non-interactive use
#################################################################

- name: Set remote host for gravity-sync.conf
set_fact:
remote_pihole: "{{ (inventory_hostname == pihole_primary) | ternary(pihole_secondaries[0] ~ '.home', pihole_primary ~ '.home') }}"

- name: Create gravity-sync.conf file
copy:
dest: "{{ pihole_gravity_conf }}"
owner: "{{ pihole_user_gravity }}"
group: "{{ pihole_user_gravity }}"
mode: '0600'
content: |
# REQUIRED SETTINGS
REMOTE_HOST='{{ remote_pihole }}'
REMOTE_USER='{{ pihole_user_gravity }}'

# CUSTOM VARIABLES
# LOCAL_PIHOLE_DIRECTORY='/etc/pihole'
# REMOTE_PIHOLE_DIRECTORY='/etc/pihole'
# LOCAL_FILE_OWNER='{{ pihole_user_gravity }}'
# REMOTE_FILE_OWNER='{{ pihole_user_gravity }}'

# LOCAL_DOCKER_CONTAINER='' # optional
# REMOTE_DOCKER_CONTAINER='' # optional

- name: Create symlink for gravity-sync.rsa
file:
src: "{{ pihole_gravity_home }}/.ssh/id_ed25519"
dest: /etc/gravity-sync/gravity-sync.rsa
owner: "{{ pihole_user_gravity }}"
group: "{{ pihole_user_gravity }}"
mode: '0600'
state: link

#################################################################
# Execute Gravity Sync with non-interactive config
#################################################################

- name: Run Gravity Sync script
command: bash /usr/local/bin/gravity-sync
become: yes
become_user: "{{ pihole_user_gravity }}"
environment:
HOME: "{{ pihole_gravity_home }}"
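These tasks assume a handful of group variables; a minimal sketch of what they might look like (variable names taken from the tasks above, values purely illustrative):

```yaml
# group_vars/pihole.yml (hypothetical values, adjust to your inventory)
pihole_primary: pihole1                        # inventory hostname of the primary Pi-hole
pihole_secondaries: [pihole2]                  # the other members of the 'pihole' group
pihole_user_gravity: gravity                   # dedicated user Gravity Sync runs as
pihole_gravity_home: /home/gravity
pihole_gravity_conf: /etc/gravity-sync/gravity-sync.conf
```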
@@ -0,0 +1,114 @@
#################################################################
# Bootstrap Pi-hole (manual installation expected)
#################################################################

- name: Show the manual Pi-hole installation command
debug:
msg: |
Please install Pi-hole manually on this host with the following command:
------------------------------------------------------------
curl -sSL https://install.pi-hole.net | sudo bash
------------------------------------------------------------
The installation will be verified automatically within the next 10 minutes.

#################################################################
# Verify the Pi-hole installation
#################################################################

- name: Wait for Pi-hole to be installed (FTL DB)
wait_for:
path: /etc/pihole/pihole-FTL.db
state: present
timeout: 600 # 10 minutes
register: pihole_config_ready

- name: Check that the pihole-FTL service is active
wait_for:
port: 53
state: started
timeout: 60
when: pihole_config_ready is succeeded

#################################################################
# Pi-hole configuration (shared HA settings)
#################################################################

- name: Change the Pi-hole listening port
replace:
path: /etc/pihole/pihole.toml
regexp: '^\s*port\s*=\s*".*"'
replace: ' port = "{{ pihole_ports }}"'
notify: Restart Pi-hole

- name: Allow Pi-hole to listen on all interfaces
replace:
path: /etc/pihole/pihole.toml
regexp: '^\s*listeningMode\s*=\s*".*"'
replace: ' listeningMode = "ALL"'
notify: Restart Pi-hole

- name: Enable loading of /etc/dnsmasq.d
lineinfile:
path: /etc/pihole/pihole.toml
regexp: '^\s*etc_dnsmasq_d\s*='
line: ' etc_dnsmasq_d = true'
state: present
notify: Restart Pi-hole

#################################################################
# Custom DNS (wildcards + local entries)
#################################################################

- name: Validate custom DNS IPs
assert:
that:
- ip is match('^([0-9]{1,3}\.){3}[0-9]{1,3}$')
fail_msg: "Invalid IP for {{ fqdn }}"
loop: "{{ pihole_custom_dns | dict2items }}"
loop_control:
label: "{{ item.key }}"
vars:
fqdn: "{{ item.key }}"
ip: "{{ item.value }}"

- name: Generate custom DNS rules (wildcards + FQDNs)
copy:
dest: /etc/dnsmasq.d/10-custom-rules.conf
owner: root
group: root
mode: '0644'
content: |
# Generated by Ansible – Pi-hole custom DNS rules
{% for fqdn, ip in pihole_custom_dns.items() %}
address=/{{ fqdn }}/{{ ip }}
{% endfor %}
when: pihole_custom_dns | length > 0
notify: Restart Pi-hole

- name: Create local DNS entries for the Raspberry Pis
copy:
dest: /etc/dnsmasq.d/20-rpis.conf
owner: root
group: root
mode: '0644'
content: |
# Generated by Ansible – Raspberry Pi local DNS
{% for host in groups['raspberries']
if hostvars[host].preferred_ip is defined %}
address=/{{ host }}.home/{{ hostvars[host].preferred_ip }}
{% endfor %}
notify: Restart Pi-hole

- name: Configure explicit upstream DNS servers for Pi-hole
copy:
dest: /etc/dnsmasq.d/99-upstream.conf
owner: root
group: root
mode: '0644'
content: |
# Generated by Ansible – Explicit upstream DNS servers
# Fixes issue where Pi-hole relies on DHCP-provided DNS which may be unavailable
{% for dns_server in pihole_upstream_dns %}
server={{ dns_server }}
{% endfor %}
notify: Restart Pi-hole
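A minimal sketch of the variables these configuration tasks consume (variable names from the tasks above, values purely illustrative):

```yaml
pihole_ports: "53"                  # written into the port = "..." line of pihole.toml
pihole_custom_dns:                  # FQDN or wildcard domain -> IP, rendered as address=/fqdn/ip
  example.home: 192.168.1.50
  app.example.home: 192.168.1.51
pihole_upstream_dns:                # explicit upstreams for 99-upstream.conf
  - 1.1.1.1
  - 9.9.9.9
```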
@@ -0,0 +1,11 @@
---
- name: Setup Pi-hole HA
include_tasks: ha_pihole_setup.yml
when: "'pihole' in group_names"

- name: Setup Gravity Sync
include_tasks: gravity_setup.yml
when: "'pihole' in group_names"

- name: Setup DNS client
include_tasks: client_setup.yml
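For context, a sketch of a play that could apply these task files (play and role names are hypothetical; main.yml above already gates the Pi-hole specific includes on membership in the 'pihole' group):

```yaml
# playbooks/pihole.yml (illustrative only)
- hosts: raspberries
  become: yes
  roles:
    - pihole   # hypothetical role wrapping the task files above (main.yml, ha_pihole_setup.yml, gravity_setup.yml, client_setup.yml)
```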
536
ansible/arcodange/factory/playbooks/recover/longhorn.yml
Normal file
@@ -0,0 +1,536 @@
---
- name: Recover Longhorn from Power Cut - CSI Driver Registration Loss
hosts: raspberries:&local
gather_facts: yes
become: yes

vars:
# Backup locations
primary_backup_dir: "/mnt/backups/k3s_pvc"
fallback_backup_dir: "/home/pi/arcodange/backups/k3s_pvc"
scripts_dir: "/opt/k3s_volumes"

# Longhorn configuration
longhorn_manifest_path: "/var/lib/rancher/k3s/server/manifests/longhorn-install.yaml"
longhorn_namespace: "longhorn-system"
longhorn_chart_name: "longhorn-install"
longhorn_chart_namespace: "kube-system"

# Data paths (DO NOT MODIFY - points to actual volume data)
longhorn_data_path: "/mnt/arcodange/longhorn"

tasks:
# ========================================================================
# PHASE 0: Pre-flight Checks
# ========================================================================

- name: Verify data directory exists on control plane
ansible.builtin.stat:
path: "{{ longhorn_data_path }}"
register: data_dir
when: inventory_hostname == 'pi1'
run_once: true

- name: FAIL if data directory missing
ansible.builtin.fail:
msg: "CRITICAL: Longhorn data directory {{ longhorn_data_path }} does not exist. Aborting recovery."
when: inventory_hostname == 'pi1' and not data_dir.stat.exists
run_once: true

- name: Check for fallback backups on pi1
ansible.builtin.shell: ls {{ fallback_backup_dir }}/backup_*.volumes 2>/dev/null
register: fallback_backup_check
changed_when: false
when: inventory_hostname == 'pi1'
run_once: true
ignore_errors: yes

- name: Check for primary backups on pi1
ansible.builtin.shell: ls {{ primary_backup_dir }}/backup_*.volumes 2>/dev/null
register: primary_backup_check
changed_when: false
when: inventory_hostname == 'pi1'
run_once: true
ignore_errors: yes

- name: Set backup fact
ansible.builtin.set_fact:
has_backups: "{{ (fallback_backup_check.rc == 0 and fallback_backup_check.stdout | trim != '') or (primary_backup_check.rc == 0 and primary_backup_check.stdout | trim != '') }}"
when: inventory_hostname == 'pi1'
run_once: true

- name: FAIL if no backups found
ansible.builtin.fail:
msg: "No backup files found in {{ primary_backup_dir }} or {{ fallback_backup_dir }}. Cannot proceed."
when: inventory_hostname == 'pi1' and not has_backups | bool
run_once: true

# ========================================================================
# PHASE 1: Diagnosis - Check Current State
# ========================================================================

- name: Gather Longhorn namespace status
block:
- name: Check if longhorn-system namespace exists
kubernetes.core.k8s_info:
kind: Namespace
name: "{{ longhorn_namespace }}"
register: longhorn_ns
ignore_errors: yes
run_once: true
delegate_to: localhost

- name: Check CSI driver registration
kubernetes.core.k8s_info:
kind: CSIDriver
name: driver.longhorn.io
register: csi_driver
ignore_errors: yes
run_once: true
delegate_to: localhost

- name: Check Longhorn manager pods
kubernetes.core.k8s_info:
kind: Pod
namespace: "{{ longhorn_namespace }}"
label_selectors:
- app=longhorn-manager
register: managers
ignore_errors: yes
run_once: true
delegate_to: localhost

- name: Set recovery_phase fact
ansible.builtin.set_fact:
recovery_phase: "none"
run_once: true
delegate_to: localhost

- name: Determine recovery phase needed
ansible.builtin.set_fact:
recovery_phase: >-
{% if csi_driver.failed %}
soft
{% elif managers.failed or managers.resources | default([]) | selectattr('status.phase', 'defined') | selectattr('status.phase', 'ne', 'Running') | list | length > 0 %}
hard
{% elif longhorn_ns.failed %}
none
{% else %}
none
{% endif %}
run_once: true
delegate_to: localhost

- name: Display recovery diagnosis
ansible.builtin.debug:
msg: "Diagnosis: recovery_phase={{ recovery_phase | default('none') }}. CSI Driver exists: {{ not csi_driver.failed | bool }}, Managers healthy: {{ managers.failed | ternary('unknown', managers.resources | default([]) | selectattr('status.phase', 'defined') | selectattr('status.phase', 'eq', 'Running') | list | length >= 3) | bool }}"
run_once: true
delegate_to: localhost

when: inventory_hostname == 'pi1'
run_once: true

# ========================================================================
# PHASE 2: Soft Recovery - Touch Manifest
# ========================================================================

- name: Execute soft recovery - touch Longhorn manifest
block:
- name: Touch longhorn-install.yaml manifest
ansible.builtin.file:
path: "{{ longhorn_manifest_path }}"
state: touch
register: manifest_touch
when: inventory_hostname == 'pi1'

- name: Wait for k3s to detect manifest change
ansible.builtin.pause:
minutes: 1
when: manifest_touch is changed

- name: Check if Longhorn pods are recreating
kubernetes.core.k8s_info:
kind: Pod
namespace: "{{ longhorn_namespace }}"
register: longhorn_pods
ignore_errors: yes
run_once: true
delegate_to: localhost

- name: Verify soft recovery success
ansible.builtin.set_fact:
soft_recovery_success: >-
{{ (longhorn_pods.resources | default([]) | selectattr('metadata.creationTimestamp', 'defined') | list | length) >= 10 }}
run_once: true
delegate_to: localhost

when: recovery_phase == 'soft' and inventory_hostname == 'pi1'
run_once: true

# ========================================================================
# PHASE 3: Hard Recovery - Delete Driver-Deployer
# ========================================================================

- name: Execute hard recovery - delete driver-deployer pods
block:
- name: Get driver-deployer pods
kubernetes.core.k8s_info:
kind: Pod
namespace: "{{ longhorn_namespace }}"
label_selectors:
- app=longhorn-driver-deployer
register: driver_deployer_pods
ignore_errors: yes
run_once: true
delegate_to: localhost

- name: Delete driver-deployer pods
kubernetes.core.k8s:
state: absent
kind: Pod
namespace: "{{ longhorn_namespace }}"
name: "{{ item.metadata.name }}"
force: yes
grace_period: 0
loop: "{{ driver_deployer_pods.resources | default([]) }}"
when: driver_deployer_pods.resources | default([]) | length > 0
run_once: true
delegate_to: localhost

- name: Wait for HelmChart to recreate driver-deployer
ansible.builtin.pause:
minutes: 2

- name: Check driver-deployer status
kubernetes.core.k8s_info:
kind: Pod
namespace: "{{ longhorn_namespace }}"
label_selectors:
- app=longhorn-driver-deployer
register: new_driver_deployer
ignore_errors: yes
run_once: true
delegate_to: localhost

when: (recovery_phase == 'hard' or (recovery_phase == 'soft' and not soft_recovery_success | default(false))) and inventory_hostname == 'pi1'
run_once: true

# ========================================================================
# PHASE 4: Nuclear Recovery - Full Reinstall
# ========================================================================

- name: Execute nuclear recovery - full Longhorn reinstall
block:
# Step 1: Delete HelmChart
- name: Delete Longhorn HelmChart
kubernetes.core.k8s:
state: absent
kind: HelmChart
namespace: "{{ longhorn_chart_namespace }}"
name: "{{ longhorn_chart_name }}"
force: yes
grace_period: 0
register: helmchart_deleted
ignore_errors: yes
run_once: true
delegate_to: localhost

- name: Wait for HelmChart to be fully removed
ansible.builtin.pause:
seconds: 30
when: helmchart_deleted is changed
run_once: true

# Step 2: Remove Longhorn manifest from filesystem
- name: Remove Longhorn manifest file
ansible.builtin.file:
path: "{{ longhorn_manifest_path }}"
state: absent
when: inventory_hostname == 'pi1'
register: manifest_removed

# Step 3: Remove finalizers from all Longhorn resources
- name: Get list of all Longhorn CRDs
kubernetes.core.k8s_info:
kind: CustomResourceDefinition
label_selectors:
- app=longhorn
register: longhorn_crds
ignore_errors: yes
run_once: true
delegate_to: localhost

- name: Get all Longhorn CR instances
kubernetes.core.k8s_info:
kind: "{{ item.spec.names.kind }}"
namespace: "{{ longhorn_namespace }}"
api_version: "{{ item.spec.group ~ '/' ~ item.spec.versions[0].name }}"
register: cr_instances
ignore_errors: yes
loop: "{{ longhorn_crds.resources | default([]) }}"
run_once: true
delegate_to: localhost

- name: Remove finalizers from all Longhorn CR instances
kubernetes.core.k8s_json_patch:
kind: "{{ item.0.spec.names.kind }}"
namespace: "{{ longhorn_namespace }}"
name: "{{ item.1.metadata.name }}"
api_version: "{{ item.0.spec.group ~ '/' ~ item.0.spec.versions[0].name }}"
patch:
- op: replace
path: /metadata/finalizers
value: []
loop: >-
{% set results = [] %}
{% for crd in longhorn_crds.resources | default([]) %}
{% for instance in hostvars['localhost']['cr_instances'].results | default([]) %}
{% if instance.crd == crd %}
{% set results = results.append([crd, instance.resources[0] if instance.resources else {}]) %}
{% endif %}
{% endfor %}
{% endfor %}
{{ results }}
when: cr_instances.results | default([]) | length > 0
run_once: true
delegate_to: localhost
ignore_errors: yes

# Step 4: Remove finalizers from PVCs
- name: Get all PVCs with longhorn storage class
kubernetes.core.k8s_info:
kind: PersistentVolumeClaim
register: all_pvcs
ignore_errors: yes
run_once: true
delegate_to: localhost

- name: Remove finalizers from PVCs
kubernetes.core.k8s_json_patch:
kind: PersistentVolumeClaim
namespace: "{{ item.metadata.namespace }}"
name: "{{ item.metadata.name }}"
patch:
- op: replace
path: /metadata/finalizers
value: []
loop: "{{ all_pvcs.resources | default([]) | selectattr('spec.storageClassName', 'defined') | selectattr('spec.storageClassName', 'match', 'longhorn.*') | list }}"
run_once: true
delegate_to: localhost
ignore_errors: yes

# Step 5: Remove namespace finalizers
- name: Remove finalizers from longhorn-system namespace
kubernetes.core.k8s_json_patch:
kind: Namespace
name: "{{ longhorn_namespace }}"
patch:
- op: replace
path: /spec/finalizers
value: []
run_once: true
delegate_to: localhost
ignore_errors: yes

- name: Delete longhorn-system namespace
kubernetes.core.k8s:
state: absent
kind: Namespace
name: "{{ longhorn_namespace }}"
force: yes
grace_period: 0
run_once: true
delegate_to: localhost
ignore_errors: yes

- name: Wait for namespace deletion
ansible.builtin.pause:
seconds: 15
run_once: true

# Step 6: Reinstall Longhorn via manifest
- name: Deploy Longhorn HelmChart manifest
ansible.builtin.copy:
dest: "{{ longhorn_manifest_path }}"
content: |
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
annotations:
helmcharts.cattle.io/managed-by: helm-controller
finalizers:
- wrangler.cattle.io/on-helm-chart-remove
name: longhorn-install
namespace: kube-system
spec:
version: v1.9.1
chart: longhorn
repo: https://charts.longhorn.io
failurePolicy: abort
targetNamespace: longhorn-system
createNamespace: true
valuesContent: |-
defaultSettings:
defaultDataPath: {{ longhorn_data_path }}
when: inventory_hostname == 'pi1'
register: manifest_deployed

- name: Trigger k3s reconcile by touching manifest
ansible.builtin.file:
path: "{{ longhorn_manifest_path }}"
state: touch
when: manifest_deployed is changed and inventory_hostname == 'pi1'

- name: Wait for Longhorn pods to be created
ansible.builtin.pause:
minutes: 3
when: manifest_deployed is changed
run_once: true

when: >-
(recovery_phase == 'hard' and not new_driver_deployer.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length > 0)
or (recovery_phase == 'soft' and not soft_recovery_success | default(false) and not new_driver_deployer.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length > 0)
or recovery_phase == 'none'
run_once: true

# ========================================================================
# PHASE 5: Restore from Backup
# ========================================================================

- name: Execute restore from backup
block:
- name: Determine backup directory to use
ansible.builtin.set_fact:
backup_dir_to_use: >-
{% if fallback_backup_dir and lookup('fileglob', fallback_backup_dir ~ '/backup_*.volumes') | length > 0 %}
{{ fallback_backup_dir }}
{% elif primary_backup_dir and lookup('fileglob', primary_backup_dir ~ '/backup_*.volumes') | length > 0 %}
{{ primary_backup_dir }}
{% else %}
""
{% endif %}
run_once: true
delegate_to: localhost

- name: FAIL if no backup directory found
ansible.builtin.fail:
msg: "No valid backup directory found with backup_*.volumes files"
when: backup_dir_to_use == ""
run_once: true

- name: Find latest backup file
ansible.builtin.set_fact:
latest_backup: >-
{% set files = lookup('fileglob', backup_dir_to_use ~ '/backup_*.volumes', wantlist=True) | sort(attribute='stat.mtime', reverse=True) %}
{% if files | length > 0 %}
{{ files[0].path }}
{% endif %}
run_once: true
delegate_to: localhost

- name: FAIL if no backup files found
ansible.builtin.fail:
msg: "No backup files found in {{ backup_dir_to_use }}"
when: latest_backup | default('') == ''
run_once: true

- name: Wait for Longhorn managers to be ready
kubernetes.core.k8s_info:
kind: Pod
namespace: "{{ longhorn_namespace }}"
label_selectors:
- app=longhorn-manager
register: managers_status
until: >-
{{ (managers_status.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length) >= 1 }}
retries: 30
delay: 10
run_once: true
delegate_to: localhost

- name: Apply PV/PVC backup
kubernetes.core.k8s:
state: present
src: "{{ latest_backup }}"
run_once: true
delegate_to: localhost

- name: Find Longhorn metadata backup
ansible.builtin.set_fact:
longhorn_backup: >-
{% set lh_files = lookup('fileglob', backup_dir_to_use ~ '/longhorn_metadata_*.yaml', wantlist=True) | sort(attribute='stat.mtime', reverse=True) %}
{% if lh_files | length > 0 %}
{{ lh_files[0].path }}
{% endif %}
run_once: true
delegate_to: localhost

- name: Apply Longhorn metadata backup (if exists)
kubernetes.core.k8s:
state: present
src: "{{ longhorn_backup | default(omit) }}"
namespace: "{{ longhorn_namespace }}"
when: longhorn_backup | default('') != ''
run_once: true
delegate_to: localhost

when: inventory_hostname == 'pi1'
run_once: true

# ========================================================================
# PHASE 6: Post-Recovery Verification
# ========================================================================

- name: Verify recovery success
block:
- name: Check CSI driver registration
kubernetes.core.k8s_info:
kind: CSIDriver
name: driver.longhorn.io
register: csi_final
until: csi_final.resources | length > 0
retries: 10
delay: 10
run_once: true
delegate_to: localhost

- name: Check Longhorn manager health
kubernetes.core.k8s_info:
kind: Pod
namespace: "{{ longhorn_namespace }}"
label_selectors:
- app=longhorn-manager
register: managers_final
until: >-
{{ (managers_final.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length) >= 3 }}
retries: 15
delay: 10
run_once: true
delegate_to: localhost

- name: Check CSI socket exists (on pi1)
ansible.builtin.stat:
path: /var/lib/kubelet/plugins/driver.longhorn.io/csi.sock
register: csi_socket
when: inventory_hostname == 'pi1'

- name: Verify volume data is still present
ansible.builtin.stat:
path: "{{ longhorn_data_path }}/replicas"
register: replicas_dir
when: inventory_hostname == 'pi1'

- name: Display recovery summary
ansible.builtin.debug:
msg: |
===== Longhorn Recovery Summary =====
CSI Driver Registered: {{ not csi_final.failed | bool | ternary('✓', '✗') }}
Managers Running: {{ (managers_final.resources | default([]) | selectattr('status.phase', 'eq', 'Running') | list | length) }}/3
CSI Socket Exists: {{ csi_socket.stat.exists | default(false) | bool | ternary('✓', '✗') }}
Volume Data Present: {{ replicas_dir.stat.exists | default(false) | bool | ternary('✓', '✗') }}
Backup Used: {{ latest_backup | default('none') }}
======================================
run_once: true

when: inventory_hostname == 'pi1'
run_once: true
914
ansible/arcodange/factory/playbooks/recover/longhorn_data.yml
Normal file
@@ -0,0 +1,914 @@
---
# Longhorn Block-Device Data Recovery Playbook
#
# PURPOSE:
# Recover application data directly from raw Longhorn replica files when Volume CRDs
# are missing (e.g. after a nuclear cleanup + reinstall). Bypasses k8s objects entirely
# and works at the block-device level.
#
# WHEN TO USE:
# - Longhorn has been fully reinstalled (Volume CRDs are gone)
# - Application PVCs are stuck Terminating / Lost
# - The raw replica .img files still exist on disk
# → See docs/runbooks/longhorn-block-device-recovery.md for the manual equivalent
#
# WHEN NOT TO USE:
# - Volume CRDs still exist → use playbooks/recover/longhorn.yml instead
# - All replica dirs were zeroed by Longhorn reconciliation (data is unrecoverable)
#
# USAGE:
# ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \
# -e @vars/recovery_volumes.yml
#
# VARS FILE FORMAT (vars/recovery_volumes.yml):
# longhorn_recovery_volumes:
# - pv_name: pvc-abc123 # Longhorn volume name (== PV name)
# pvc_name: myapp-data # PVC name in the namespace
# namespace: myapp # namespace where the PVC lives
# size_bytes: "134217728" # volume size in bytes (string)
# size_human: 128Mi # human-readable, used in PVC spec
# access_mode: ReadWriteOnce # ReadWriteOnce or ReadWriteMany
# workload_kind: Deployment # Deployment or StatefulSet
# workload_name: myapp # name of the workload to scale down/up
# source_node: pi3 # [OPTIONAL] node with untouched replica dir
# source_dir: pvc-abc123-998f49ff # [OPTIONAL] exact replica dir name
# verify_cmd: "" # optional: command to run inside pod to verify data after recovery
#
# source_node and source_dir are auto-discovered (largest dir >16K across all nodes)
# when not specified. Override manually only to force a specific replica dir.
#
# REQUIREMENTS:
# - python3 on all cluster nodes
# - kubectl configured on the Ansible controller (localhost)
# - longhorn-system namespace running and healthy before this playbook starts
# - kubernetes.core collection: ansible-galaxy collection install kubernetes.core
#
# TESTED SCENARIO:
# 2026-04-13 power cut — nuclear Longhorn reinstall — url-shortener SQLite recovery
# Proven working as of 2026-04-14.
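Following the format documented above, a concrete vars file might look like this (volume name, size, and workload are hypothetical; source_node and source_dir are omitted so Phase 0 auto-discovers the best replica):

```yaml
# vars/recovery_volumes.yml (illustrative only)
longhorn_recovery_volumes:
  - pv_name: pvc-abc123
    pvc_name: myapp-data
    namespace: myapp
    size_bytes: "134217728"
    size_human: 128Mi
    access_mode: ReadWriteOnce
    workload_kind: Deployment
    workload_name: myapp
    verify_cmd: "ls -l /data"      # optional check run inside the pod after recovery
```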
- name: Longhorn Block-Device Data Recovery
hosts: localhost
gather_facts: no

vars:
longhorn_data_path: /mnt/arcodange/longhorn
longhorn_namespace: longhorn-system
longhorn_nodes: [pi1, pi2, pi3]
merge_tool_local: "{{ playbook_dir }}/../../docs/incidents/2026-04-13-power-cut/tools/merge-longhorn-layers.py"
merge_tool_remote: /home/pi/merge-longhorn-layers.py
backup_base: /home/pi/arcodange/backups/longhorn-recovery
merged_base: /tmp/longhorn-recovery-merged
recovery_mount: /mnt/recovery-src
live_mount: /mnt/recovery-live
longhorn_recovery_volumes: [] # override with -e @vars/recovery_volumes.yml

tasks:

# =========================================================================
# PRE-FLIGHT
# =========================================================================

- name: "Pre-flight | Fail fast if no volumes defined"
ansible.builtin.fail:
msg: >
No recovery volumes defined. Pass -e @vars/recovery_volumes.yml with a
longhorn_recovery_volumes list. See playbook header for format.
when: longhorn_recovery_volumes | length == 0

- name: "Pre-flight | Verify merge tool exists locally"
ansible.builtin.stat:
path: "{{ merge_tool_local }}"
register: merge_tool_stat
delegate_to: localhost

- name: "Pre-flight | Fail if merge tool missing"
ansible.builtin.fail:
msg: "merge-longhorn-layers.py not found at {{ merge_tool_local }}"
when: not merge_tool_stat.stat.exists

- name: "Pre-flight | Check Longhorn is healthy"
kubernetes.core.k8s_info:
kind: Pod
namespace: "{{ longhorn_namespace }}"
label_selectors:
- app=longhorn-manager
register: lh_managers
delegate_to: localhost

- name: "Pre-flight | Fail if Longhorn managers are not running"
ansible.builtin.fail:
msg: >
Longhorn managers not running (found {{ lh_managers.resources | default([]) |
selectattr('status.phase', 'eq', 'Running') | list | length }} Running pods).
Ensure Longhorn is healthy before attempting data recovery.
when: >
(lh_managers.resources | default([]) |
selectattr('status.phase', 'eq', 'Running') | list | length) < 1

- name: "Pre-flight | Summary"
ansible.builtin.debug:
msg: >
Longhorn healthy ({{ lh_managers.resources |
selectattr('status.phase', 'eq', 'Running') | list | length }} managers running).
Recovering {{ longhorn_recovery_volumes | length }} volume(s):
{{ longhorn_recovery_volumes | map(attribute='pv_name') | list | join(', ') }}

# =========================================================================
# PHASE 0 — AUTO-DISCOVER BEST REPLICA DIR (when source_node/source_dir absent)
# =========================================================================

- name: "Phase 0 | Scan replica dirs on all nodes"
ansible.builtin.shell: |
result=""
for dir in {{ longhorn_data_path }}/replicas/{{ item.1.pv_name }}-*; do
[ -d "$dir" ] || continue
# Skip replicas that were being rebuilt — their data is incomplete
meta="$dir/volume.meta"
if [ -f "$meta" ]; then
rebuilding=$(python3 -c "import json; d=json.load(open('$meta')); print(d.get('Rebuilding', False))" 2>/dev/null)
[ "$rebuilding" = "True" ] && continue
fi
# Use actual disk usage (not apparent/sparse size) to rank replicas
size=$(du -sk "$dir" 2>/dev/null | cut -f1)
name=$(basename "$dir")
result="$result\n$size $name"
done
printf '%b' "$result" | grep -v '^$' || true
delegate_to: "{{ item.0 }}"
become: yes
loop: "{{ longhorn_nodes | product(longhorn_recovery_volumes) | list }}"
loop_control:
label: "{{ item.0 }}: {{ item.1.pv_name }}"
register: dir_scan_raw
changed_when: false
when: item.1.source_node | default('') == '' or item.1.source_dir | default('') == ''

- name: "Phase 0 | Pick best source (largest dir with data, >16K)"
ansible.builtin.set_fact:
_discovered_sources: "{{ _build | from_json }}"
vars:
_build: >-
{% set ns = namespace(result={}) %}
{% for res in dir_scan_raw.results | default([]) %}
{% if not res.skipped | default(false) and res.stdout | default('') != '' %}
{% set node = res.item.0 %}
{% set vol = res.item.1.pv_name %}
{% for line in res.stdout_lines %}
{% set parts = line.split() %}
{% if parts | length == 2 %}
{% set size = parts[0] | int %}
{% set dir = parts[1] %}
{% if size > 16384 and (vol not in ns.result or size > ns.result[vol].size) %}
{# size is in KB (from du -sk); 16384 KB = 16 MiB minimum real replica #}
{% set _ = ns.result.update({vol: {'node': node, 'dir': dir, 'size': size}}) %}
{% endif %}
{% endif %}
{% endfor %}
{% endif %}
{% endfor %}
{{ ns.result | to_json }}

- name: "Phase 0 | Show discovered sources"
ansible.builtin.debug:
msg: >-
{% for vol in longhorn_recovery_volumes %}
{{ vol.pv_name }}:
{% if vol.source_node | default('') != '' %}
source: MANUAL → {{ vol.source_node }}/{{ vol.source_dir }}
{% elif vol.pv_name in _discovered_sources %}
source: AUTO → {{ _discovered_sources[vol.pv_name].node }}/{{ _discovered_sources[vol.pv_name].dir }}
({{ (_discovered_sources[vol.pv_name].size / 1024) | round(0) | int }} MiB)
{% else %}
source: NOT FOUND — no dir >16K on any node for this volume
{% endif %}
{% endfor %}

- name: "Phase 0 | Fail if source not found for any volume"
ansible.builtin.fail:
msg: >
No replica dir with data found for {{ item.pv_name }} on any node
({{ longhorn_nodes | join(', ') }}). Check that the replica files survived.
loop: "{{ longhorn_recovery_volumes }}"
loop_control:
label: "{{ item.pv_name }}"
when: >
item.source_node | default('') == '' and
item.source_dir | default('') == '' and
item.pv_name not in _discovered_sources

- name: "Phase 0 | Initialize merged volume list"
ansible.builtin.set_fact:
_merged_volumes: []

- name: "Phase 0 | Append each volume with resolved source"
ansible.builtin.set_fact:
_merged_volumes: "{{ _merged_volumes + [item | combine(_source)] }}"
vars:
_manual: "{{ item.source_node | default('') != '' and item.source_dir | default('') != '' }}"
_source: "{{ _manual | bool | ternary(
{'source_node': item.source_node, 'source_dir': item.source_dir},
{'source_node': _discovered_sources[item.pv_name].node,
'source_dir': _discovered_sources[item.pv_name].dir}) }}"
loop: "{{ longhorn_recovery_volumes }}"
loop_control:
label: "{{ item.pv_name }}"

- name: "Phase 0 | Apply resolved volume list"
ansible.builtin.set_fact:
_volumes: "{{ _merged_volumes }}"

# =========================================================================
# PHASE 1 — UPLOAD MERGE TOOL AND BACK UP REPLICA DIRS
# =========================================================================

- name: "Phase 1 | Upload merge tool to source nodes"
ansible.builtin.command: >
scp -o StrictHostKeyChecking=no
{{ merge_tool_local }}
pi@{{ item.source_node }}.home:{{ merge_tool_remote }}
delegate_to: localhost
become: no
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }} → {{ item.source_node }}"
changed_when: true

- name: "Phase 1 | Create backup directory on source node"
ansible.builtin.file:
path: "{{ backup_base }}/{{ item.pvc_name }}"
state: directory
mode: "0755"
delegate_to: "{{ item.source_node }}"
become: yes
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pvc_name }}"

- name: "Phase 1 | Check if backup already exists (skip if re-running)"
ansible.builtin.stat:
path: "{{ backup_base }}/{{ item.pvc_name }}/{{ item.source_dir }}/volume.meta"
register: backup_exists
delegate_to: "{{ item.source_node }}"
become: yes
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pvc_name }}"

- name: "Phase 1 | Back up untouched replica dir (safe copy before any operation)"
ansible.builtin.shell: >
cp -a {{ longhorn_data_path }}/replicas/{{ item.item.source_dir }}
{{ backup_base }}/{{ item.item.pvc_name }}/
delegate_to: "{{ item.item.source_node }}"
become: yes
loop: "{{ backup_exists.results }}"
loop_control:
label: "{{ item.item.pvc_name }}"
when: not item.stat.exists
changed_when: true

- name: "Phase 1 | Verify backup contains volume.meta"
ansible.builtin.stat:
path: "{{ backup_base }}/{{ item.pvc_name }}/{{ item.source_dir }}/volume.meta"
register: backup_meta
delegate_to: "{{ item.source_node }}"
become: yes
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pvc_name }}"

- name: "Phase 1 | Fail if backup is incomplete"
ansible.builtin.fail:
msg: >
Backup for {{ item.item.pvc_name }} is missing volume.meta — the source dir
{{ item.item.source_dir }} may not exist or backup copy failed.
loop: "{{ backup_meta.results }}"
loop_control:
label: "{{ item.item.pvc_name }}"
when: not item.stat.exists

# =========================================================================
# PHASE 2 — RECONSTRUCT FILESYSTEMS FROM REPLICA LAYERS
# =========================================================================

- name: "Phase 2 | Create merged output directory"
ansible.builtin.file:
path: "{{ merged_base }}"
state: directory
mode: "0755"
delegate_to: "{{ item.source_node }}"
become: yes
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pvc_name }}"

- name: "Phase 2 | Check if merged image already exists"
ansible.builtin.stat:
path: "{{ merged_base }}/{{ item.pvc_name }}.img"
register: merged_exists
delegate_to: "{{ item.source_node }}"
become: yes
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pvc_name }}"

- name: "Phase 2 | Merge snapshot + head layers into single image"
ansible.builtin.command: >
python3 {{ merge_tool_remote }}
{{ backup_base }}/{{ item.item.pvc_name }}/{{ item.item.source_dir }}
{{ merged_base }}/{{ item.item.pvc_name }}.img
delegate_to: "{{ item.item.source_node }}"
become: yes
loop: "{{ merged_exists.results }}"
loop_control:
label: "{{ item.item.pvc_name }}"
when: not item.stat.exists
changed_when: true
register: merge_output

- name: "Phase 2 | Show merge output"
ansible.builtin.debug:
msg: "{{ item.stdout_lines | default([]) }}"
loop: "{{ merge_output.results | default([]) }}"
loop_control:
label: "{{ item.item.item.pvc_name | default('') }}"
when: item.stdout_lines is defined

- name: "Phase 2 | Test mount merged image to verify filesystem"
ansible.builtin.shell: |
mkdir -p {{ recovery_mount }}-{{ item.pvc_name }}
mount -o loop,ro,noload {{ merged_base }}/{{ item.pvc_name }}.img {{ recovery_mount }}-{{ item.pvc_name }}
ls {{ recovery_mount }}-{{ item.pvc_name }}/
umount {{ recovery_mount }}-{{ item.pvc_name }}
delegate_to: "{{ item.source_node }}"
become: yes
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pvc_name }}"
register: mount_test
changed_when: false

- name: "Phase 2 | Show filesystem contents"
ansible.builtin.debug:
msg: "{{ item.item.pvc_name }}: {{ item.stdout_lines }}"
loop: "{{ mount_test.results }}"
loop_control:
label: "{{ item.item.pvc_name }}"

# =========================================================================
# PHASE 3 — CREATE LONGHORN VOLUME CRDs
# =========================================================================

# Scale down StatefulSets BEFORE removing PVC finalizers.
# StatefulSet controllers auto-recreate PVCs as soon as they are deleted; if we
# remove finalizers while the StatefulSet is still running, the controller
# immediately provisions a new empty PVC (bound to a fresh volume), making the
# PVC spec immutable by the time Phase 8 tries to pin it to our recovered PV.
# Deployments are less urgent here but scaled early for consistency.

- name: "Phase 3 | Pre-scale down Deployments (before PVC finalizer removal)"
kubernetes.core.k8s_scale:
kind: Deployment
name: "{{ item.workload_name }}"
namespace: "{{ item.namespace }}"
replicas: 0
wait: yes
wait_timeout: 60
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.workload_name }}"
when: item.workload_kind == 'Deployment' and item.workload_name != ''
ignore_errors: yes

- name: "Phase 3 | Pre-scale down StatefulSets (before PVC finalizer removal)"
kubernetes.core.k8s_scale:
kind: StatefulSet
name: "{{ item.workload_name }}"
namespace: "{{ item.namespace }}"
replicas: 0
wait: yes
wait_timeout: 60
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.workload_name }}"
when: item.workload_kind == 'StatefulSet' and item.workload_name != ''
ignore_errors: yes

# Clear any stuck Terminating PVs/PVCs BEFORE creating Volume CRDs.
# If old Terminating PVCs still exist when we create the Volume CRD, Longhorn
# associates them and deletes the Volume CRD when the PVC finishes terminating.

- name: "Phase 3 | Check PVC state before touching finalizers"
ansible.builtin.shell: >
kubectl get pvc {{ item.pvc_name }} -n {{ item.namespace }}
-o jsonpath='{.metadata.deletionTimestamp}' 2>/dev/null || true
register: pvc_deletion_ts
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.pvc_name }}"
changed_when: false

- name: "Phase 3 | Remove finalizers from stuck PV (if Terminating)"
ansible.builtin.shell: >
kubectl patch pv {{ item.pv_name }} --type=merge
-p '{"metadata":{"finalizers":null}}' 2>/dev/null || true
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"
changed_when: false

- name: "Phase 3 | Remove finalizers from stuck PVC (if Terminating)"
ansible.builtin.shell: >
kubectl patch pvc {{ item.pvc_name }} -n {{ item.namespace }}
--type=merge -p '{"metadata":{"finalizers":null}}' 2>/dev/null || true
delegate_to: localhost
loop: "{{ pvc_deletion_ts.results }}"
loop_control:
label: "{{ item.item.namespace }}/{{ item.item.pvc_name }}"
when: item.stdout != ''
changed_when: false

- name: "Phase 3 | Wait for stuck PVCs to fully delete before creating Volume CRDs"
kubernetes.core.k8s_info:
kind: PersistentVolumeClaim
name: "{{ item.item.pvc_name }}"
namespace: "{{ item.item.namespace }}"
register: pvc_pre_check
until: pvc_pre_check.resources | default([]) | length == 0
retries: 12
delay: 5
delegate_to: localhost
loop: "{{ pvc_deletion_ts.results }}"
loop_control:
label: "{{ item.item.namespace }}/{{ item.item.pvc_name }}"
when: item.stdout != ''

- name: "Phase 3 | Check if Longhorn Volume CRD already exists"
kubernetes.core.k8s_info:
kind: Volume
api_version: longhorn.io/v1beta2
namespace: "{{ longhorn_namespace }}"
name: "{{ item.pv_name }}"
register: volume_crd_check
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"

- name: "Phase 3 | Create Longhorn Volume CRD"
kubernetes.core.k8s:
state: present
definition:
apiVersion: longhorn.io/v1beta2
kind: Volume
metadata:
name: "{{ item.item.pv_name }}"
namespace: "{{ longhorn_namespace }}"
spec:
accessMode: "{{ item.item.access_mode | lower | replace('readwriteonce', 'rwo') | replace('readwritemany', 'rwx') }}"
dataEngine: v1
frontend: blockdev
numberOfReplicas: 3
size: "{{ item.item.size_bytes }}"
delegate_to: localhost
loop: "{{ volume_crd_check.results }}"
loop_control:
label: "{{ item.item.pv_name }}"
when: item.resources | default([]) | length == 0

- name: "Phase 3 | Wait for Longhorn replicas to appear (stopped state)"
kubernetes.core.k8s_info:
kind: Replica
api_version: longhorn.io/v1beta2
namespace: "{{ longhorn_namespace }}"
label_selectors:
- "longhornvolume={{ item.pv_name }}"
register: replicas_check
until: replicas_check.resources | default([]) | length >= 1
retries: 24
delay: 5
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"

- name: "Phase 3 | Wait for Volume status to be populated (webhook cache)"
kubernetes.core.k8s_info:
kind: Volume
api_version: longhorn.io/v1beta2
namespace: "{{ longhorn_namespace }}"
name: "{{ item.pv_name }}"
register: vol_ready
until: >
(vol_ready.resources | default([]) | first | default({}) ).status.state | default('') != ''
retries: 24
delay: 5
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"

# =========================================================================
# PHASE 4 — SCALE DOWN WORKLOADS
# =========================================================================

- name: "Phase 4 | Scale down Deployments"
kubernetes.core.k8s_scale:
kind: Deployment
name: "{{ item.workload_name }}"
namespace: "{{ item.namespace }}"
replicas: 0
wait: yes
wait_timeout: 60
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.workload_name }}"
when: item.workload_kind == 'Deployment' and item.workload_name != ''
ignore_errors: yes

- name: "Phase 4 | Scale down StatefulSets"
kubernetes.core.k8s_scale:
kind: StatefulSet
name: "{{ item.workload_name }}"
namespace: "{{ item.namespace }}"
replicas: 0
wait: yes
wait_timeout: 60
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.workload_name }}"
when: item.workload_kind == 'StatefulSet' and item.workload_name != ''
ignore_errors: yes

- name: "Phase 4 | Delete any lingering Error-state pods that may hold volume attachments"
ansible.builtin.shell: |
kubectl get pods -n {{ item.namespace }} \
--field-selector='status.phase=Failed' -o name | xargs -r kubectl delete -n {{ item.namespace }}
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}"
changed_when: false
ignore_errors: yes

# =========================================================================
# PHASE 5 — ATTACH VOLUME VIA MAINTENANCE TICKET
# =========================================================================

- name: "Phase 5 | Create VolumeAttachment maintenance ticket"
kubernetes.core.k8s:
state: present
definition:
apiVersion: longhorn.io/v1beta2
kind: VolumeAttachment
metadata:
name: "{{ item.pv_name }}"
namespace: "{{ longhorn_namespace }}"
spec:
attachmentTickets:
recovery:
generation: 0
id: recovery
nodeID: "{{ item.source_node }}"
parameters:
disableFrontend: "false"
type: longhorn-api
volume: "{{ item.pv_name }}"
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }} → {{ item.source_node }}"

- name: "Phase 5 | Wait for volume to reach attached state"
kubernetes.core.k8s_info:
kind: Volume
api_version: longhorn.io/v1beta2
namespace: "{{ longhorn_namespace }}"
name: "{{ item.pv_name }}"
register: vol_state
until: >
(vol_state.resources | default([]) | first | default({}) ).status.state | default('') == 'attached'
retries: 24
delay: 5
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"

- name: "Phase 5 | Verify block device exists on target node"
ansible.builtin.stat:
path: "/dev/longhorn/{{ item.pv_name }}"
register: blockdev_check
delegate_to: "{{ item.source_node }}"
become: yes
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"

- name: "Phase 5 | Fail if block device not present"
ansible.builtin.fail:
msg: >
Block device /dev/longhorn/{{ item.item.pv_name }} not found on
{{ item.item.source_node }} after volume attached — check Longhorn logs.
loop: "{{ blockdev_check.results }}"
loop_control:
label: "{{ item.item.pv_name }}"
when: not item.stat.exists

# =========================================================================
# PHASE 6 — INJECT DATA INTO LIVE BLOCK DEVICE
# =========================================================================

- name: "Phase 6 | Inject data via block device (mount, rsync, umount)"
ansible.builtin.shell: |
LIVE="{{ live_mount }}-{{ item.pvc_name }}"
SRC="{{ recovery_mount }}-{{ item.pvc_name }}"
BLOCKDEV="/dev/longhorn/{{ item.pv_name }}"
MERGED="{{ merged_base }}/{{ item.pvc_name }}.img"

# Always unmount on exit (success or partial failure)
cleanup() {
mountpoint -q "$SRC" && umount "$SRC" || true
mountpoint -q "$LIVE" && umount "$LIVE" || true
}
trap cleanup EXIT

mkdir -p "$LIVE" "$SRC"

# Format if not already formatted (idempotent — safe on re-run)
if ! blkid "$BLOCKDEV" | grep -q 'TYPE='; then
mkfs.ext4 -F "$BLOCKDEV"
fi

# Mount live block device if not already mounted
if ! mountpoint -q "$LIVE"; then
mount "$BLOCKDEV" "$LIVE"
fi

# Mount merged recovery image read-only if not already mounted
if ! mountpoint -q "$SRC"; then
mount -o loop,ro,noload "$MERGED" "$SRC"
fi

# Sync data — exclude lost+found
# --ignore-errors: continue past unreadable files (e.g. corrupted parts from power cut)
# rc=23 (partial transfer) is treated as success — bulk data transferred
rsync -av --ignore-errors --exclude='lost+found' "$SRC/" "$LIVE/" || \
{ RC=$?; [ $RC -eq 23 ] && echo "WARNING: rsync rc=23 (some files unreadable in source — expected for power-cut partitions)" || exit $RC; }
delegate_to: "{{ item.source_node }}"
become: yes
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pvc_name }}"
register: inject_output
changed_when: true

- name: "Phase 6 | Show rsync output"
ansible.builtin.debug:
msg: "{{ item.stdout_lines | default([]) }}"
loop: "{{ inject_output.results }}"
loop_control:
label: "{{ item.item.pvc_name }}"

# =========================================================================
# PHASE 7 — DETACH VOLUME
# =========================================================================

- name: "Phase 7 | Remove recovery attachment ticket"
kubernetes.core.k8s_json_patch:
kind: VolumeAttachment
api_version: longhorn.io/v1beta2
namespace: "{{ longhorn_namespace }}"
name: "{{ item.pv_name }}"
patch:
- op: remove
path: /spec/attachmentTickets/recovery
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"
ignore_errors: yes

- name: "Phase 7 | Wait for recovery ticket to be gone"
kubernetes.core.k8s_info:
kind: VolumeAttachment
api_version: longhorn.io/v1beta2
namespace: "{{ longhorn_namespace }}"
name: "{{ item.pv_name }}"
register: va_state
until: >
(va_state.resources | default([]) | first | default({}) ).spec.attachmentTickets.recovery is not defined
retries: 24
delay: 5
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"

# =========================================================================
# PHASE 8 — RESTORE PV AND PVC
# =========================================================================

- name: "Phase 8 | Create PersistentVolume (Retain, no claimRef)"
kubernetes.core.k8s:
state: present
definition:
apiVersion: v1
kind: PersistentVolume
metadata:
name: "{{ item.pv_name }}"
annotations:
pv.kubernetes.io/provisioned-by: driver.longhorn.io
spec:
accessModes:
- "{{ item.access_mode }}"
capacity:
storage: "{{ item.size_human }}"
csi:
driver: driver.longhorn.io
fsType: ext4
volumeHandle: "{{ item.pv_name }}"
volumeAttributes:
dataEngine: v1
dataLocality: disabled
disableRevisionCounter: "true"
numberOfReplicas: "3"
staleReplicaTimeout: "30"
persistentVolumeReclaimPolicy: Retain
storageClassName: longhorn
volumeMode: Filesystem
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"

- name: "Phase 8 | Wait for PV to be Available or Bound"
kubernetes.core.k8s_info:
kind: PersistentVolume
name: "{{ item.pv_name }}"
register: pv_state
until: >
(pv_state.resources | default([]) | first | default({}) ).status.phase | default('')
in ['Available', 'Bound']
retries: 12
delay: 5
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.pv_name }}"

- name: "Phase 8 | Check if PVC already bound to correct PV"
ansible.builtin.shell: >
kubectl get pvc {{ item.pvc_name }} -n {{ item.namespace }}
-o jsonpath='{.spec.volumeName}' 2>/dev/null || true
register: pvc_current_volume
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.pvc_name }}"
changed_when: false

- name: "Phase 8 | Create PersistentVolumeClaim pinned to PV"
kubernetes.core.k8s:
state: present
definition:
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: "{{ item.item.pvc_name }}"
namespace: "{{ item.item.namespace }}"
spec:
accessModes:
- "{{ item.item.access_mode }}"
resources:
requests:
storage: "{{ item.item.size_human }}"
storageClassName: longhorn
volumeMode: Filesystem
volumeName: "{{ item.item.pv_name }}"
delegate_to: localhost
loop: "{{ pvc_current_volume.results }}"
loop_control:
label: "{{ item.item.namespace }}/{{ item.item.pvc_name }}"
when: item.stdout != item.item.pv_name

- name: "Phase 8 | Wait for PVC to be Bound"
kubernetes.core.k8s_info:
kind: PersistentVolumeClaim
namespace: "{{ item.namespace }}"
name: "{{ item.pvc_name }}"
register: pvc_state
until: >
(pvc_state.resources | default([]) | first | default({}) ).status.phase | default('') == 'Bound'
retries: 12
delay: 5
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.pvc_name }}"

# =========================================================================
# PHASE 9 — SCALE UP AND VERIFY
# =========================================================================

- name: "Phase 9 | Scale up Deployments"
kubernetes.core.k8s_scale:
kind: Deployment
name: "{{ item.workload_name }}"
namespace: "{{ item.namespace }}"
replicas: 1
wait: yes
wait_timeout: 120
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.workload_name }}"
when: item.workload_kind == 'Deployment' and item.workload_name != ''
ignore_errors: yes

- name: "Phase 9 | Scale up StatefulSets"
kubernetes.core.k8s_scale:
kind: StatefulSet
name: "{{ item.workload_name }}"
namespace: "{{ item.namespace }}"
replicas: 1
wait: yes
wait_timeout: 120
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.workload_name }}"
when: item.workload_kind == 'StatefulSet' and item.workload_name != ''
ignore_errors: yes

- name: "Phase 9 | Wait for workload to report ready replicas"
kubernetes.core.k8s_info:
kind: "{{ item.workload_kind }}"
name: "{{ item.workload_name }}"
namespace: "{{ item.namespace }}"
register: workload_state
until: >
(workload_state.resources | default([]) | first | default({}) ).status.readyReplicas | default(0) | int >= 1
retries: 24
delay: 5
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.workload_name }}"
when: item.workload_name != ''
ignore_errors: yes

- name: "Phase 9 | Run optional verification command in pod"
ansible.builtin.shell: >
kubectl exec -n {{ item.namespace }}
$(kubectl get pod -n {{ item.namespace }}
-l statefulset.kubernetes.io/pod-name={{ item.workload_name }}-0
--no-headers -o custom-columns=':metadata.name' 2>/dev/null ||
kubectl get pod -n {{ item.namespace }} {{ item.workload_name }}-0
--no-headers -o custom-columns=':metadata.name' 2>/dev/null)
-- sh -c '{{ item.verify_cmd }}'
delegate_to: localhost
loop: "{{ _volumes }}"
loop_control:
label: "{{ item.namespace }}/{{ item.workload_name }}"
when: item.verify_cmd | default('') != ''
register: verify_output
changed_when: false
ignore_errors: yes

- name: "Phase 9 | Show verification output"
ansible.builtin.debug:
msg: "{{ item.stdout_lines | default([]) }}"
loop: "{{ verify_output.results | default([]) }}"
loop_control:
label: "{{ item.item.pvc_name | default('') }}"
when: item.stdout_lines is defined and item.item.verify_cmd | default('') != ''

# =========================================================================
# RECOVERY SUMMARY
# =========================================================================

- name: "Summary | Recovery complete"
ansible.builtin.debug:
msg: |
╔══════════════════════════════════════════════════════╗
║ Longhorn Block-Device Recovery Complete ║
╚══════════════════════════════════════════════════════╝
Volumes recovered:
{% for v in _volumes %}
• {{ v.pvc_name }} ({{ v.namespace }}) ← {{ v.source_node }}:{{ v.source_dir }}
{% endfor %}

Backups retained at: {{ backup_base }}/<pvc-name>/
Merged images at: {{ merged_base }}/<pvc-name>.img

Next steps:
1. Verify application data through the app UI / API
|
||||
2. Repeat for remaining volumes (update vars file)
|
||||
3. Run a fresh k8s_pvc backup once all volumes are healthy
|
||||
@@ -0,0 +1,84 @@
|
||||
---
|
||||
# Example vars file for playbooks/recover/longhorn_data.yml
|
||||
#
|
||||
# Usage:
|
||||
# ansible-playbook -i inventory/hosts.yml playbooks/recover/longhorn_data.yml \
|
||||
# -e @playbooks/recover/longhorn_data_vars.example.yml
|
||||
#
|
||||
# HOW TO FILL THIS IN:
|
||||
#
|
||||
# 1. Find untouched replica dirs across all nodes:
|
||||
# for node in pi1 pi2 pi3; do
|
||||
# echo "=== $node ==="
|
||||
# ssh $node "sudo du -sh /mnt/arcodange/longhorn/replicas/pvc-<VOLUME>-* 2>/dev/null"
|
||||
# done
|
||||
# Pick the dir with the largest size (>16K) and oldest timestamps (from before the incident).
|
||||
#
|
||||
# 2. Get pv_name and pvc_name from PV/PVC backup:
|
||||
# cat /home/pi/arcodange/backups/k3s_pvc/backup_*.volumes | grep -A5 "kind: PersistentVolume"
|
||||
#
|
||||
# 3. Get size_bytes from Longhorn volume spec or from:
|
||||
# cat /mnt/arcodange/longhorn/replicas/<source_dir>/volume.meta
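#    (volume.meta is a small JSON file; the byte count is usually exposed there as a Size field, but verify against your replica dir)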
|
||||
#
|
||||
# 4. source_node = the node where the untouched dir lives
|
||||
# source_dir = the exact directory name (e.g. pvc-abc123-998f49ff)
|
||||
#
|
||||
# Fields:
|
||||
# pv_name — Longhorn volume name, equals the PV name (pvc-<uuid>) [REQUIRED]
|
||||
# pvc_name — PVC name in the namespace [REQUIRED]
|
||||
# namespace — namespace where the PVC lives [REQUIRED]
|
||||
# size_bytes — volume capacity in bytes as a string (from volume spec) [REQUIRED]
|
||||
# size_human — human-readable size for PVC spec (e.g. 128Mi, 8Gi) [REQUIRED]
|
||||
# access_mode — ReadWriteOnce or ReadWriteMany [REQUIRED]
|
||||
# workload_kind — Deployment or StatefulSet [REQUIRED]
|
||||
# workload_name — name of the workload to scale down/up [REQUIRED]
|
||||
# source_node — node holding the untouched replica dir (pi1/pi2/pi3) [OPTIONAL — auto-discovered]
|
||||
# source_dir — exact replica dir name on source_node [OPTIONAL — auto-discovered]
|
||||
# verify_cmd — shell command to run inside pod to confirm data after restore [OPTIONAL]
|
||||
#
|
||||
# source_node and source_dir are auto-discovered by Phase 0 (largest dir >16K across all
|
||||
# nodes). Override them manually only if you want to force a specific replica dir.
|
||||
|
||||
longhorn_recovery_volumes:
|
||||
|
||||
# --- url-shortener (example, already recovered 2026-04-14) ---
|
||||
- pv_name: pvc-cdd434d1-c8b4-4a75-acde-2978ec9febd4
|
||||
pvc_name: url-shortener-data
|
||||
namespace: url-shortener
|
||||
size_bytes: "134217728"
|
||||
size_human: 128Mi
|
||||
access_mode: ReadWriteOnce
|
||||
workload_kind: Deployment
|
||||
workload_name: url-shortener
|
||||
source_node: pi3
|
||||
source_dir: pvc-cdd434d1-c8b4-4a75-acde-2978ec9febd4-998f49ff
|
||||
verify_cmd: "sqlite3 /data/urls.db 'SELECT COUNT(*) FROM urls;'"
|
||||
|
||||
# --- traefik (example, already recovered 2026-04-14) ---
|
||||
# - pv_name: pvc-<traefik-uuid>
|
||||
# pvc_name: traefik-data
|
||||
# namespace: traefik
|
||||
# size_bytes: "134217728"
|
||||
# size_human: 128Mi
|
||||
# access_mode: ReadWriteOnce
|
||||
# workload_kind: Deployment
|
||||
# workload_name: traefik
|
||||
# source_node: pi3
|
||||
# source_dir: pvc-<traefik-uuid>-<hex>
|
||||
# verify_cmd: ""
|
||||
|
||||
# --- vault (uncomment and fill for recovery) ---
|
||||
# - pv_name: pvc-<vault-uuid>
|
||||
# pvc_name: vault-data
|
||||
# namespace: vault
|
||||
# size_bytes: "1073741824"
|
||||
# size_human: 1Gi
|
||||
# access_mode: ReadWriteOnce
|
||||
# workload_kind: StatefulSet
|
||||
# workload_name: vault
|
||||
# source_node: pi2
|
||||
# source_dir: pvc-<vault-uuid>-<hex>
|
||||
# verify_cmd: ""
|
||||
|
||||
# Add more volumes here following the same pattern.
|
||||
# Process one at a time first to validate, then batch.
|
||||
@@ -0,0 +1,17 @@
|
||||
---
|
||||
# Recovery vars for Clickhouse
|
||||
# Source: pi3, dir pvc-1251909b-...-1163420b (2.6G — largest, snapshot verified non-zero)
|
||||
# Generated: 2026-04-14
|
||||
|
||||
longhorn_recovery_volumes:
|
||||
- pv_name: pvc-1251909b-3cef-40c6-881c-3bb6e929a596
|
||||
pvc_name: clickhouse-storage-clickhouse-0
|
||||
namespace: tools
|
||||
size_bytes: "17179869184" # 16Gi
|
||||
size_human: 16Gi
|
||||
access_mode: ReadWriteOnce
|
||||
workload_kind: StatefulSet
|
||||
workload_name: clickhouse
|
||||
source_node: pi3
|
||||
source_dir: pvc-1251909b-3cef-40c6-881c-3bb6e929a596-1163420b
|
||||
verify_cmd: "clickhouse-client --query 'SHOW DATABASES'"
|
||||
@@ -0,0 +1,38 @@
|
||||
---
|
||||
# Recovery vars for erp and hashicorp-vault volumes
|
||||
# source_node/source_dir omitted — auto-discovered by Phase 0
|
||||
|
||||
longhorn_recovery_volumes:
|
||||
|
||||
- pv_name: pvc-7971918e-e47f-4739-a976-965ea2d770b4
|
||||
pvc_name: erp
|
||||
namespace: erp
|
||||
size_bytes: "53687091200"
|
||||
size_human: 50Gi
|
||||
access_mode: ReadWriteMany
|
||||
workload_kind: Deployment
|
||||
workload_name: "" # intentionally blank — ERP needs Vault unsealed first; scale up manually
|
||||
verify_cmd: ""
|
||||
|
||||
# hashicorp-vault StatefulSet has two PVCs (audit + data).
|
||||
# workload_name is set only on the last entry so the StatefulSet is scaled up
|
||||
# once after both volumes are ready, not between them.
|
||||
- pv_name: pvc-6d2ea1c7-9327-4992-a02c-93ae604eda70
|
||||
pvc_name: audit-hashicorp-vault-0
|
||||
namespace: tools
|
||||
size_bytes: "10737418240"
|
||||
size_human: 10Gi
|
||||
access_mode: ReadWriteOnce
|
||||
workload_kind: StatefulSet
|
||||
workload_name: ""
|
||||
verify_cmd: ""
|
||||
|
||||
- pv_name: pvc-ca5567d3-a682-4cee-8ff1-2b8e23260635
|
||||
pvc_name: data-hashicorp-vault-0
|
||||
namespace: tools
|
||||
size_bytes: "10737418240"
|
||||
size_human: 10Gi
|
||||
access_mode: ReadWriteOnce
|
||||
workload_kind: StatefulSet
|
||||
workload_name: hashicorp-vault
|
||||
verify_cmd: ""
|
||||
@@ -0,0 +1,47 @@
|
||||
---
|
||||
# Recovery vars for remaining volumes (prometheus, alertmanager, redis, backups-rwx)
|
||||
# source_node and source_dir intentionally omitted — auto-discovered by Phase 0
|
||||
|
||||
longhorn_recovery_volumes:
|
||||
|
||||
- pv_name: pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9
|
||||
pvc_name: prometheus-server
|
||||
namespace: tools
|
||||
size_bytes: "8589934592"
|
||||
size_human: 8Gi
|
||||
access_mode: ReadWriteOnce
|
||||
workload_kind: Deployment
|
||||
workload_name: prometheus-server
|
||||
source_node: pi2
|
||||
source_dir: pvc-88e18c7f-2cfd-45e3-be5b-78c31ab829e9-910583f6
|
||||
verify_cmd: ""
|
||||
|
||||
- pv_name: pvc-aed7f2c4-1948-487a-8d10-d8a1372289b4
|
||||
pvc_name: storage-prometheus-alertmanager-0
|
||||
namespace: tools
|
||||
size_bytes: "2147483648"
|
||||
size_human: 2Gi
|
||||
access_mode: ReadWriteOnce
|
||||
workload_kind: StatefulSet
|
||||
workload_name: prometheus-alertmanager
|
||||
verify_cmd: ""
|
||||
|
||||
- pv_name: pvc-d1d5482b-81c8-4d7c-a528-7a57ef47a5ce
|
||||
pvc_name: redis-storage-redis-0
|
||||
namespace: tools
|
||||
size_bytes: "1073741824"
|
||||
size_human: 1Gi
|
||||
access_mode: ReadWriteOnce
|
||||
workload_kind: StatefulSet
|
||||
workload_name: redis
|
||||
verify_cmd: "redis-cli ping"
|
||||
|
||||
- pv_name: pvc-efda1d2f-1db8-46dd-9a97-3d11f1807ffa
|
||||
pvc_name: backups-rwx
|
||||
namespace: longhorn-system
|
||||
size_bytes: "53687091200"
|
||||
size_human: 50Gi
|
||||
access_mode: ReadWriteMany
|
||||
workload_kind: Deployment
|
||||
workload_name: ""
|
||||
verify_cmd: ""
|
||||
@@ -60,7 +60,7 @@
|
||||
name: "{{ recurring_job }}"
|
||||
groups: []
|
||||
task: backup
|
||||
cron: "0 5 1,10,20 * *"
|
||||
cron: "0 5 */2 * *"
|
||||
retain: 2
|
||||
concurrency: 1
|
||||
|
||||
|
||||
@@ -126,14 +126,14 @@
|
||||
debug:
|
||||
msg: >-
|
||||
Clé SSH ajoutée avec succès.
|
||||
Visitez https://gitea.arcodange.duckdns.org/user/settings/keys?verify_ssh={{ add_ssh_key_result.json.fingerprint }}
|
||||
Visitez https://gitea.arcodange.lab/user/settings/keys?verify_ssh={{ add_ssh_key_result.json.fingerprint }}
|
||||
pour vérifier la signature de vos commits avec cette clé.
|
||||
|
||||
- set_fact:
|
||||
gitea_org_name: arcodange-org
|
||||
gitea_org_full_name: Arcodange
|
||||
gitea_org_description: '🏹💻🪽'
|
||||
gitea_org_website: https://www.arcodange.duckdns.org
|
||||
gitea_org_website: https://www.arcodange.fr
|
||||
gitea_org_location: Paris
|
||||
gitea_org_avatar_img_path: '{{ inventory_dir }}/../img/arcodange-org.jpeg'
|
||||
|
||||
|
||||
@@ -55,3 +55,123 @@
|
||||
loop_var: database__pg_instruction
|
||||
loop:
|
||||
"{{ ['postgres', 'gitea'] | product(pg_instructions) }}"
|
||||
|
||||
# ---
|
||||
|
||||
- name: Change table owner (CronJob with dynamic roles and auto DB naming)
|
||||
hosts: localhost
|
||||
connection: local
|
||||
gather_facts: false
|
||||
|
||||
collections:
|
||||
- kubernetes.core
|
||||
|
||||
vars:
|
||||
|
||||
namespace: kube-system
|
||||
cronjob_name: pg-fix-table-ownership
|
||||
|
||||
pg_conf: >-
|
||||
{{ hostvars[groups.postgres[0]].postgres.dockercompose.services.postgres.environment }}
|
||||
postgres_admin_credentials:
|
||||
username: '{{ pg_conf.POSTGRES_USER }}'
|
||||
password: '{{ pg_conf.POSTGRES_PASSWORD }}'
|
||||
pg_host: "{{ hostvars[groups.postgres[0]]['preferred_ip'] }}"
|
||||
|
||||
tasks:
|
||||
|
||||
- name: Create Kubernetes Secret for PostgreSQL admin credentials
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: postgres-admin-credentials
|
||||
namespace: "{{ namespace }}"
|
||||
type: Opaque
|
||||
data:
|
||||
username: "{{ postgres_admin_credentials.username | b64encode }}"
|
||||
password: "{{ postgres_admin_credentials.password | b64encode }}"
|
||||
|
||||
- name: Create cronjob to change table owners (dynamic roles, auto DB)
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
metadata:
|
||||
name: "{{ cronjob_name }}"
|
||||
namespace: "{{ namespace }}"
|
||||
spec:
|
||||
schedule: "0 3 * * *" # Exécution quotidienne à 3h du matin
|
||||
successfulJobsHistoryLimit: 1
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
containers:
|
||||
- name: psql
|
||||
image: postgres:16.3
|
||||
envFrom:
|
||||
- secretRef:
|
||||
name: postgres-admin-credentials
|
||||
env:
|
||||
- name: PGPASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: postgres-admin-credentials
|
||||
key: password
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
args:
|
||||
- |
|
||||
set -eu
|
||||
|
||||
# Récupérer dynamiquement les rôles PostgreSQL
|
||||
echo "Fetching roles from PostgreSQL..."
|
||||
ROLES=$(psql \
|
||||
-h {{ pg_host }} \
|
||||
-U $username \
|
||||
-d postgres \
|
||||
-t -A \
|
||||
-c "SELECT rolname FROM pg_roles WHERE rolname LIKE '%_role';")
|
||||
|
||||
echo "Roles found: $ROLES"
|
||||
|
||||
# Pour chaque rôle, changer le propriétaire des tables dans sa base associée
|
||||
for role in $ROLES; do
|
||||
# Déduire le nom de la base en retirant "_role"
|
||||
DB_NAME="${role%_role}"
|
||||
echo "Database for $role: $DB_NAME"
|
||||
|
||||
# Vérifier si la base existe
|
||||
if psql -h {{ pg_host }} -U $username -d postgres -t -A -c "SELECT 1 FROM pg_database WHERE datname = '$DB_NAME';" | grep -q 1; then
|
||||
echo "Changing owner to $role for all tables in $DB_NAME..."
|
||||
psql \
|
||||
-h {{ pg_host }} \
|
||||
-U $username \
|
||||
-d "$DB_NAME" \
|
||||
-c "
|
||||
DO \$\$
|
||||
DECLARE
|
||||
r RECORD;
|
||||
BEGIN
|
||||
FOR r IN
|
||||
SELECT tablename
|
||||
FROM pg_tables
|
||||
WHERE schemaname = 'public'
|
||||
LOOP
|
||||
EXECUTE format('ALTER TABLE public.%I OWNER TO %I', r.tablename, '$role');
|
||||
END LOOP;
|
||||
END \$\$;
|
||||
"
|
||||
echo "Owner changed for $role in $DB_NAME"
|
||||
else
|
||||
echo "Database $DB_NAME does not exist, skipping..."
|
||||
fi
|
||||
done
|
||||
|
||||
@@ -3,7 +3,7 @@ APP_NAME = Arcodange repositories
|
||||
[server]
|
||||
DOMAIN = localhost
|
||||
HTTP_PORT = 3000
|
||||
ROOT_URL = https://gitea.arcodange.duckdns.org/
|
||||
ROOT_URL = https://gitea.arcodange.lab/
|
||||
DISABLE_SSH = false
|
||||
SSH_PORT = 22
|
||||
START_SSH_SERVER = true
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
step_ca_primary: pi1
|
||||
step_ca_user: step
|
||||
step_ca_home: /home/step
|
||||
step_ca_dir: /home/step/.step
|
||||
|
||||
step_ca_name: "Arcodange Lab CA"
|
||||
step_ca_fqdn: ssl-ca.arcodange.lab
|
||||
step_ca_listen_address: ":8443"
|
||||
|
||||
step_ca_password: "{{ vault_step_ca_password }}"
|
||||
step_ca_force_reinit: false
|
||||
|
||||
step_ca_provisioner_name: cert-manager
|
||||
step_ca_provisioner_type: JWK
|
||||
step_ca_jwk_dir: "{{ step_ca_dir }}/provisioners"
|
||||
step_ca_jwk_key: "{{ step_ca_jwk_dir }}/cert-manager.jwk"
|
||||
step_ca_jwk_password: "{{ vault_step_ca_jwk_password }}"
|
||||
step_ca_jwk_password_file: "{{ step_ca_dir }}/secrets/cert-manager.jwk.pass"
|
||||
|
||||
step_ca_url: "https://{{ step_ca_fqdn }}{{ step_ca_listen_address }}"
|
||||
step_ca_root: "{{ step_ca_dir }}/certs/root_ca.crt"
|
||||
@@ -0,0 +1,4 @@
|
||||
- name: restart step-ca
|
||||
systemd:
|
||||
name: step-ca
|
||||
state: restarted
|
||||
@@ -0,0 +1,67 @@
|
||||
# can be called with -e step_ca_force_reinit=true
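# e.g. ansible-playbook -i inventory/hosts.yml playbooks/ssl/step-ca.yml -e step_ca_force_reinit=true  (invocation path assumed, adjust to your layout)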
|
||||
|
||||
# 1️⃣ Vérifier si le CA est déjà initialisé
|
||||
- name: Check if CA already initialized
|
||||
stat:
|
||||
path: "{{ step_ca_dir }}/config/ca.json"
|
||||
register: step_ca_initialized
|
||||
when: inventory_hostname == step_ca_primary
|
||||
|
||||
# 2️⃣ Arrêter step-ca si reinit forcée
|
||||
- name: Stop step-ca service (reinit)
|
||||
systemd:
|
||||
name: step-ca
|
||||
state: stopped
|
||||
when:
|
||||
- inventory_hostname == step_ca_primary
|
||||
- step_ca_force_reinit | bool
|
||||
ignore_errors: true
|
||||
|
||||
# 3️⃣ Wipe complet du CA si reinit forcée
|
||||
- name: Wipe existing step-ca data
|
||||
file:
|
||||
path: "{{ step_ca_dir }}"
|
||||
state: absent
|
||||
when:
|
||||
- inventory_hostname == step_ca_primary
|
||||
- step_ca_force_reinit | bool
|
||||
|
||||
# 4️⃣ Recréer le dossier CA proprement
|
||||
- name: Recreate step-ca directory
|
||||
file:
|
||||
path: "{{ step_ca_dir }}"
|
||||
state: directory
|
||||
owner: "{{ step_ca_user }}"
|
||||
group: "{{ step_ca_user }}"
|
||||
mode: "0700"
|
||||
when:
|
||||
- inventory_hostname == step_ca_primary
|
||||
- step_ca_force_reinit | bool
|
||||
|
||||
# 5️⃣ Installer le fichier de mot de passe
|
||||
- name: Install step-ca password file
|
||||
copy:
|
||||
dest: "{{ step_ca_home }}/.step-pass"
|
||||
content: "{{ step_ca_password }}"
|
||||
owner: "{{ step_ca_user }}"
|
||||
group: "{{ step_ca_user }}"
|
||||
mode: "0600"
|
||||
when: inventory_hostname == step_ca_primary
|
||||
|
||||
# 6️⃣ Initialiser step-ca (non interactif)
|
||||
- name: Initialize step-ca
|
||||
become: true
|
||||
become_user: "{{ step_ca_user }}"
|
||||
command: >
|
||||
step ca init
|
||||
--name "{{ step_ca_name }}"
|
||||
--dns "{{ step_ca_fqdn }}"
|
||||
--address "{{ step_ca_listen_address }}"
|
||||
--provisioner admin
|
||||
--password-file {{ step_ca_home }}/.step-pass
|
||||
args:
|
||||
creates: "{{ step_ca_dir }}/config/ca.json"
|
||||
when:
|
||||
- inventory_hostname == step_ca_primary
|
||||
- step_ca_force_reinit | bool or not step_ca_initialized.stat.exists
|
||||
notify: restart step-ca
|
||||
@@ -0,0 +1,51 @@
|
||||
- name: Install base packages
|
||||
apt:
|
||||
name:
|
||||
- curl
|
||||
- vim
|
||||
- gpg
|
||||
- ca-certificates
|
||||
state: present
|
||||
update_cache: yes
|
||||
install_recommends: no
|
||||
|
||||
- name: Download Smallstep apt signing key
|
||||
get_url:
|
||||
url: https://packages.smallstep.com/keys/apt/repo-signing-key.gpg
|
||||
dest: /etc/apt/trusted.gpg.d/smallstep.asc
|
||||
mode: "0644"
|
||||
|
||||
- name: Add Smallstep apt repository
|
||||
copy:
|
||||
dest: /etc/apt/sources.list.d/smallstep.list
|
||||
mode: "0644"
|
||||
content: |
|
||||
deb [signed-by=/etc/apt/trusted.gpg.d/smallstep.asc] https://packages.smallstep.com/stable/debian debs main
|
||||
|
||||
- name: Update apt cache
|
||||
apt:
|
||||
update_cache: yes
|
||||
|
||||
- name: Install step-cli and step-ca
|
||||
apt:
|
||||
name:
|
||||
- step-cli
|
||||
- step-ca
|
||||
state: present
|
||||
|
||||
|
||||
|
||||
- name: Create step user
|
||||
user:
|
||||
name: "{{ step_ca_user }}"
|
||||
system: true
|
||||
shell: /usr/sbin/nologin
|
||||
home: "{{ step_ca_home }}"
|
||||
|
||||
- name: Secure step directory
|
||||
file:
|
||||
path: "{{ step_ca_dir }}"
|
||||
owner: "{{ step_ca_user }}"
|
||||
group: "{{ step_ca_user }}"
|
||||
mode: "0700"
|
||||
recurse: yes
|
||||
@@ -0,0 +1,5 @@
|
||||
- import_tasks: install.yml
|
||||
- import_tasks: init.yml
|
||||
- import_tasks: sync.yml
|
||||
- import_tasks: systemd.yml
|
||||
- import_tasks: provisioners.yml
|
||||
@@ -0,0 +1,73 @@
|
||||
- name: Ensure provisioner directory exists
|
||||
file:
|
||||
path: "{{ step_ca_jwk_dir }}"
|
||||
state: directory
|
||||
owner: "{{ step_ca_user }}"
|
||||
group: "{{ step_ca_user }}"
|
||||
mode: "0700"
|
||||
when: inventory_hostname == step_ca_primary
|
||||
|
||||
- name: Check if JWK provisioner already exists
|
||||
command: >
|
||||
step ca provisioner list
|
||||
--ca-url {{ step_ca_url }}
|
||||
--root {{ step_ca_root }}
|
||||
register: step_ca_provisioners
|
||||
changed_when: false
|
||||
become: true
|
||||
become_user: "{{ step_ca_user }}"
|
||||
when: inventory_hostname == step_ca_primary
|
||||
|
||||
- name: Check if cert-manager provisioner exists
|
||||
set_fact:
|
||||
step_ca_provisioner_exists: >-
|
||||
{{
|
||||
(step_ca_provisioners.stdout | from_json
|
||||
| selectattr('name', 'equalto', step_ca_provisioner_name)
|
||||
| list
|
||||
| length) > 0
|
||||
}}
|
||||
when: inventory_hostname == step_ca_primary
|
||||
|
||||
- name: Install JWK password file
|
||||
copy:
|
||||
dest: "{{ step_ca_jwk_password_file }}"
|
||||
content: "{{ step_ca_jwk_password }}"
|
||||
owner: "{{ step_ca_user }}"
|
||||
group: "{{ step_ca_user }}"
|
||||
mode: "0400"
|
||||
when: inventory_hostname == step_ca_primary
|
||||
|
||||
- name: Generate JWK key for cert-manager
|
||||
command: >
|
||||
step crypto jwk create
|
||||
{{ step_ca_jwk_key }}.pub
|
||||
{{ step_ca_jwk_key }}
|
||||
--password-file "{{ step_ca_jwk_password_file }}"
|
||||
args:
|
||||
creates: "{{ step_ca_jwk_key }}"
|
||||
become: true
|
||||
become_user: "{{ step_ca_user }}"
|
||||
when: inventory_hostname == step_ca_primary
|
||||
|
||||
- name: Add JWK provisioner to step-ca
|
||||
command: >
|
||||
step ca provisioner add {{ step_ca_provisioner_name }}
|
||||
--type JWK
|
||||
--public-key {{ step_ca_jwk_key }}.pub
|
||||
--private-key {{ step_ca_jwk_key }}
|
||||
become: true
|
||||
become_user: "{{ step_ca_user }}"
|
||||
when:
|
||||
- inventory_hostname == step_ca_primary
|
||||
- step_ca_provisioner_name not in step_ca_provisioners.stdout
|
||||
notify: restart step-ca
|
||||
|
||||
- name: Secure JWK keys permissions
|
||||
file:
|
||||
path: "{{ step_ca_jwk_dir }}"
|
||||
owner: "{{ step_ca_user }}"
|
||||
group: "{{ step_ca_user }}"
|
||||
mode: "0700"
|
||||
recurse: yes
|
||||
when: inventory_hostname == step_ca_primary
|
||||
@@ -0,0 +1,121 @@
|
||||
# 1️⃣ Lock sur le primaire (évite double sync concurrente)
|
||||
- name: Create sync lock on primary
|
||||
file:
|
||||
path: "{{ step_ca_dir }}/.sync.lock"
|
||||
state: touch
|
||||
owner: "{{ step_ca_user }}"
|
||||
group: "{{ step_ca_user }}"
|
||||
mode: "0600"
|
||||
delegate_to: "{{ step_ca_primary }}"
|
||||
run_once: true
|
||||
|
||||
# 2️⃣ Calcul du checksum du CA sur le primaire
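# (tar --sort=name / --mtime / --owner flags make the archive byte-for-byte reproducible, so the sha256 only changes when file contents change)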
|
||||
- name: Compute deterministic checksum of CA directory on primary
|
||||
shell: |
|
||||
set -o pipefail
|
||||
tar --sort=name \
|
||||
--mtime='UTC 1970-01-01' \
|
||||
--owner=0 --group=0 --numeric-owner \
|
||||
-cf - {{ step_ca_dir }} \
|
||||
| sha256sum | awk '{print $1}'
|
||||
args:
|
||||
executable: /bin/bash
|
||||
register: step_ca_primary_checksum
|
||||
changed_when: false
|
||||
delegate_to: "{{ step_ca_primary }}"
|
||||
run_once: true
|
||||
|
||||
# 3️⃣ Charger le checksum précédent (s'il existe)
|
||||
- name: Load previous checksum (controller)
|
||||
slurp:
|
||||
src: /tmp/step-ca-sync/.checksum
|
||||
register: step_ca_previous_checksum
|
||||
failed_when: false
|
||||
changed_when: false
|
||||
run_once: true
|
||||
become: false
|
||||
delegate_to: localhost
|
||||
|
||||
# 4️⃣ Décider si une synchronisation est nécessaire
|
||||
- name: Decide if sync is required
|
||||
set_fact:
|
||||
step_ca_sync_required: >-
|
||||
{{
|
||||
step_ca_previous_checksum.content | default('') | b64decode
|
||||
!= step_ca_primary_checksum.stdout
|
||||
}}
|
||||
run_once: true
|
||||
|
||||
- name: Ensure temporary sync directory exists on controller
|
||||
file:
|
||||
path: /tmp/step-ca-sync
|
||||
state: directory
|
||||
mode: "0700"
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
run_once: true
|
||||
|
||||
# 5️⃣ Pull depuis le primaire vers le contrôleur
|
||||
- name: Fetch CA data from primary to controller
|
||||
synchronize:
|
||||
rsync_path: "sudo -u {{ step_ca_user }} rsync"
|
||||
src: "{{ step_ca_dir }}/"
|
||||
dest: "/tmp/step-ca-sync/"
|
||||
mode: pull
|
||||
recursive: yes
|
||||
delete: no
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
when: step_ca_sync_required
|
||||
run_once: true
|
||||
|
||||
# 6️⃣ Sauvegarder le nouveau checksum (controller)
|
||||
- name: Save new checksum on controller
|
||||
copy:
|
||||
dest: /tmp/step-ca-sync/.checksum
|
||||
content: "{{ step_ca_primary_checksum.stdout }}"
|
||||
mode: "0600"
|
||||
when: step_ca_sync_required
|
||||
run_once: true
|
||||
become: false
|
||||
delegate_to: localhost
|
||||
|
||||
# 7️⃣ Push vers les standby
|
||||
- name: Push CA data to standby nodes
|
||||
synchronize:
|
||||
rsync_path: "sudo -u {{ step_ca_user }} rsync"
|
||||
src: "/tmp/step-ca-sync/"
|
||||
dest: "{{ step_ca_dir }}/"
|
||||
mode: push
|
||||
recursive: yes
|
||||
delete: no
|
||||
when:
|
||||
- inventory_hostname != step_ca_primary
|
||||
- step_ca_sync_required
|
||||
|
||||
- name: Wipe temporary CA sync directory on controller
|
||||
file:
|
||||
path: /tmp/step-ca-sync
|
||||
state: absent
|
||||
delegate_to: localhost
|
||||
run_once: true
|
||||
become: false
|
||||
when: step_ca_sync_required
|
||||
|
||||
# 8️⃣ Forcer permissions correctes (sécurité)
|
||||
- name: Fix step directory permissions
|
||||
file:
|
||||
path: "{{ step_ca_dir }}"
|
||||
owner: "{{ step_ca_user }}"
|
||||
group: "{{ step_ca_user }}"
|
||||
mode: "0700"
|
||||
recurse: yes
|
||||
notify: restart step-ca
|
||||
|
||||
# 9️⃣ Retirer le lock sur le primaire
|
||||
- name: Remove sync lock on primary
|
||||
file:
|
||||
path: "{{ step_ca_dir }}/.sync.lock"
|
||||
state: absent
|
||||
delegate_to: "{{ step_ca_primary }}"
|
||||
run_once: true
|
||||
@@ -0,0 +1,23 @@
|
||||
- name: Install step-ca systemd service
|
||||
template:
|
||||
src: step-ca.service.j2
|
||||
dest: /etc/systemd/system/step-ca.service
|
||||
mode: "0644"
|
||||
|
||||
- name: Reload systemd
|
||||
systemd:
|
||||
daemon_reload: yes
|
||||
|
||||
- name: Enable step-ca on primary
|
||||
systemd:
|
||||
name: step-ca
|
||||
enabled: yes
|
||||
state: started
|
||||
when: inventory_hostname == step_ca_primary
|
||||
|
||||
- name: Disable step-ca on standby nodes
|
||||
systemd:
|
||||
name: step-ca
|
||||
enabled: no
|
||||
state: stopped
|
||||
when: inventory_hostname != step_ca_primary
|
||||
@@ -0,0 +1,15 @@
|
||||
[Unit]
|
||||
Description=Smallstep CA
|
||||
After=network.target
|
||||
|
||||
[Service]
|
||||
User={{ step_ca_user }}
|
||||
Group={{ step_ca_user }}
|
||||
ExecStart=/usr/bin/step-ca \
|
||||
--password-file {{ step_ca_home }}/.step-pass \
|
||||
{{ step_ca_dir }}/config/ca.json
|
||||
Restart=always
|
||||
RestartSec=5
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
98
ansible/arcodange/factory/playbooks/ssl/ssl.yml
Normal file
@@ -0,0 +1,98 @@
|
||||
- name: step-ca
|
||||
ansible.builtin.import_playbook: step-ca.yml
|
||||
|
||||
- name: Fetch Step-CA root certificate
|
||||
hosts: localhost
|
||||
gather_facts: false
|
||||
vars:
|
||||
step_ca_primary: pi1
|
||||
step_ca_user: step
|
||||
step_ca_root: "/home/step/.step/certs/root_ca.crt"
|
||||
tmp_dir: "/tmp/step-ca-cert-manager"
|
||||
tasks:
|
||||
- name: Ensure local temp directory exists
|
||||
file:
|
||||
path: "{{ tmp_dir }}"
|
||||
state: directory
|
||||
mode: "0700"
|
||||
|
||||
- name: Fetch root CA from step_ca_primary
|
||||
fetch:
|
||||
src: "{{ step_ca_root }}"
|
||||
dest: "{{ tmp_dir }}/root_ca.crt"
|
||||
flat: true
|
||||
delegate_to: "{{ step_ca_primary }}"
|
||||
become: true
|
||||
become_user: "{{ step_ca_user }}"
|
||||
run_once: true
|
||||
|
||||
- name: Préparer le répertoire de build
|
||||
file:
|
||||
path: /tmp/gitea-runner-image
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Copier le root CA dans le contexte Docker
|
||||
copy:
|
||||
src: "{{ tmp_dir }}/root_ca.crt"
|
||||
dest: /tmp/gitea-runner-image/root_ca.crt
|
||||
mode: '0644'
|
||||
|
||||
- name: Créer le Dockerfile pour l'image runner avec CA custom
|
||||
copy:
|
||||
dest: /tmp/gitea-runner-image/Dockerfile
|
||||
mode: '0644'
|
||||
content: |
|
||||
FROM gitea/runner-images:ubuntu-latest
|
||||
|
||||
COPY root_ca.crt /usr/local/share/ca-certificates/root_ca.crt
|
||||
RUN update-ca-certificates
|
||||
|
||||
- name: Builder l'image runner avec le CA
|
||||
community.docker.docker_image:
|
||||
name: gitea.arcodange.lab/arcodange-org/runner-images
|
||||
tag: ubuntu-latest-ca
|
||||
source: build
|
||||
build:
|
||||
path: /tmp/gitea-runner-image
|
||||
push: true
|
||||
|
||||
# - /etc/ssl/certs:/etc/ssl/certs:ro
|
||||
|
||||
# - name: Distribute Step-CA root certificate
|
||||
# hosts: all
|
||||
# gather_facts: true
|
||||
# become: true
|
||||
# vars:
|
||||
# root_ca_source: "/tmp/step-ca-cert-manager/root_ca.crt"
|
||||
# root_ca_filename: "arcodange-root.crt"
|
||||
|
||||
# tasks:
|
||||
# - name: Ensure root CA file is copied to correct location
|
||||
# copy:
|
||||
# src: "{{ root_ca_source }}"
|
||||
# dest: "{{ ca_dest_path }}"
|
||||
# owner: root
|
||||
# group: root
|
||||
# mode: '0644'
|
||||
# vars:
|
||||
# ca_dest_path: >-
|
||||
# {% if ansible_facts['os_family'] == 'Debian' %}
|
||||
# /usr/local/share/ca-certificates/{{ root_ca_filename }}
|
||||
# {% elif ansible_facts['os_family'] in ['RedHat', 'Fedora'] %}
|
||||
# /etc/pki/ca-trust/source/anchors/{{ root_ca_filename }}
|
||||
# {% else %}
|
||||
# /etc/ssl/certs/{{ root_ca_filename }}
|
||||
# {% endif %}
|
||||
|
||||
# - name: Update CA trust store
|
||||
# command: "{{ ca_update_command }}"
|
||||
# vars:
|
||||
# ca_update_command: >-
|
||||
# {% if ansible_facts['os_family'] == 'Debian' %}
|
||||
# update-ca-certificates
|
||||
# {% elif ansible_facts['os_family'] in ['RedHat', 'Fedora'] %}
|
||||
# update-ca-trust
|
||||
# {% else %}
|
||||
# echo 'Please update the CA trust manually'
|
||||
# {% endif %}
|
||||
6
ansible/arcodange/factory/playbooks/ssl/step-ca.yml
Normal file
@@ -0,0 +1,6 @@
|
||||
---
|
||||
- name: Setup step-ca on raspberries
|
||||
hosts: step_ca #raspberries:&local
|
||||
become: yes
|
||||
roles:
|
||||
- step_ca
|
||||
@@ -0,0 +1,41 @@
|
||||
- name: Install iSCSI client for Longhorn on Raspberry Pi
|
||||
hosts: raspberries:&local
|
||||
become: yes
|
||||
tasks:
|
||||
- name: Install open-iscsi
|
||||
ansible.builtin.apt:
|
||||
name: open-iscsi
|
||||
state: present
|
||||
update_cache: yes
|
||||
|
||||
- name: Enable and start iSCSI service
|
||||
ansible.builtin.service:
|
||||
name: iscsid
|
||||
state: started
|
||||
enabled: yes
|
||||
|
||||
- name: Installer cryptsetup
|
||||
ansible.builtin.apt:
|
||||
name: cryptsetup
|
||||
state: present
|
||||
update_cache: yes
|
||||
|
||||
- name: Charger le module noyau dm_crypt
|
||||
ansible.builtin.modprobe:
|
||||
name: dm_crypt
|
||||
state: present
|
||||
|
||||
- name: S'assurer que le module dm_crypt est chargé au démarrage
|
||||
ansible.builtin.lineinfile:
|
||||
path: /etc/modules
|
||||
line: dm_crypt
|
||||
state: present
|
||||
|
||||
- name: Créer dossier longhorn
|
||||
ansible.builtin.file:
|
||||
path: /mnt/arcodange/longhorn
|
||||
state: directory
|
||||
owner: pi
|
||||
group: docker
|
||||
mode: '0774'
|
||||
ignore_errors: true
|
||||
315
ansible/arcodange/factory/playbooks/system/k3s_config.yml
Normal file
@@ -0,0 +1,315 @@
|
||||
---
|
||||
|
||||
- name: System K3S
|
||||
hosts: raspberries:&local
|
||||
|
||||
tasks:
|
||||
- name: prepare inventory for k3s external playbook
|
||||
tags: always
|
||||
ansible.builtin.add_host:
|
||||
hostname: "{{ item }}"
|
||||
groups:
|
||||
- k3s_cluster
|
||||
- "{{ ansible_loop.first | ternary('server', 'agent') }}"
|
||||
loop: "{{ groups.raspberries | intersect(groups.local) | sort }}"
|
||||
loop_control:
|
||||
extended: true
|
||||
extended_allitems: false
|
||||
|
||||
- name: how to reach k3s
|
||||
hosts: server
|
||||
tasks:
|
||||
|
||||
- name: setup longhorn for volumes https://docs.k3s.io/helm
|
||||
become: true
|
||||
ansible.builtin.copy:
|
||||
dest: /var/lib/rancher/k3s/server/manifests/longhorn-install.yaml
|
||||
content: |-
|
||||
apiVersion: helm.cattle.io/v1
|
||||
kind: HelmChart
|
||||
metadata:
|
||||
annotations:
|
||||
helmcharts.cattle.io/managed-by: helm-controller
|
||||
finalizers:
|
||||
- wrangler.cattle.io/on-helm-chart-remove
|
||||
generation: 1
|
||||
name: longhorn-install
|
||||
namespace: kube-system
|
||||
spec:
|
||||
version: v1.9.1
|
||||
chart: longhorn
|
||||
repo: https://charts.longhorn.io
|
||||
failurePolicy: abort
|
||||
targetNamespace: longhorn-system
|
||||
createNamespace: true
|
||||
valuesContent: |-
|
||||
defaultSettings:
|
||||
defaultDataPath: /mnt/arcodange/longhorn
|
||||
vars:
|
||||
longhorn_helm_values: {} # https://github.com/longhorn/longhorn/blob/master/chart/values.yaml
|
||||
|
||||
- name: customize k3s traefik configuration https://docs.k3s.io/helm
|
||||
block:
|
||||
- name: Get my public IP
|
||||
community.general.ipify_facts:
|
||||
- become: true
|
||||
ansible.builtin.copy:
|
||||
dest: /var/lib/rancher/k3s/server/manifests/traefik-v3.yaml
|
||||
content: |-
|
||||
apiVersion: v1
|
||||
data:
|
||||
dynamic.yaml: |-
|
||||
{{ traefik_config_yaml | to_nice_yaml | indent( width=4 ) }}
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: traefik-configmap
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: helm.cattle.io/v1
|
||||
kind: HelmChart
|
||||
metadata:
|
||||
name: traefik
|
||||
namespace: kube-system
|
||||
spec:
|
||||
repo: https://traefik.github.io/charts
|
||||
chart: traefik
|
||||
version: v37.4.0
|
||||
targetNamespace: kube-system
|
||||
valuesContent: |-
|
||||
{{ traefik_helm_values | to_nice_yaml | indent( width=4 ) }}
|
||||
---
|
||||
apiVersion: cert-manager.io/v1
|
||||
kind: Certificate
|
||||
metadata:
|
||||
name: wildcard-arcodange-lab
|
||||
namespace: kube-system
|
||||
spec:
|
||||
secretName: wildcard-arcodange-lab
|
||||
issuerRef:
|
||||
name: step-issuer
|
||||
kind: StepClusterIssuer
|
||||
group: certmanager.step.sm
|
||||
dnsNames:
|
||||
- arcodange.lab
|
||||
- "*.arcodange.lab"
|
||||
---
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: TLSStore
|
||||
metadata:
|
||||
name: default
|
||||
namespace: kube-system
|
||||
spec:
|
||||
defaultCertificate:
|
||||
secretName: wildcard-arcodange-lab
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: gitea-external
|
||||
namespace: kube-system
|
||||
spec:
|
||||
type: ExternalName
|
||||
externalName: {{ hostvars[groups.gitea[0]]['preferred_ip'] }}
|
||||
ports:
|
||||
- port: 3000
|
||||
targetPort: 3000
|
||||
vars:
|
||||
traefik_config_yaml:
|
||||
http:
|
||||
services:
|
||||
gitea:
|
||||
loadBalancer:
|
||||
servers:
|
||||
- url: "http://{{ hostvars[groups.gitea[0]]['preferred_ip'] }}:3000"
|
||||
routers:
|
||||
dashboard:
|
||||
# rule: Host(`traefik.arcodange.duckdns.org`)
|
||||
rule: Host(`traefik.arcodange.lab`)
|
||||
service: api@internal
|
||||
middlewares:
|
||||
- localIp
|
||||
# tls:
|
||||
# certResolver: letsencrypt
|
||||
# domains:
|
||||
# - main: "arcodange.duckdns.org"
|
||||
# sans:
|
||||
# - "traefik.arcodange.duckdns.org"
|
||||
entryPoints:
|
||||
- websecure
|
||||
- web
|
||||
acme-challenge:
|
||||
rule: Host(`arcodange.duckdns.org`) && PathPrefix(`/.well-known/acme-challenge`)
|
||||
service: acme-http@internal
|
||||
tls:
|
||||
certResolver: letsencrypt
|
||||
domains:
|
||||
- main: "arcodange.duckdns.org"
|
||||
sans:
|
||||
- "*.arcodange.duckdns.org"
|
||||
entryPoints:
|
||||
- websecure
|
||||
- web
|
||||
gitea:
|
||||
# rule: Host(`gitea.arcodange.duckdns.org`)
|
||||
rule: Host(`gitea.arcodange.lab`)
|
||||
service: gitea
|
||||
middlewares:
|
||||
- localIp
|
||||
# tls:
|
||||
# certResolver: letsencrypt
|
||||
# domains:
|
||||
# - main: "arcodange.duckdns.org"
|
||||
# sans:
|
||||
# - "gitea.arcodange.duckdns.org"
|
||||
entrypoints:
|
||||
- websecure
|
||||
middlewares:
|
||||
localIp:
|
||||
ipAllowList:
|
||||
sourceRange:
|
||||
- "172.16.0.0/12"
|
||||
- "10.42.0.0/16"
|
||||
- "192.168.1.0/24"
|
||||
- "{{ ipify_public_ip }}/32"
|
||||
# - "0.0.0.0/0"
|
||||
# ipStrategy:
|
||||
# depth: 1
|
||||
traefik_helm_values:
|
||||
deployment:
|
||||
kind: "Deployment"
|
||||
initContainers:
|
||||
- name: volume-permissions
|
||||
image: busybox:latest
|
||||
command: ["sh", "-c", "touch /data/acme.json; chmod -v 600 /data/acme.json"]
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
# default is https://github.com/traefik/traefik-helm-chart/blob/v25.0.0/traefik/values.yaml <- for v25 (`kubectl describe deployments.apps traefik -n kube-system | grep helm.sh/chart`)
|
||||
# current is https://github.com/traefik/traefik-helm-chart/blob/v37.4.0/traefik/values.yaml
|
||||
nodeSelector:
|
||||
node-role.kubernetes.io/control-plane: 'true' # make a predictable choice of node to direct https traffic to this node and avoid NAT/loss of client IP
|
||||
service:
|
||||
spec:
|
||||
externalTrafficPolicy: Local
|
||||
ports:
|
||||
traefik:
|
||||
expose:
|
||||
default: true
|
||||
web:
|
||||
forwardedHeaders:
|
||||
trustedIPs: ["10.42.0.0/16"] #default k3s cidr
|
||||
ingressRoute:
|
||||
dashboard:
|
||||
enabled: true
|
||||
globalArguments: [] # deactivate --global.sendanonymoususage
|
||||
env:
|
||||
- name: POD_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.name
|
||||
- name: POD_NAMESPACE
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: metadata.namespace
|
||||
- name: LEGO_DISABLE_CNAME_SUPPORT
|
||||
value: 'true'
|
||||
logs:
|
||||
general:
|
||||
level: INFO
|
||||
# format: json
|
||||
access:
|
||||
enabled: true
|
||||
timezone: Europe/Paris
|
||||
# format: json
|
||||
podSecurityContext:
|
||||
runAsGroup: 65532
|
||||
runAsNonRoot: true
|
||||
runAsUser: 65532
|
||||
fsGroup: 65532 # else the persistent volume might be owned by root and be unwritable
|
||||
persistence:
|
||||
# -- Enable persistence using Persistent Volume Claims
|
||||
# ref: http://kubernetes.io/docs/user-guide/persistent-volumes/
|
||||
# It can be used to store TLS certificates, see `storage` in certResolvers
|
||||
enabled: true
|
||||
name: data
|
||||
# existingClaim: ""
|
||||
accessMode: ReadWriteOnce
|
||||
size: 128Mi
|
||||
storageClass: "longhorn"
|
||||
# volumeName: ""
|
||||
path: /data
|
||||
annotations: {}
|
||||
volumes:
|
||||
- name: traefik-configmap
|
||||
mountPath: /config
|
||||
type: configMap
|
||||
experimental:
|
||||
plugins:
|
||||
crowdsec-bouncer:
|
||||
moduleName: github.com/maxlerebourg/crowdsec-bouncer-traefik-plugin #https://plugins.traefik.io/plugins/6335346ca4caa9ddeffda116/crowdsec-bouncer-traefik-plugin
|
||||
version: v1.3.3
|
||||
additionalArguments:
|
||||
- '--providers.file.filename=/config/dynamic.yaml'
|
||||
- '--providers.kubernetesingress.ingressendpoint.publishedservice=kube-system/traefik'
|
||||
- "--providers.kubernetescrd.allowcrossnamespace=true"
|
||||
- "--providers.kubernetescrd.allowExternalNameServices=true"
|
||||
certificatesResolvers:
|
||||
letsencrypt:
|
||||
acme:
|
||||
# for challenge options cf. https://doc.traefik.io/traefik/https/acme/
|
||||
email: arcodange@gmail.com
|
||||
tlsChallenge: true
|
||||
dnsChallenge:
|
||||
# requires env variable DUCKDNS_TOKEN
|
||||
provider: duckdns
|
||||
propagation:
|
||||
delayBeforeChecks: 120
|
||||
disableChecks: true
|
||||
resolvers:
|
||||
- "1.1.1.1:53"
|
||||
- "8.8.8.8:53"
|
||||
httpChallenge:
|
||||
entryPoint: "web"
|
||||
# It has to match the path with a persistent volume
|
||||
storage: /data/acme.json
|
||||
envFrom:
|
||||
- secretRef:
|
||||
name: traefik-duckdns-token
|
||||
# MY_TOKEN=<my token (see https://www.duckdns.org/domains)>
|
||||
# kubectl create secret generic traefik-duckdns-token --from-literal="DUCKDNS_TOKEN=$MY_TOKEN" -n kube-system
|
||||
- name: touch manifests/traefik.yaml to trigger update
|
||||
ansible.builtin.file:
|
||||
path: /var/lib/rancher/k3s/server/manifests/traefik-v3.yaml
|
||||
state: touch
|
||||
become: true
|
||||
|
||||
|
||||
# ---
|
||||
|
||||
- name: redeploy traefik
|
||||
hosts: localhost
|
||||
tasks:
|
||||
- name: delete old traefik deployment
|
||||
kubernetes.core.k8s:
|
||||
api_version: v1
|
||||
name: traefik
|
||||
kind: Deployment
|
||||
namespace: kube-system
|
||||
state: "absent"
|
||||
- name: delete old deployment job so the k3s helm controller redeploys with our new configuration
|
||||
kubernetes.core.k8s:
|
||||
api_version: batch/v1
|
||||
name: helm-install-traefik
|
||||
kind: Job
|
||||
namespace: kube-system
|
||||
state: "absent"
|
||||
- name: get traefik deployment
|
||||
kubernetes.core.k8s_info:
|
||||
api_version: v1
|
||||
name: traefik
|
||||
kind: Deployment
|
||||
namespace: kube-system
|
||||
wait: true
|
||||
register: traefik_deployment
|
||||
- ansible.builtin.debug:
|
||||
var: traefik_deployment
|
||||
60
ansible/arcodange/factory/playbooks/system/k3s_dns.yml
Normal file
@@ -0,0 +1,60 @@
|
||||
# https://docs.k3s.io/advanced#coredns-custom-configuration-imports
|
||||
---
|
||||
- name: "Déclarer le ConfigMap coredns-custom pour arcodange.lab"
|
||||
hosts: localhost
|
||||
gather_facts: false
|
||||
|
||||
vars:
|
||||
pihole_ips: "{{ groups['pihole'] | map('extract', hostvars) | map(attribute='preferred_ip') | list }}"
|
||||
coredns_namespace: "kube-system"
|
||||
|
||||
tasks:
|
||||
- name: "Créer / mettre à jour le ConfigMap coredns-custom"
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: coredns-custom
|
||||
namespace: "{{ coredns_namespace }}"
|
||||
data:
|
||||
arcodange-lab.server: |
|
||||
arcodange.lab:53 {
|
||||
errors
|
||||
cache 30
|
||||
forward . {{ pihole_ips | map('regex_replace', '^(.*)$', '\1:53') | join(' ') }}
|
||||
}
|
||||
|
||||
- name: "Mettre à jour le ConfigMap CoreDNS principal pour utiliser les Pi-holes HA"
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: coredns
|
||||
namespace: "{{ coredns_namespace }}"
|
||||
data:
|
||||
Corefile: |
|
||||
.:53 {
|
||||
errors
|
||||
health
|
||||
ready
|
||||
kubernetes cluster.local in-addr.arpa ip6.arpa {
|
||||
pods insecure
|
||||
fallthrough in-addr.arpa ip6.arpa
|
||||
}
|
||||
hosts /etc/coredns/NodeHosts {
|
||||
ttl 60
|
||||
reload 15s
|
||||
fallthrough
|
||||
}
|
||||
prometheus :9153
|
||||
cache 30
|
||||
loop
|
||||
reload
|
||||
import /etc/coredns/custom/*.override
|
||||
import /etc/coredns/custom/*.server
|
||||
forward . {{ pihole_ips | map('regex_replace', '^(.*)$', '\1:53') | join(' ') }}
|
||||
}
|
||||
172
ansible/arcodange/factory/playbooks/system/k3s_ssl.yml
Normal file
@@ -0,0 +1,172 @@
|
||||
---
|
||||
- name: System K3S
|
||||
hosts: raspberries:&local
|
||||
|
||||
tasks:
|
||||
- name: prepare inventory for k3s external playbook
|
||||
tags: always
|
||||
ansible.builtin.add_host:
|
||||
hostname: "{{ item }}"
|
||||
groups:
|
||||
- k3s_cluster
|
||||
- "{{ ansible_loop.first | ternary('server', 'agent') }}"
|
||||
loop: "{{ groups.raspberries | intersect(groups.local) | sort }}"
|
||||
loop_control:
|
||||
extended: true
|
||||
extended_allitems: false
|
||||
|
||||
# =========================
|
||||
# Play 1 — Read step-ca PKI
|
||||
# =========================
|
||||
- name: Collect PKI material from step-ca
|
||||
hosts: localhost
|
||||
gather_facts: false
|
||||
|
||||
vars:
|
||||
step_ca_primary: pi1
|
||||
step_ca_user: step
|
||||
step_ca_root: "/home/step/.step/certs/root_ca.crt"
|
||||
tmp_dir: /tmp/step-ca-cert-manager
|
||||
|
||||
tasks:
|
||||
- name: Ensure local temp directory exists
|
||||
file:
|
||||
path: "{{ tmp_dir }}"
|
||||
state: directory
|
||||
mode: "0700"
|
||||
|
||||
- name: Fetch root CA
|
||||
fetch:
|
||||
src: "{{ step_ca_root }}"
|
||||
dest: "{{ tmp_dir }}/root_ca.crt"
|
||||
flat: true
|
||||
delegate_to: "{{ step_ca_primary }}"
|
||||
become: true
|
||||
become_user: "{{ step_ca_user }}"
|
||||
run_once: true
|
||||
|
||||
- name: Read and decode PKI material
|
||||
slurp:
|
||||
src: "{{ item }}"
|
||||
loop:
|
||||
- "{{ tmp_dir }}/root_ca.crt"
|
||||
register: pki_raw
|
||||
|
||||
- name: Set PKI facts
|
||||
set_fact:
|
||||
root_ca_b64: "{{ (pki_raw.results | selectattr('item','equalto', tmp_dir + '/root_ca.crt') | first).content }}"
|
||||
|
||||
# =========================
|
||||
# Play 2 — Deploy to k3s
|
||||
# =========================
|
||||
- name: Deploy cert-manager and step-ca integration on k3s server
|
||||
hosts: server
|
||||
gather_facts: false
|
||||
become: true
|
||||
|
||||
vars:
|
||||
namespace: cert-manager
|
||||
jwk_provisioner_name: cert-manager
|
||||
jwk_secret_name: step-jwk-password
|
||||
clusterissuer_name: step-ca
|
||||
step_ca_url: "https://ssl-ca.arcodange.lab:8443"
|
||||
cert_manager_version: v1.19.2
|
||||
|
||||
tasks:
|
||||
|
||||
- name: Get cert-manager provisioner info from step-ca
|
||||
command: >
|
||||
step ca provisioner list
|
||||
register: provisioners_json
|
||||
delegate_to: "{{ step_ca_primary }}"
|
||||
become: true
|
||||
become_user: "{{ step_ca_user }}"
|
||||
run_once: true
|
||||
|
||||
- name: Set fact jwk_kid from provisioner
|
||||
set_fact:
|
||||
jwk_kid: >-
|
||||
{{
|
||||
(provisioners_json.stdout | from_json
|
||||
| selectattr('name', 'equalto', jwk_provisioner_name) | list
|
||||
| first).key.kid
|
||||
}}
|
||||
|
||||
- name: Compute PKI checksum
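  # the resulting value is stamped as an annotation on every manifest below, so the k3s helm controller reapplies them whenever the PKI material (root CA, provisioner kid, CA URL, chart version) changes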
|
||||
set_fact:
|
||||
pki_checksum: >-
|
||||
{{
|
||||
(hostvars['localhost'].root_ca_b64
|
||||
~ jwk_kid
|
||||
~ step_ca_url
|
||||
~ cert_manager_version) | hash('sha256')
|
||||
}}
|
||||
|
||||
- name: Install cert-manager and step-ca via k3s static manifest
|
||||
copy:
|
||||
dest: /var/lib/rancher/k3s/server/manifests/cert-manager-step-ca.yaml
|
||||
mode: "0600"
|
||||
content: |-
|
||||
apiVersion: helm.cattle.io/v1
|
||||
kind: HelmChart
|
||||
metadata:
|
||||
name: cert-manager
|
||||
namespace: kube-system
|
||||
annotations:
|
||||
pki.arcodange.lab/checksum: "{{ pki_checksum }}"
|
||||
spec:
|
||||
chart: cert-manager
|
||||
repo: https://charts.jetstack.io
|
||||
version: {{ cert_manager_version }}
|
||||
targetNamespace: cert-manager
|
||||
createNamespace: true
|
||||
valuesContent: |-
|
||||
installCRDs: true
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: {{ jwk_secret_name }}
|
||||
namespace: {{ namespace }}
|
||||
annotations:
|
||||
pki.arcodange.lab/checksum: "{{ pki_checksum }}"
|
||||
type: Opaque
|
||||
stringData:
|
||||
password: >-
|
||||
{{ hostvars[step_ca_primary].vault_step_ca_jwk_password }}
|
||||
---
|
||||
apiVersion: helm.cattle.io/v1
|
||||
kind: HelmChart
|
||||
metadata:
|
||||
name: step-issuer
|
||||
namespace: kube-system
|
||||
annotations:
|
||||
pki.arcodange.lab/checksum: "{{ pki_checksum }}"
|
||||
spec:
|
||||
chart: step-issuer
|
||||
repo: https://smallstep.github.io/helm-charts
|
||||
version: 1.9.11
|
||||
targetNamespace: {{ namespace }}
|
||||
createNamespace: false
|
||||
valuesContent: |-
|
||||
certManager:
|
||||
namespace: {{ namespace }}
|
||||
stepClusterIssuer:
|
||||
create: true
|
||||
caUrl: "{{ step_ca_url }}"
|
||||
caBundle: "{{ hostvars['localhost'].root_ca_b64 }}"
|
||||
provisioner:
|
||||
name: {{ jwk_provisioner_name }}
|
||||
kid: "{{ jwk_kid }}"
|
||||
passwordRef:
|
||||
name: {{ jwk_secret_name }}
|
||||
namespace: {{ namespace }}
|
||||
key: password
|
||||
# Override kube-rbac-proxy image to use ARM64-compatible version.
|
||||
# Note: pi3 (ARM64) requires an ARM64-compatible image, while pi2 (ARMv7) may work with AMD64 images.
|
||||
# The default image (gcr.io/kubebuilder/kube-rbac-proxy:v0.15.0) is AMD64-only and fails on pi3.
|
||||
kubeRBACproxy:
|
||||
image:
|
||||
repository: quay.io/brancz/kube-rbac-proxy
|
||||
tag: v0.15.0
|
||||
|
||||
161
ansible/arcodange/factory/playbooks/system/pki.md
Normal file
@@ -0,0 +1,161 @@
|
||||
# PKI
|
||||
|
||||
Explanations generated by ChatGPT to describe the SSL setup via "step"
|
||||
|
||||
```mermaid
|
||||
---
|
||||
config:
|
||||
logLevel: debug
|
||||
theme: forest
|
||||
---
|
||||
flowchart TB
|
||||
%% PKI
|
||||
subgraph PKI["Step CA / PKI (Pi1)"]
|
||||
style PKI fill:#ffe0b2,stroke:#ff8c00,stroke-width:2px
|
||||
A[Step CA primaire]:::stepCA
|
||||
B[JWK Provisioner]:::jwk
|
||||
C[Root CA]:::root
|
||||
end
|
||||
|
||||
%% Contrôleur Ansible
|
||||
subgraph Controller["Contrôleur Ansible / Mac"]
|
||||
style Controller fill:#e0f7fa,stroke:#00acc1,stroke-width:2px
|
||||
D[Fetch JWK + Root CA]:::ansible
|
||||
E[Secrets K8s: step-jwk, step-root-ca]:::k8sSecret
|
||||
F[ClusterIssuer cert-manager]:::clusterIssuer
|
||||
end
|
||||
|
||||
%% K3s Cluster + Traefik
|
||||
subgraph K3sCluster["K3s Cluster"]
|
||||
style K3sCluster fill:#f1f8e9,stroke:#558b2f,stroke-width:2px
|
||||
T[Traefik Ingress Controller]:::traefik
|
||||
H[Webapp Pods]:::webapp
|
||||
G["Gitea Service (ExternalName → pi2.home:3000)"]:::gitea
|
||||
end
|
||||
|
||||
Users[Clients / Navigateurs]:::clients
|
||||
|
||||
%% Flèches
|
||||
%% PKI → Controller
|
||||
A --> B
|
||||
C --> D
|
||||
B --> D
|
||||
D --> E
|
||||
E --> F
|
||||
|
||||
%% ClusterIssuer → Traefik services
|
||||
F --> H
|
||||
F --> G
|
||||
|
||||
%% Traefik expose tous les services
|
||||
T --> H
|
||||
T --> G
|
||||
Users -->|HTTPS / HTTP| T
|
||||
|
||||
%% PKI direct (optional, for clarity)
|
||||
A -->|Sign initial cert| F
|
||||
|
||||
%% Styling classes
|
||||
classDef stepCA fill:#fff3e0,stroke:#ff6f00,stroke-width:1px
|
||||
classDef jwk fill:#fff9c4,stroke:#fbc02d,stroke-width:1px
|
||||
classDef root fill:#ffe0b2,stroke:#ff8c00,stroke-width:1px
|
||||
classDef ansible fill:#b2ebf2,stroke:#00acc1,stroke-width:1px
|
||||
classDef k8sSecret fill:#b3e5fc,stroke:#0288d1,stroke-width:1px
|
||||
classDef clusterIssuer fill:#81d4fa,stroke:#0277bd,stroke-width:1px
|
||||
classDef gitea fill:#c8e6c9,stroke:#388e3c,stroke-width:1px
|
||||
classDef webapp fill:#a5d6a7,stroke:#2e7d32,stroke-width:1px
|
||||
classDef traefik fill:#ffe082,stroke:#ff8f00,stroke-width:1px
|
||||
classDef clients fill:#eeeeee,stroke:#9e9e9e,stroke-width:1px
|
||||
```
|
||||
|
||||
- 🔵 PKI (Step CA): the source of trust. All HTTPS certificates come from it.
- 🔵 JWK Provisioner: lets cert-manager request certificates automatically.
- 🟢 Ansible controller: centralizes the keys, creates the K8s Secrets and the ClusterIssuer.
- 🟢 Secrets & ClusterIssuer: allow cert-manager inside K3s to authenticate and obtain TLS certificates.
- 🟢 Webapp pods: get their certificates through cert-manager, so HTTPS works automatically (see the sketch below).
- 🔵 Gitea: receives a certificate signed directly by Step CA and serves HTTPS outside K3s.
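
For reference, a minimal `Certificate` that a webapp could request from this issuer might look like the sketch below. The resource names are illustrative; the `issuerRef` mirrors the wildcard certificate declared in `k3s_config.yml`.

```yaml
# Illustrative sketch: ask cert-manager for a TLS cert signed by the step-issuer
apiVersion: cert-manager.io/v1
kind: Certificate
metadata:
  name: my-app-tls        # hypothetical name
  namespace: my-app       # hypothetical namespace
spec:
  secretName: my-app-tls  # cert-manager stores the signed cert and key in this Secret
  dnsNames:
    - my-app.arcodange.lab
  issuerRef:
    name: step-issuer
    kind: StepClusterIssuer
    group: certmanager.step.sm
```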
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
%% PKI
|
||||
subgraph PKI["Step CA / PKI (Pi1)"]
|
||||
style PKI fill:#ffe0b2,stroke:#ff8c00,stroke-width:2px
|
||||
A[1️⃣ Initialisation Step CA primaire]:::stepCA
|
||||
B[2️⃣ Création JWK Provisioner pour K3s]:::jwk
|
||||
C[Root CA]:::root
|
||||
end
|
||||
|
||||
%% Contrôleur Ansible
|
||||
subgraph Controller["Contrôleur Ansible / Mac"]
|
||||
style Controller fill:#e0f7fa,stroke:#00acc1,stroke-width:2px
|
||||
D[3️⃣ Fetch JWK + Root CA depuis Step CA]:::ansible
|
||||
E[4️⃣ Création / Mise à jour des Secrets K8s]:::k8sSecret
|
||||
F[5️⃣ Création / Mise à jour ClusterIssuer cert-manager]:::clusterIssuer
|
||||
end
|
||||
|
||||
%% K3s Cluster + Traefik
|
||||
subgraph K3sCluster["K3s Cluster"]
|
||||
style K3sCluster fill:#f1f8e9,stroke:#558b2f,stroke-width:2px
|
||||
T[6️⃣ Traefik Ingress Controller]:::traefik
|
||||
H[7️⃣ Webapp Pods]:::webapp
|
||||
G["8️⃣ Gitea Service (ExternalName → pi2.home:3000)"]:::gitea
|
||||
end
|
||||
|
||||
Users[9️⃣ Client Mac / Navigateurs]:::clients
|
||||
|
||||
%% Flux
|
||||
A --> B
|
||||
C --> D
|
||||
B --> D
|
||||
D --> E
|
||||
E --> F
|
||||
F --> H
|
||||
F --> G
|
||||
T --> H
|
||||
T --> G
|
||||
Users -->|HTTPS / HTTP| T
|
||||
|
||||
%% Styling classes
|
||||
classDef stepCA fill:#fff3e0,stroke:#ff6f00,stroke-width:1px
|
||||
classDef jwk fill:#fff9c4,stroke:#fbc02d,stroke-width:1px
|
||||
classDef root fill:#ffe0b2,stroke:#ff8c00,stroke-width:1px
|
||||
classDef ansible fill:#b2ebf2,stroke:#00acc1,stroke-width:1px
|
||||
classDef k8sSecret fill:#b3e5fc,stroke:#0288d1,stroke-width:1px
|
||||
classDef clusterIssuer fill:#81d4fa,stroke:#0277bd,stroke-width:1px
|
||||
classDef gitea fill:#c8e6c9,stroke:#388e3c,stroke-width:1px
|
||||
classDef webapp fill:#a5d6a7,stroke:#2e7d32,stroke-width:1px
|
||||
classDef traefik fill:#ffe082,stroke:#ff8f00,stroke-width:1px
|
||||
classDef clients fill:#eeeeee,stroke:#9e9e9e,stroke-width:1px
|
||||
```
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
subgraph Cluster["Cluster Kubernetes (k3s)"]
|
||||
subgraph CertManager["Cert-Manager"]
|
||||
ClusterIssuer["ClusterIssuer\n(type: smallstep)"]
|
||||
end
|
||||
|
||||
subgraph Traefik["Traefik (Ingress Controller)"]
|
||||
TLSStore["TLSStore\n(Traefik v2+)"]
|
||||
IngressRoute["IngressRoute\n(TLS: my-tls-store)"]
|
||||
end
|
||||
|
||||
subgraph Apps["Applications"]
|
||||
App1[Service: my-app]
|
||||
App2[Service: my-api]
|
||||
end
|
||||
end
|
||||
|
||||
subgraph Smallstep["Smallstep PKI (step-ca)"]
|
||||
StepCA["step-ca\n(CA interne)"]
|
||||
end
|
||||
|
||||
%% Interactions
|
||||
ClusterIssuer -- "1. Demande de certificat\n(CertificateRequest)" --> StepCA
|
||||
StepCA -- "2. Émet un certificat\n(signé par la CA)" --> ClusterIssuer
|
||||
ClusterIssuer -- "3. Stocke le certificat\n(dans un Secret Kubernetes)" --> Secret[(Secret: my-app-tls)]
|
||||
Secret -- "4. Référencé par" --> TLSStore
|
||||
TLSStore -- "5. Fournit le certificat\n(TLS Termination)" --> IngressRoute
|
||||
IngressRoute -- "6. Route le trafic HTTPS\nvers" --> App1
|
||||
IngressRoute -- "6. Route le trafic HTTPS\nvers" --> App2
|
||||
```
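
Steps 3 to 6 of this flow, expressed as manifests: a sketch only (resource names are made up; the `TLSStore` mirrors the default one declared in `k3s_config.yml`), showing how the Secret written by cert-manager is consumed by Traefik.

```yaml
# Illustrative sketch: consume the cert-manager Secret from Traefik
apiVersion: traefik.io/v1alpha1
kind: TLSStore
metadata:
  name: default
  namespace: kube-system
spec:
  defaultCertificate:
    secretName: my-app-tls        # Secret produced at step 3
---
apiVersion: traefik.io/v1alpha1
kind: IngressRoute
metadata:
  name: my-app                    # hypothetical route
  namespace: my-app
spec:
  entryPoints:
    - websecure
  routes:
    - match: Host(`my-app.arcodange.lab`)
      kind: Rule
      services:
        - name: my-app
          port: 80
  tls: {}                         # terminate TLS with the default certificate from the TLSStore
```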
|
||||
27
ansible/arcodange/factory/playbooks/system/rpi.yml
Normal file
@@ -0,0 +1,27 @@
|
||||
- name: Raspberry pi general setup
|
||||
hosts: raspberries:&local
|
||||
gather_facts: yes
|
||||
tags: never
|
||||
become: yes
|
||||
|
||||
tasks:
|
||||
|
||||
- name: set hostname
|
||||
ansible.builtin.hostname:
|
||||
name: "{{ inventory_hostname }}"
|
||||
become: yes
|
||||
when: inventory_hostname != ansible_hostname
|
||||
|
||||
- name: Ensure dnsmasq user is in dip group for Pi-hole DNS
|
||||
ansible.builtin.user:
|
||||
name: dnsmasq
|
||||
groups: dip
|
||||
append: yes
|
||||
when: "'pihole' in group_names"
|
||||
|
||||
- name: Disable dnsmasq service on Pi-hole nodes to avoid port 53 conflict with pihole-FTL
|
||||
ansible.builtin.systemd:
|
||||
name: dnsmasq
|
||||
state: stopped
|
||||
enabled: no
|
||||
when: "'pihole' in group_names"
|
||||
31
ansible/arcodange/factory/playbooks/system/system.yml
Normal file
@@ -0,0 +1,31 @@
---

- name: Setup général des rpis
  ansible.builtin.import_playbook: rpi.yml

- name: dns
  ansible.builtin.import_playbook: ../dns/dns.yml

- name: ssl
  ansible.builtin.import_playbook: ../ssl/ssl.yml

- name: Préparer les disques pour Longhorn
  ansible.builtin.import_playbook: prepare_disks.yml

- name: Installer et configurer Docker
  ansible.builtin.import_playbook: system_docker.yml

- name: Installer le client iSCSI pour Longhorn
  ansible.builtin.import_playbook: iscsi_longhorn.yml

- name: Préparer l'inventaire et installer K3s
  ansible.builtin.import_playbook: system_k3s.yml

- name: Configurer K3S Core DNS
  ansible.builtin.import_playbook: k3s_dns.yml

- name: Configurer K3S Cert Issuer
  ansible.builtin.import_playbook: k3s_ssl.yml

- name: Configurer K3s (kubeconfig, Longhorn, Traefik)
  ansible.builtin.import_playbook: k3s_config.yml
110
ansible/arcodange/factory/playbooks/system/system_docker.yml
Normal file
@@ -0,0 +1,110 @@
|
||||
- name: System Docker
|
||||
hosts: raspberries:&local
|
||||
gather_facts: yes
|
||||
tags: never
|
||||
become: yes
|
||||
|
||||
pre_tasks:
|
||||
|
||||
- name: Prevent apt source conflict
|
||||
ansible.builtin.file:
|
||||
state: absent
|
||||
path: /etc/apt/sources.list.d/docker.list
|
||||
become: yes
|
||||
|
||||
- name: Install role geerlingguy.docker
|
||||
community.general.ansible_galaxy_install:
|
||||
type: role
|
||||
name: geerlingguy.docker
|
||||
run_once: true
|
||||
delegate_to: localhost
|
||||
become: false
|
||||
|
||||
- ansible.builtin.debug:
|
||||
var: ansible_facts.machine
|
||||
|
||||
tasks:
|
||||
|
||||
- include_role:
|
||||
name: geerlingguy.docker
|
||||
|
||||
|
||||
- name: Créer le répertoire /etc/docker s'il n'existe pas
|
||||
ansible.builtin.file:
|
||||
path: /etc/docker
|
||||
state: directory
|
||||
mode: '0755'
|
||||
|
||||
- name: Check if daemon.json exists
|
||||
ansible.builtin.stat:
|
||||
path: /etc/docker/daemon.json
|
||||
register: docker_config_stat
|
||||
|
||||
- name: Lire la configuration Docker existante
|
||||
ansible.builtin.command: "cat /etc/docker/daemon.json"
|
||||
register: docker_config_raw
|
||||
changed_when: false
|
||||
when: docker_config_stat.stat.exists
|
||||
|
||||
- name: Initialiser la variable de config Docker
|
||||
ansible.builtin.set_fact:
|
||||
docker_config: {}
|
||||
|
||||
- name: Parser le JSON existant si le fichier existe
|
||||
ansible.builtin.set_fact:
|
||||
docker_config: "{{ docker_config_raw.stdout | from_json }}"
|
||||
when: docker_config_raw.stdout is defined and docker_config_raw.stdout != ""
|
||||
|
||||
- name: Mettre à jour la config du logger
|
||||
ansible.builtin.set_fact:
|
||||
docker_config: >
|
||||
{{ docker_config | combine({
|
||||
'log-driver': 'json-file',
|
||||
'log-opts': {
|
||||
'max-size': '10m',
|
||||
'max-file': '5'
|
||||
}
|
||||
}, recursive=True) }}
|
||||
|
||||
- name: Ensure Docker storage directory exists on external disk
|
||||
ansible.builtin.file:
|
||||
path: /mnt/arcodange/docker
|
||||
state: directory
|
||||
mode: '0755'
|
||||
owner: root
|
||||
group: docker
|
||||
when: ansible_facts.mounts | selectattr('mount', 'equalto', '/mnt/arcodange') | list | length > 0
|
||||
|
||||
- name: Configure Docker to use external storage
|
||||
ansible.builtin.set_fact:
|
||||
docker_config: >
|
||||
{{ docker_config | combine({
|
||||
'data-root': '/mnt/arcodange/docker',
|
||||
'storage-driver': 'overlay2'
|
||||
}, recursive=True) }}
|
||||
when: ansible_facts.mounts | selectattr('mount', 'equalto', '/mnt/arcodange') | list | length > 0
|
||||
|
||||
- name: Ensure docker_config is a dictionary
|
||||
ansible.builtin.set_fact:
|
||||
docker_config: "{{ docker_config if docker_config is mapping else {} }}"
|
||||
|
||||
- name: Écrire la configuration mise à jour
|
||||
ansible.builtin.copy:
|
||||
dest: /etc/docker/daemon.json
|
||||
content: "{{ docker_config | to_nice_json(indent=2) }}"
|
||||
mode: '0644'
|
||||
notify: Redémarrer Docker
|
||||
|
||||
handlers:
|
||||
- name: Redémarrer Docker
|
||||
ansible.builtin.service:
|
||||
name: docker
|
||||
state: restarted
|
||||
|
||||
post_tasks:
|
||||
- name: adding existing user '{{ ansible_user }}' to group docker
|
||||
user:
|
||||
name: '{{ ansible_user }}'
|
||||
groups: docker
|
||||
append: yes
|
||||
become: yes
|
||||
63
ansible/arcodange/factory/playbooks/system/system_k3s.yml
Normal file
@@ -0,0 +1,63 @@
|
||||
- name: System K3S
|
||||
hosts: raspberries:&local
|
||||
|
||||
tasks:
|
||||
- name: prepare inventory for k3s external playbook
|
||||
tags: always
|
||||
ansible.builtin.add_host:
|
||||
hostname: "{{ item }}"
|
||||
groups:
|
||||
- k3s_cluster
|
||||
- "{{ ansible_loop.first | ternary('server', 'agent') }}"
|
||||
loop: "{{ groups.raspberries | intersect(groups.local) | sort }}"
|
||||
loop_control:
|
||||
extended: true
|
||||
extended_allitems: false
|
||||
|
||||
- name: Install collection k3s.orchestration
|
||||
local_action:
|
||||
module: community.general.ansible_galaxy_install
|
||||
type: collection
|
||||
name: git+https://github.com/k3s-io/k3s-ansible
|
||||
run_once: true
|
||||
|
||||
- name: Install socat for kubectl port forwarding
|
||||
ansible.builtin.apt:
|
||||
name: socat
|
||||
state: present
|
||||
update_cache: yes
|
||||
become: yes
|
||||
|
||||
- name: k3s
|
||||
ansible.builtin.import_playbook: k3s.orchestration.site
|
||||
# ansible.builtin.import_playbook: k3s.orchestration.upgrade
|
||||
# ansible.builtin.import_playbook: k3s.orchestration.reset
|
||||
vars:
|
||||
k3s_version: v1.34.3+k3s1
|
||||
extra_server_args: >-
|
||||
--docker --disable traefik
|
||||
--kubelet-arg="container-log-max-files=5"
|
||||
--kubelet-arg="container-log-max-size=10Mi"
|
||||
extra_agent_args: >-
|
||||
--docker
|
||||
--kubelet-arg="container-log-max-files=5"
|
||||
--kubelet-arg="container-log-max-size=10Mi"
|
||||
api_endpoint: "{{ hostvars[groups['server'][0]]['ansible_host'] | default(groups['server'][0]) }}"
|
||||
|
||||
- name: how to reach k3s
|
||||
hosts: server
|
||||
tasks:
|
||||
- name: copy /etc/rancher/k3s/k3s.yaml to ~/.kube/config from the k3s server and replace 127.0.0.1 with the server ip or hostname
|
||||
run_once: true
|
||||
block:
|
||||
- ansible.builtin.fetch:
|
||||
src: /etc/rancher/k3s/k3s.yaml
|
||||
dest: ~/.kube/config
|
||||
flat: true
|
||||
become: true
|
||||
run_once: true
|
||||
- local_action:
|
||||
module: ansible.builtin.replace
|
||||
path: ~/.kube/config
|
||||
regexp: 'server: https://127.0.0.1:6443'
|
||||
replace: 'server: https://{{ ansible_default_ipv4.address }}:6443'
|
||||
@@ -5,77 +5,6 @@
|
||||
# debugger: on_failed
|
||||
|
||||
tasks:
|
||||
- name: Récupérer le nom du pod CrowdSec LAPI
|
||||
kubernetes.core.k8s_info:
|
||||
kind: Pod
|
||||
namespace: tools
|
||||
label_selectors:
|
||||
- k8s-app = crowdsec
|
||||
- type = lapi
|
||||
register: crowdsec_lapi_pods
|
||||
|
||||
- name: Vérifier qu'un pod a été trouvé
|
||||
assert:
|
||||
that: crowdsec_lapi_pods.resources | length > 0
|
||||
fail_msg: "Aucun pod CrowdSec LAPI trouvé dans le namespace 'tools' avec les labels 'k8s-app=crowdsec, type=lapi'."
|
||||
|
||||
- name: Définir le nom du pod CrowdSec LAPI
|
||||
set_fact:
|
||||
crowdsec_lapi_pod_name: "{{ crowdsec_lapi_pods.resources[0].metadata.name }}"
|
||||
|
||||
- name: Récupérer la clé API du bouncer CrowdSec
|
||||
kubernetes.core.k8s_exec:
|
||||
namespace: tools
|
||||
pod: "{{ crowdsec_lapi_pod_name }}"
|
||||
container: crowdsec-lapi
|
||||
command: >
|
||||
cscli bouncers add traefik-plugin
|
||||
register: bouncer_key_result
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Supprimer le bouncer existant en cas d'échec
|
||||
kubernetes.core.k8s_exec:
|
||||
namespace: tools
|
||||
pod: "{{ crowdsec_lapi_pod_name }}"
|
||||
container: crowdsec-lapi
|
||||
command: >
|
||||
cscli bouncers delete traefik-plugin
|
||||
when: bouncer_key_result.failed
|
||||
|
||||
- name: Réessayer de récupérer la clé API
|
||||
kubernetes.core.k8s_exec:
|
||||
namespace: tools
|
||||
pod: "{{ crowdsec_lapi_pod_name }}"
|
||||
container: crowdsec-lapi
|
||||
command: >
|
||||
cscli bouncers add traefik-plugin
|
||||
register: bouncer_key_result
|
||||
when: bouncer_key_result.failed
|
||||
|
||||
- name: Créer le Middleware Traefik pour CrowdSec
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: crowdsec
|
||||
namespace: kube-system
|
||||
spec:
|
||||
plugin:
|
||||
crowdsec-bouncer:
|
||||
enabled: true
|
||||
crowdsecMode: stream
|
||||
crowdsecLapiScheme: http
|
||||
crowdsecLapiHost: crowdsec-service.tools.svc.cluster.local:8080
|
||||
crowdsecLapiKey: "{{ bouncer_key_result.stdout_lines[2].strip() }}"
|
||||
htttTimeoutSeconds: 60
|
||||
crowdsecAppsecEnabled: false
|
||||
crowdsecAppsecHost: crowdsec:7422
|
||||
crowdsecAppsecFailureBlock: true
|
||||
crowdsecAppsecUnreachableBlock: true
|
||||
forwardedHeadersTrustedIPs:
|
||||
- 10.0.10.23/32
|
||||
- 10.0.20.0/24
|
||||
clientTrustedIPs:
|
||||
- 192.168.1.0/24
|
||||
- name: Setup crowdsec middleware for traefik
|
||||
include_role:
|
||||
name: crowdsec
|
||||
@@ -8,7 +8,7 @@
|
||||
- name: gitea_admin_password
|
||||
prompt: Enter gitea admin password
|
||||
unsafe: true # password can contain uncommon chars such as '{'
|
||||
|
||||
|
||||
roles:
|
||||
- arcodange.factory.gitea_token
|
||||
|
||||
|
||||
@@ -0,0 +1 @@
|
||||
traefik_pvc_name: traefik
|
||||
@@ -0,0 +1,94 @@
|
||||
---
|
||||
- name: Inject captcha.html into Traefik PVC
|
||||
block:
|
||||
|
||||
# ---------------------
|
||||
# Scale to 0
|
||||
# ---------------------
|
||||
- name: Scale Traefik to 0
|
||||
kubernetes.core.k8s_scale:
|
||||
api_version: apps/v1
|
||||
kind: Deployment
|
||||
namespace: kube-system
|
||||
name: traefik
|
||||
replicas: 0
|
||||
|
||||
# ---------------------
|
||||
# Create Job
|
||||
# ---------------------
|
||||
- name: Deploy captcha injection Job
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
namespace: kube-system
|
||||
definition:
|
||||
apiVersion: batch/v1
|
||||
kind: Job
|
||||
metadata:
|
||||
name: inject-captcha
|
||||
spec:
|
||||
backoffLimit: 0
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: Never
|
||||
volumes:
|
||||
- name: traefik-data
|
||||
persistentVolumeClaim:
|
||||
claimName: "{{ traefik_pvc_name }}"
|
||||
containers:
|
||||
- name: write-captcha
|
||||
image: alpine:3.20
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
echo "Writing captcha.html into PVC..."
|
||||
cat << 'EOF' > /data/captcha.html
|
||||
{{ lookup('template', 'captcha.html.j2') | indent(20) }}
|
||||
EOF
|
||||
volumeMounts:
|
||||
- name: traefik-data
|
||||
mountPath: /data
|
||||
|
||||
# ---------------------
|
||||
# Wait for job success
|
||||
# ---------------------
|
||||
- name: Wait for Job completion
|
||||
kubernetes.core.k8s_info:
|
||||
api_version: batch/v1
|
||||
kind: Job
|
||||
name: inject-captcha
|
||||
namespace: kube-system
|
||||
register: job_status
|
||||
until: job_status.resources[0].status.succeeded | default(0) | int > 0
|
||||
retries: 20
|
||||
delay: 5
|
||||
|
||||
# ---------------------
|
||||
# Clean Job
|
||||
# ---------------------
|
||||
- name: Remove captcha injection Job
|
||||
kubernetes.core.k8s:
|
||||
state: absent
|
||||
api_version: batch/v1
|
||||
kind: Job
|
||||
name: inject-captcha
|
||||
namespace: kube-system
|
||||
|
||||
rescue:
|
||||
- name: Log failure
|
||||
ansible.builtin.debug:
|
||||
msg: "An error occurred during captcha injection. Traefik will still be scaled back up."
|
||||
|
||||
always:
|
||||
# ---------------------
|
||||
# Ensure Traefik is scaled back to 1 NO MATTER WHAT
|
||||
# ---------------------
|
||||
- name: Ensure Traefik is scaled back to 1
|
||||
kubernetes.core.k8s_scale:
|
||||
api_version: apps/v1
|
||||
kind: Deployment
|
||||
namespace: kube-system
|
||||
name: traefik
|
||||
replicas: 1
|
||||
wait: yes
|
||||
wait_timeout: 300
|
||||
@@ -0,0 +1,186 @@
|
||||
- name: Créer le ServiceAccount pour l'authentification Vault
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: factory-ansible-tool-crowdsec-traefik-plugin
|
||||
namespace: kube-system
|
||||
wait: yes
|
||||
wait_timeout: 30
|
||||
|
||||
- name: Créer la ressource VaultAuth
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: secrets.hashicorp.com/v1beta1
|
||||
kind: VaultAuth
|
||||
metadata:
|
||||
name: factory-ansible-tool-crowdsec
|
||||
namespace: kube-system
|
||||
spec:
|
||||
method: kubernetes
|
||||
mount: kubernetes
|
||||
kubernetes:
|
||||
role: factory_crowdsec_conf
|
||||
serviceAccount: factory-ansible-tool-crowdsec-traefik-plugin
|
||||
audiences:
|
||||
- vault
|
||||
wait: yes
|
||||
wait_timeout: 30
|
||||
|
||||
- name: Créer la ressource VaultStaticSecret
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: secrets.hashicorp.com/v1beta1
|
||||
kind: VaultStaticSecret
|
||||
metadata:
|
||||
name: factory-ansible-tool-crowdsec-turnstile-secret
|
||||
namespace: kube-system
|
||||
spec:
|
||||
type: kv-v2
|
||||
mount: kvv2
|
||||
path: cms/factory/turnstile
|
||||
destination:
|
||||
name: factory-ansible-tool-crowdsec-traefik-plugin-captcha-params
|
||||
create: true
|
||||
refreshAfter: 30s
|
||||
vaultAuthRef: factory-ansible-tool-crowdsec
|
||||
wait: yes
|
||||
wait_timeout: 30
|
||||
|
||||
- name: Récupérer le secret Kubernetes
|
||||
kubernetes.core.k8s_info:
|
||||
kind: Secret
|
||||
name: factory-ansible-tool-crowdsec-traefik-plugin-captcha-params
|
||||
namespace: kube-system
|
||||
register: crowdsec_captcha_secret
|
||||
|
||||
- name: Récupérer le nom du pod CrowdSec LAPI
|
||||
kubernetes.core.k8s_info:
|
||||
kind: Pod
|
||||
namespace: tools
|
||||
label_selectors:
|
||||
- k8s-app = crowdsec
|
||||
- type = lapi
|
||||
register: crowdsec_lapi_pods
|
||||
|
||||
- name: Vérifier qu'un pod a été trouvé
|
||||
assert:
|
||||
that: crowdsec_lapi_pods.resources | length > 0
|
||||
fail_msg: "Aucun pod CrowdSec LAPI trouvé dans le namespace 'tools' avec les labels 'k8s-app=crowdsec, type=lapi'."
|
||||
|
||||
- name: Définir le nom du pod CrowdSec LAPI
|
||||
set_fact:
|
||||
crowdsec_lapi_pod_name: "{{ crowdsec_lapi_pods.resources[0].metadata.name }}"
|
||||
|
||||
- name: Récupérer la clé API du bouncer CrowdSec
|
||||
kubernetes.core.k8s_exec:
|
||||
namespace: tools
|
||||
pod: "{{ crowdsec_lapi_pod_name }}"
|
||||
container: crowdsec-lapi
|
||||
command: >
|
||||
cscli bouncers add traefik-plugin
|
||||
register: bouncer_key_result
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Supprimer le bouncer existant en cas d'échec
|
||||
kubernetes.core.k8s_exec:
|
||||
namespace: tools
|
||||
pod: "{{ crowdsec_lapi_pod_name }}"
|
||||
container: crowdsec-lapi
|
||||
command: >
|
||||
cscli bouncers delete traefik-plugin
|
||||
when: bouncer_key_result.failed
|
||||
|
||||
- name: Réessayer de récupérer la clé API
|
||||
kubernetes.core.k8s_exec:
|
||||
namespace: tools
|
||||
pod: "{{ crowdsec_lapi_pod_name }}"
|
||||
container: crowdsec-lapi
|
||||
command: >
|
||||
cscli bouncers add traefik-plugin
|
||||
register: bouncer_key_result
|
||||
when: bouncer_key_result.failed
|
||||
|
||||
- name: Inject captcha.html into Traefik PVC
|
||||
include_tasks: inject_captcha_html.yml
|
||||
tags: never
|
||||
|
||||
- name: Créer le Middleware Traefik pour CrowdSec
|
||||
kubernetes.core.k8s:
|
||||
state: present
|
||||
definition:
|
||||
apiVersion: traefik.io/v1alpha1
|
||||
kind: Middleware
|
||||
metadata:
|
||||
name: crowdsec
|
||||
namespace: kube-system
|
||||
spec:
|
||||
plugin:
|
||||
crowdsec-bouncer:
|
||||
enabled: true
|
||||
logLevel: DEBUG
|
||||
crowdsecMode: stream
|
||||
crowdsecLapiScheme: http
|
||||
crowdsecLapiHost: crowdsec-service.tools.svc.cluster.local:8080
|
||||
crowdsecLapiKey: "{{ bouncer_key_result.stdout_lines[2].strip() }}"
|
||||
htttTimeoutSeconds: 60
|
||||
crowdsecAppsecEnabled: false
|
||||
crowdsecAppsecHost: crowdsec:7422
|
||||
crowdsecAppsecFailureBlock: true
|
||||
crowdsecAppsecUnreachableBlock: true
|
||||
forwardedHeadersTrustedIPs:
|
||||
- 10.0.10.23/32
|
||||
- 10.0.20.0/24
|
||||
clientTrustedIPs:
|
||||
- 192.168.1.0/24
|
||||
- 10.42.0.0/16
|
||||
captchaProvider: turnstile
|
||||
captchaSiteKey: "{{ crowdsec_captcha_secret.resources[0].data.sitekey | b64decode }}"
|
||||
captchaSecretKey: "{{ crowdsec_captcha_secret.resources[0].data.secret | b64decode }}"
|
||||
captchaHTMLFilePath: "/data/captcha.html"
|
||||
redisCacheEnabled: true
|
||||
redisCacheHost: "redis.tools:6379"
|
||||
redisCacheDatabase: "0"
|
||||
redisCacheUnreachableBlock: false
|
||||
|
||||
- name: Supprimer les pods crowdsec en état Error pour forcer leur redémarrage
|
||||
ansible.builtin.shell: |
|
||||
kubectl get pods -n tools -l k8s-app=crowdsec \
|
||||
--field-selector=status.phase=Failed -o name | xargs -r kubectl delete -n tools
|
||||
changed_when: false
|
||||
ignore_errors: yes
|
||||
|
||||
- name: Redémarrer traefik pour prendre la nouvelle configuration du middleware
|
||||
block:
|
||||
# ---------------------
|
||||
# Scale to 0
|
||||
# ---------------------
|
||||
- name: Scale Traefik to 0
|
||||
kubernetes.core.k8s_scale:
|
||||
api_version: apps/v1
|
||||
kind: Deployment
|
||||
namespace: kube-system
|
||||
name: traefik
|
||||
replicas: 0
|
||||
rescue:
|
||||
- name: Log failure
|
||||
ansible.builtin.debug:
|
||||
msg: "An error occurred during traefik scale down. Traefik will still be scaled back up."
|
||||
|
||||
always:
|
||||
# ---------------------
|
||||
# Ensure Traefik is scaled back to 1 NO MATTER WHAT
|
||||
# ---------------------
|
||||
- name: Ensure Traefik is scaled back to 1
|
||||
kubernetes.core.k8s_scale:
|
||||
api_version: apps/v1
|
||||
kind: Deployment
|
||||
namespace: kube-system
|
||||
name: traefik
|
||||
replicas: 1
|
||||
wait: yes
|
||||
wait_timeout: 300
|
||||
@@ -0,0 +1,18 @@
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<meta charset="UTF-8" />
|
||||
<title>Captcha verification</title>
|
||||
<script src="https://challenges.cloudflare.com/turnstile/v0/api.js" async defer></script>
|
||||
</head>
|
||||
<body>
|
||||
<form method="POST">
|
||||
<div class="cf-turnstile"
|
||||
data-sitekey="{{ crowdsec_captcha_secret.resources[0].data.sitekey | b64decode }}"
|
||||
data-theme="auto"
|
||||
data-size="normal">
|
||||
</div>
|
||||
<button type="submit">Valider</button>
|
||||
</form>
|
||||
</body>
|
||||
</html>
|
||||
@@ -2,7 +2,7 @@ vault_unseal_keys_path: ~/.arcodange/cluster-keys.json
|
||||
vault_unseal_keys_shares: 1
|
||||
vault_unseal_keys_key_threshold: 1 # keys_key_threshold <= keys_shares
|
||||
|
||||
vault_address: https://vault.arcodange.duckdns.org
|
||||
vault_address: https://vault.arcodange.lab
|
||||
|
||||
vault_oidc_gitea_setupGiteaAppJS: '{{ role_path }}/files/playwright_setupGiteaApp.js'
|
||||
|
||||
|
||||
@@ -19,7 +19,7 @@ variable "admin_email" {
|
||||
}
|
||||
variable "gitea_app" {
|
||||
type = object({
|
||||
url = optional(string, "https://gitea.arcodange.duckdns.org/")
|
||||
url = optional(string, "https://gitea.arcodange.lab")
|
||||
id = string
|
||||
secret = string
|
||||
description = optional(string, "Arcodange Gitea Auth")
|
||||
@@ -39,10 +39,10 @@ variable "gitea_admin_token" {
|
||||
sensitive = true
|
||||
}
|
||||
|
||||
# kubectl -n kube-system exec $(kubectl -n kube-system get pod -l app.kubernetes.io/name=traefik -o jsonpath="{.items[0]['.metadata.name']}") -- cat /data/acme.json | jq '(.letsencrypt.Certificates | map(select(.domain.main=="arcodange.duckdns.org")))[0]' | jq '.certificate' -r | base64 -d | openssl x509
|
||||
# variable "ca_pem" {
|
||||
# type = string
|
||||
# }
|
||||
# same as vault CA
|
||||
variable "ca_pem" {
|
||||
type = string
|
||||
}
|
||||
terraform {
|
||||
required_providers {
|
||||
vault = {
|
||||
@@ -63,10 +63,10 @@ resource "vault_jwt_auth_backend" "gitea" {
|
||||
path = "gitea"
|
||||
type = "oidc"
|
||||
oidc_discovery_url = var.gitea_app.url
|
||||
# oidc_discovery_ca_pem = var.ca_pem
|
||||
oidc_discovery_ca_pem = file(var.ca_pem)
|
||||
oidc_client_id = var.gitea_app.id
|
||||
oidc_client_secret = var.gitea_app.secret
|
||||
bound_issuer = var.gitea_app.url
|
||||
bound_issuer = trimsuffix(var.gitea_app.url, "/")
|
||||
|
||||
tune {
|
||||
allowed_response_headers = []
|
||||
@@ -91,7 +91,8 @@ resource "vault_jwt_auth_backend_role" "gitea" {
|
||||
allowed_redirect_uris = [
|
||||
"http://localhost:8250/oidc/callback", # for command line login
|
||||
"${var.vault_address}/ui/vault/auth/gitea/oidc/callback",
|
||||
"https://webapp.arcodange.duckdns.org/oauth-callback",
|
||||
"https://webapp.arcodange.fr/oauth-callback",
|
||||
"https://webapp.arcodange.lab/oauth-callback",
|
||||
]
|
||||
}
|
||||
|
||||
@@ -101,8 +102,8 @@ resource "vault_jwt_auth_backend" "gitea_jwt" {
|
||||
path = "gitea_jwt"
|
||||
type = "jwt"
|
||||
oidc_discovery_url = var.gitea_app.url
|
||||
# oidc_discovery_ca_pem = var.ca_pem
|
||||
bound_issuer = var.gitea_app.url
|
||||
oidc_discovery_ca_pem = file(var.ca_pem)
|
||||
bound_issuer = trimsuffix(var.gitea_app.url, "/")
|
||||
|
||||
tune {
|
||||
allowed_response_headers = []
|
||||
@@ -166,7 +167,7 @@ resource "vault_kv_secret" "google_credentials" {
|
||||
path = "${vault_mount.kvv1.path}/google/credentials"
|
||||
data_json = jsonencode(
|
||||
{
|
||||
credentials = file("~/.config/gcloud/application_default_credentials.json")
|
||||
credentials = file("/root/.config/gcloud/application_default_credentials.json")
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ const username = process.env.GITEA_USER;
|
||||
const password = process.env.GITEA_PASSWORD;
|
||||
const debug = Boolean(process.env.DEBUG);
|
||||
const vaultAddress = process.env.VAULT_ADDRESS || 'http://localhost:8200';
|
||||
const giteaAddress = process.env.GITEA_ADDRESS || 'https://gitea.arcodange.duckdns.org';
|
||||
const giteaAddress = process.env.GITEA_ADDRESS || 'https://gitea.arcodange.lab';
|
||||
|
||||
if (!username || !password) {
|
||||
console.error('Veuillez définir les variables d\'environnement GITEA_USER et GITEA_PASSWORD.');
|
||||
@@ -22,7 +22,7 @@ const browser = await chromium.launch({
|
||||
log: (name, severity, message, args) => console.warn(`${severity}| ${name} :: ${message} __ ${args}`)
|
||||
},
|
||||
});
|
||||
const context = await browser.newContext({locale: "gb-GB"});
|
||||
const context = await browser.newContext({locale: "gb-GB", ignoreHTTPSErrors: true}); // Using self signed cert - could improve with NODE_EXTRA_CA_CERTS env variable
|
||||
const page = await context.newPage();
|
||||
|
||||
async function doLogin() {
|
||||
@@ -75,7 +75,8 @@ async function setupApp() {
|
||||
await applicationsPanel.locator('textarea[name="redirect_uris"]').fill([
|
||||
'http://localhost:8250/oidc/callback', // for command line login
|
||||
`${vaultAddress}/ui/vault/auth/gitea/oidc/callback`,
|
||||
'https://webapp.arcodange.duckdns.org/oauth-callback',
|
||||
'https://webapp.arcodange.lab/oauth-callback',
|
||||
'https://webapp.arcodange.fr/oauth-callback',
|
||||
].join('\n'));
|
||||
await applicationsPanel.locator('form[action="/-/admin/applications/oauth2"] > button').dblclick()
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
GITEA_USER: '{{ gitea_admin_user }}'
|
||||
GITEA_PASSWORD: '{{ gitea_admin_password }}'
|
||||
VAULT_ADDRESS: '{{ vault_address }}'
|
||||
NODE_EXTRA_CA_CERTS: ''
|
||||
|
||||
- include_role:
|
||||
name: arcodange.factory.playwright
|
||||
@@ -23,6 +24,37 @@
|
||||
|
||||
volume_name: tofu-{{ ansible_date_time.iso8601.replace(':','-') }}
|
||||
|
||||
- name: Check SSL certificate for Gitea
|
||||
shell: >-
|
||||
openssl s_client -connect gitea.arcodange.lab:443 -CAfile /etc/ssl/certs/arcodange-root.pem -servername gitea.arcodange.lab < /dev/null 2>&1 | grep -E "Verify return code:|subject=|issuer="
|
||||
register: ssl_check
|
||||
ignore_errors: true
|
||||
|
||||
- name: Debug SSL certificate check
|
||||
debug:
|
||||
var: ssl_check.stdout_lines
|
||||
|
||||
|
||||
|
||||
# WARNING: this disables AND wipes ALL gitea_cicd_* per-app JWT roles
# (created by tools/hashicorp-vault/iac/) every time it runs. Default is OFF
# to preserve those roles across normal ansible runs; opt-in only when you
# really want to rebuild the OIDC backend from scratch (e.g. config drift on
# bound_issuer or similar).
- name: Delete existing Gitea OIDC backends if they exist
  include_tasks: vault_cmd.yml
  vars:
    vault_cmd: vault auth disable {{ backend_name }}
    vault_cmd_can_fail: true
    vault_cmd_json_attr: ''
    vault_cmd_output_var: false
  loop:
    - gitea
    - gitea_jwt
  loop_control:
    loop_var: backend_name
  when: vault_oidc_force_reset | default(false) | bool
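Since `vault_oidc_force_reset` defaults to false, a rebuild has to be requested explicitly for a single run; a hypothetical way to do that (only the variable name comes from the task above, the vars-file name is made up):

```yaml
# reset.yml -- hypothetical extra-vars file, passed with --extra-vars "@reset.yml"
# Setting this wipes and recreates the gitea / gitea_jwt auth backends on the next run.
vault_oidc_force_reset: true
```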
|
||||
|
||||
- name: use tofu to provision vault
|
||||
block:
|
||||
- shell: docker volume create {{ volume_name }}
|
||||
@@ -31,6 +63,8 @@
|
||||
-v {{ volume_name }}:/tofu -w /tofu
|
||||
-v {{ role_path }}/files/hashicorp_vault.tf:/tofu/hashicorp_vault.tf
|
||||
-v ~/.config/gcloud:/root/.config/gcloud
|
||||
-v /etc/ssl/certs/arcodange-root.pem:/etc/ssl/custom/arcodange-root.pem:ro
|
||||
-e VAULT_CACERT=/etc/ssl/custom/arcodange-root.pem
|
||||
--entrypoint=''
|
||||
ghcr.io/opentofu/opentofu:latest
|
||||
{{ command }}
|
||||
@@ -44,6 +78,7 @@
|
||||
# -var='vault_token={{ vault_root_token }}'
|
||||
# -var='postgres_admin_credentials={{ postgres_admin_credentials | to_json }}'
|
||||
# -var='gitea_admin_token={{ gitea_admin_token }}'
|
||||
# -var="ca_pem=/etc/ssl/custom/arcodange-root.pem"
|
||||
- >-
|
||||
tofu apply -auto-approve -no-color
|
||||
-var='gitea_app={{ gitea_app | to_json }}'
|
||||
@@ -51,6 +86,7 @@
|
||||
-var='vault_token={{ vault_root_token }}'
|
||||
-var='postgres_admin_credentials={{ postgres_admin_credentials | to_json }}'
|
||||
-var='gitea_admin_token={{ gitea_admin_token }}'
|
||||
-var="ca_pem=/etc/ssl/custom/arcodange-root.pem"
|
||||
loop_control:
|
||||
loop_var: command
|
||||
extended: true
|
||||
@@ -71,8 +107,28 @@
|
||||
gitea_secret_name: vault_oauth__sh_b64
|
||||
gitea_secret_value: >-
|
||||
{{ lookup('ansible.builtin.template', 'oidc_jwt_token.sh.j2', template_vars = {
|
||||
'GITEA_BASE_URL': 'https://gitea.arcodange.duckdns.org',
|
||||
'GITEA_BASE_URL': 'https://gitea.arcodange.lab',
|
||||
'OIDC_CLIENT_ID': gitea_app.id,
|
||||
'OIDC_CLIENT_SECRET': gitea_app.secret,
|
||||
}) | b64encode }}
|
||||
gitea_owner_type: 'org' # value != 'user'
|
||||
gitea_owner_type: 'org' # value != 'user'
|
||||
|
||||
# Also propagate the same secret to user-owned namespaces. Gitea Action secrets
|
||||
# are scoped per owner, so repos under a user account cannot read org-level
|
||||
# secrets. Extend this list if other personal-namespace apps need vault auth.
|
||||
- name: Propagate vault_oauth__sh_b64 to user-owned namespaces
|
||||
include_role:
|
||||
name: arcodange.factory.gitea_secret
|
||||
vars:
|
||||
gitea_secret_name: vault_oauth__sh_b64
|
||||
gitea_secret_value: >-
|
||||
{{ lookup('ansible.builtin.template', 'oidc_jwt_token.sh.j2', template_vars = {
|
||||
'GITEA_BASE_URL': 'https://gitea.arcodange.lab',
|
||||
'OIDC_CLIENT_ID': gitea_app.id,
|
||||
'OIDC_CLIENT_SECRET': gitea_app.secret,
|
||||
}) | b64encode }}
|
||||
gitea_owner_type: 'user'
|
||||
gitea_owner_name: '{{ item }}'
|
||||
loop: '{{ gitea_secret_propagation_users }}'
|
||||
loop_control:
|
||||
label: '{{ item }}'
|
||||
@@ -4,10 +4,10 @@ set -eu
|
||||
# Variables à ajuster selon ta configuration
|
||||
CLIENT_ID="{{ OIDC_CLIENT_ID }}"
|
||||
CLIENT_SECRET="{{ OIDC_CLIENT_SECRET }}"
|
||||
REDIRECT_URI="{{ OIDC_CLIENT_CALLBACK | default('https://webapp.arcodange.duckdns.org/oauth-callback') }}" # Redirige ici après l'authentification
|
||||
AUTH_URL="{{ GITEA_BASE_URL | default('https://gitea.arcodange.duckdns.org') }}/login/oauth/authorize"
|
||||
TOKEN_URL="{{ GITEA_BASE_URL | default('https://gitea.arcodange.duckdns.org') }}/login/oauth/access_token"
|
||||
ISSUER="https://gitea.arcodange.duckdns.org/"
|
||||
REDIRECT_URI="{{ OIDC_CLIENT_CALLBACK | default('https://webapp.arcodange.lab/oauth-callback') }}" # Redirige ici après l'authentification
|
||||
AUTH_URL="{{ GITEA_BASE_URL | default('https://gitea.arcodange.lab') }}/login/oauth/authorize"
|
||||
TOKEN_URL="{{ GITEA_BASE_URL | default('https://gitea.arcodange.lab') }}/login/oauth/access_token"
|
||||
ISSUER="https://gitea.arcodange.lab/"
|
||||
# SCOPE="openid email profile groups" # Scope que tu souhaites obtenir - profile groups
|
||||
SCOPE="email openid read:user" # Scope que tu souhaites obtenir - profile groups
|
||||
set +u
|
||||
@@ -26,7 +26,7 @@ poll_state() {
|
||||
#echo "Tentative $attempt/$MAX_ATTEMPTS: Requête à l'endpoint /retrieve pour state=$STATE..."
|
||||
|
||||
# Effectuer la requête GET
|
||||
RESPONSE=$(curl -s -w "%{http_code}" -o /tmp/response_body "https://webapp.arcodange.duckdns.org/retrieve?state=$STATE")
|
||||
RESPONSE=$(curl -s -w "%{http_code}" -o /tmp/response_body "https://webapp.arcodange.lab/retrieve?state=$STATE")
|
||||
HTTP_CODE=$(tail -n1 <<< "$RESPONSE")
|
||||
|
||||
if [ "$HTTP_CODE" == "200" ]; then
|
||||
@@ -50,6 +50,9 @@ poll_state() {
|
||||
return 1
|
||||
}
|
||||
|
||||
# 0. Installer le certificat arcodange.lab (droits sudo)
|
||||
# curl https://ssl-ca.arcodange.lab:8443/roots.pem -ks > /usr/local/share/ca-certificates/arcodange-root.crt && update-ca-certificates 2>/dev/null >/dev/null && export VAULT_CACERT=/usr/local/share/ca-certificates/arcodange-root.crt || echo "couldn't install self signed .crt" >&2
|
||||
|
||||
# 1. Rediriger l'utilisateur vers l'URL d'authentification
|
||||
echo "Ouvrez le lien suivant dans votre navigateur pour vous authentifier dans Gitea:"
|
||||
echo "$AUTH_URL?client_id=$CLIENT_ID&redirect_uri=$REDIRECT_URI&response_type=code&scope=$(sed 's/ /%20/g' <<<$SCOPE)&state=$STATE"
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# to see generated tokens
|
||||
# go to https://gitea.arcodange.duckdns.org/user/settings/applications
|
||||
# go to https://gitea.arcodange.lab/user/settings/applications
|
||||
|
||||
- when: >-
|
||||
lookup('ansible.builtin.varnames', '^' ~ gitea_token_fact_name ~ '$') | length == 0
|
||||
|
||||
@@ -7,7 +7,7 @@ const username = process.env.GITEA_USER;
|
||||
const password = process.env.GITEA_PASSWORD;
|
||||
const debug = Boolean(process.env.DEBUG);
|
||||
const vaultAddress = process.env.VAULT_ADDRESS || 'http://localhost:8200';
|
||||
const giteaAddress = process.env.GITEA_ADDRESS || 'https://gitea.arcodange.duckdns.org';
|
||||
const giteaAddress = process.env.GITEA_ADDRESS || 'https://gitea.arcodange.lab';
|
||||
|
||||
if (!username || !password) {
|
||||
console.error('Veuillez définir les variables d\'environnement GITEA_USER et GITEA_PASSWORD.');
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
kubectl -n kube-system exec
|
||||
$(kubectl -n kube-system get pod -l app.kubernetes.io/name=traefik
|
||||
-o jsonpath="{.items[0]['.metadata.name']}") --
|
||||
cat /data/acme.json | jq '(.letsencrypt.Certificates | map(select(.domain.main=="*.arcodange.duckdns.org")))[0]'
|
||||
cat /data/acme.json | jq '(.letsencrypt.Certificates | map(select(.domain.main=="*.arcodange.lab")))[0]'
|
||||
| jq '.certificate' -r | base64 -d | openssl x509
|
||||
register: traefik_certs_cmd
|
||||
- set_fact:
|
||||
|
||||
@@ -3,8 +3,9 @@ roles:
|
||||
- name: geerlingguy.docker
|
||||
|
||||
collections:
|
||||
- name: community.general
|
||||
- name: community.docker
|
||||
- name: ansible.posix
|
||||
- name: community.crypto
|
||||
- name: community.docker
|
||||
- name: community.general
|
||||
- name: kubernetes.core
|
||||
- name: git+https://github.com/k3s-io/k3s-ansible.git
|
||||
@@ -1,4 +1,5 @@
|
||||
{{- range $app_name, $app_attr := .Values.gitea_applications -}}
|
||||
{{- $org := default "arcodange-org" $app_attr.org -}}
|
||||
---
|
||||
apiVersion: argoproj.io/v1alpha1
|
||||
kind: Application
|
||||
@@ -14,16 +15,20 @@ metadata:
|
||||
spec:
|
||||
project: default
|
||||
source:
|
||||
repoURL: https://gitea.arcodange.duckdns.org/arcodange-org/{{ $app_name }}
|
||||
repoURL: https://gitea.arcodange.lab/{{ $org }}/{{ $app_name }}
|
||||
targetRevision: HEAD
|
||||
path: chart
|
||||
destination:
|
||||
server: https://kubernetes.default.svc
|
||||
namespace: {{ $app_name }}
|
||||
syncPolicy:
|
||||
{{- if $app_attr.syncPolicy }}
|
||||
{{- toYaml $app_attr.syncPolicy | nindent 4 }}
|
||||
{{- else }}
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
{{- end }}
|
||||
syncOptions:
|
||||
- CreateNamespace=true
|
||||
{{ end }}
|
||||
@@ -6,19 +6,30 @@ gitea_applications:
|
||||
annotations: {}
|
||||
tools:
|
||||
annotations: {}
|
||||
syncPolicy:
|
||||
automated:
|
||||
prune: true
|
||||
selfHeal: true
|
||||
webapp:
|
||||
annotations: {}
|
||||
annotations:
|
||||
argocd-image-updater.argoproj.io/image-list: webapp=gitea.arcodange.lab/arcodange-org/webapp:latest
|
||||
argocd-image-updater.argoproj.io/webapp.update-strategy: digest
|
||||
erp:
|
||||
annotations: {}
|
||||
cms:
|
||||
annotations:
|
||||
argocd-image-updater.argoproj.io/image-list: cms=gitea.arcodange.duckdns.org/arcodange-org/cms:latest
|
||||
argocd-image-updater.argoproj.io/image-list: cms=gitea.arcodange.lab/arcodange-org/cms:latest
|
||||
argocd-image-updater.argoproj.io/cms.update-strategy: digest
|
||||
dance-lessons-coach:
|
||||
org: arcodange
|
||||
annotations:
|
||||
argocd-image-updater.argoproj.io/image-list: dance-lessons-coach=gitea.arcodange.lab/arcodange/dance-lessons-coach:latest
|
||||
argocd-image-updater.argoproj.io/dance-lessons-coach.update-strategy: digest
|
||||
|
||||
argocd_image_updater_chart_values:
|
||||
config:
|
||||
argocd:
|
||||
grpcWeb: false
|
||||
serverAddress: "https://argocd.arcodange.duckdns.org/"
|
||||
serverAddress: "https://argocd.arcodange.lab/"
|
||||
insecure: true
|
||||
plaintext: true
|
||||
@@ -9,7 +9,7 @@
>The unsealKey, the initial vaultRootToken and the terraform backend authentication are, for now, configured on the ansible controller (MacBook Pro).

>[!NOTE]
> Vault is deployed via [argo cd](https://gitea.arcodange.duckdns.org/arcodange-org/tools/src/branch/main/hashicorp-vault)
> Vault is deployed via [argo cd](https://gitea.arcodange.lab/arcodange-org/tools/src/branch/main/hashicorp-vault)

```mermaid
%%{init: { 'logLevel': 'debug', 'theme': 'base',
|
||||
|
||||
@@ -71,6 +71,7 @@ module "cf_arcodange_cms_token" {
|
||||
"zone:Zone Settings Write",
|
||||
"zone:DNS Write",
|
||||
"account:Cloudflare Tunnel Write",
|
||||
"account:Turnstile Sites Write",
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,12 +24,12 @@ terraform {
|
||||
}
|
||||
|
||||
provider "gitea" { # https://registry.terraform.io/providers/go-gitea/gitea/latest/docs
|
||||
base_url = "https://gitea.arcodange.duckdns.org"
|
||||
base_url = "https://gitea.arcodange.lab"
|
||||
# use GITEA_TOKEN env var
|
||||
}
|
||||
|
||||
provider "vault" {
|
||||
address = "https://vault.arcodange.duckdns.org"
|
||||
address = "https://vault.arcodange.lab"
|
||||
auth_login_jwt { # TERRAFORM_VAULT_AUTH_JWT environment variable
|
||||
mount = "gitea_jwt"
|
||||
role = "gitea_cicd"
|
||||
|
||||
@@ -29,7 +29,7 @@ provider "postgresql" {
|
||||
}
|
||||
|
||||
provider vault {
|
||||
address = "https://vault.arcodange.duckdns.org"
|
||||
address = "https://vault.arcodange.lab"
|
||||
auth_login_jwt { # TERRAFORM_VAULT_AUTH_JWT environment variable
|
||||
mount = "gitea_jwt"
|
||||
role = "gitea_cicd"
|
||||
|
||||
@@ -1,4 +1,7 @@
|
||||
applications = [
|
||||
"webapp",
|
||||
"erp",
|
||||
"crowdsec",
|
||||
"plausible",
|
||||
"dance-lessons-coach",
|
||||
]
|
||||
11
pyproject.toml
Normal file
@@ -0,0 +1,11 @@
[project]
name = "arcodange-factory"
version = "0.0.0"
description = "Ansible automation for the Arcodange factory homelab"
requires-python = ">=3.12,<3.13"
dependencies = [
    "ansible-core",
    "kubernetes",
    "jmespath",
    "dnspython",
]
64
ssl.md
Normal file
@@ -0,0 +1,64 @@
# Distributing the Step-CA Root CA

This guide explains how to install the Step-CA root certificate on every device so that TLS works with the internal PKI.

---

## Prerequisites

- The root certificate is fetched from `step_ca_primary` (pi1): `/home/step/.step/certs/root_ca.crt`
- The target machines are:
  - pi1, pi2, pi3 (Raspbian / Debian)
  - localhost (Mac)

---

## 1. Copy the certificate to the RPis

```bash
scp pi1:/home/step/.step/certs/root_ca.crt /tmp/root_ca.crt
```

Then on each Pi (idempotent):
```bash
for pi in pi1 pi2 pi3
do
  ssh $pi "sudo cp /home/step/.step/certs/root_ca.crt /usr/local/share/ca-certificates/arcodange-root.crt && sudo chmod 644 /usr/local/share/ca-certificates/arcodange-root.crt && sudo update-ca-certificates"
  ssh $pi 'sudo apt install -y libnss3-tools && certutil -d sql:/home/pi/.pki/nssdb -A -t "C,," -n "arcodange-root" -i /usr/local/share/ca-certificates/arcodange-root.crt'
done
```

Quick check on each Pi:
```bash
ssh pi1 "sudo openssl verify /usr/local/share/ca-certificates/arcodange-root.crt"
ssh pi2 "sudo openssl verify /usr/local/share/ca-certificates/arcodange-root.crt"
ssh pi3 "sudo openssl verify /usr/local/share/ca-certificates/arcodange-root.crt"
```

---

## 2. Copy the certificate to the Mac (localhost)

```bash
scp pi1:/home/step/.step/certs/root_ca.crt /tmp/root_ca.crt
sudo security add-trusted-cert -d -r trustRoot -k /Library/Keychains/System.keychain /tmp/root_ca.crt
sudo scp pi@pi1:/etc/ssl/certs/arcodange-root.pem /etc/ssl/certs/arcodange-root.pem
```

Verification:
```bash
security verify-cert -c /tmp/root_ca.crt
```

---

## 3. Restart TLS services if needed

On the RPis (optional, e.g. if you use Docker, containerd or k3s).

---

## 4. Alternative: fetch the root directly from the CA

Another handy command:
> `curl https://ssl-ca.arcodange.lab:8443/roots.pem -ks > /usr/local/share/ca-certificates/arcodange-root.crt && update-ca-certificates 2>/dev/null >/dev/null`
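The per-Pi loop in step 1 can also be automated; below is a minimal Ansible sketch, not taken from the repository (the play name, the `raspberries` group and the use of `slurp`/`copy` are assumptions), that distributes the same root certificate idempotently:

```yaml
# Sketch: fetch root_ca.crt once from pi1 and install it into each Pi's trust store.
- name: Distribute the Step-CA root certificate to the RPis
  hosts: raspberries
  become: yes
  tasks:
    - name: Read root_ca.crt from the primary step-ca node
      ansible.builtin.slurp:
        src: /home/step/.step/certs/root_ca.crt
      delegate_to: pi1
      run_once: true
      register: step_root_ca

    - name: Install the root certificate into the system trust store
      ansible.builtin.copy:
        content: "{{ step_root_ca.content | b64decode }}"
        dest: /usr/local/share/ca-certificates/arcodange-root.crt
        mode: '0644'
      notify: update ca certificates

  handlers:
    - name: update ca certificates
      ansible.builtin.command: update-ca-certificates
```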
342
uv.lock
generated
Normal file
@@ -0,0 +1,342 @@
|
||||
version = 1
|
||||
revision = 3
|
||||
requires-python = "==3.12.*"
|
||||
|
||||
[[package]]
|
||||
name = "ansible-core"
|
||||
version = "2.20.5"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "cryptography" },
|
||||
{ name = "jinja2" },
|
||||
{ name = "packaging" },
|
||||
{ name = "pyyaml" },
|
||||
{ name = "resolvelib" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/9d/ec/690cc73e38c3546eabc8ef4118e0d7be1758a598bc23eed3e24ca1f346a7/ansible_core-2.20.5.tar.gz", hash = "sha256:82e3049d95e6e02e5d20d4a5a8e10533a55e0cc52e878e4cf77166c45410f16f", size = 3339511, upload-time = "2026-04-21T00:48:27.175Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/f9/e1/4454505e725b84ae670229565dfc20c4075480199647bf4874cf337c560e/ansible_core-2.20.5-py3-none-any.whl", hash = "sha256:ff6ff15c6a37fda07dc7400207e17e93727b24173ca48c068b3311a50d75ecc3", size = 2416843, upload-time = "2026-04-21T00:48:25.413Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arcodange-factory"
|
||||
version = "0.0.0"
|
||||
source = { virtual = "." }
|
||||
dependencies = [
|
||||
{ name = "ansible-core" },
|
||||
{ name = "dnspython" },
|
||||
{ name = "jmespath" },
|
||||
{ name = "kubernetes" },
|
||||
]
|
||||
|
||||
[package.metadata]
|
||||
requires-dist = [
|
||||
{ name = "ansible-core" },
|
||||
{ name = "dnspython" },
|
||||
{ name = "jmespath" },
|
||||
{ name = "kubernetes" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "certifi"
|
||||
version = "2026.4.22"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/25/ee/6caf7a40c36a1220410afe15a1cc64993a1f864871f698c0f93acb72842a/certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580", size = 137077, upload-time = "2026-04-22T11:26:11.191Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/22/30/7cd8fdcdfbc5b869528b079bfb76dcdf6056b1a2097a662e5e8c04f42965/certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a", size = 135707, upload-time = "2026-04-22T11:26:09.372Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cffi"
|
||||
version = "2.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "pycparser", marker = "implementation_name != 'PyPy'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "charset-normalizer"
|
||||
version = "3.4.7"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271, upload-time = "2026-04-02T09:28:39.342Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/eb/4fc8d0a7110eb5fc9cc161723a34a8a6c200ce3b4fbf681bc86feee22308/charset_normalizer-3.4.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46", size = 311328, upload-time = "2026-04-02T09:26:24.331Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/f8/e3/0fadc706008ac9d7b9b5be6dc767c05f9d3e5df51744ce4cc9605de7b9f4/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2", size = 208061, upload-time = "2026-04-02T09:26:25.568Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/42/f0/3dd1045c47f4a4604df85ec18ad093912ae1344ac706993aff91d38773a2/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b", size = 229031, upload-time = "2026-04-02T09:26:26.865Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/dc/67/675a46eb016118a2fbde5a277a5d15f4f69d5f3f5f338e5ee2f8948fcf43/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a", size = 225239, upload-time = "2026-04-02T09:26:28.044Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/4b/f8/d0118a2f5f23b02cd166fa385c60f9b0d4f9194f574e2b31cef350ad7223/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5649fd1c7bade02f320a462fdefd0b4bd3ce036065836d4f42e0de958038e116", size = 216589, upload-time = "2026-04-02T09:26:29.239Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b1/f1/6d2b0b261b6c4ceef0fcb0d17a01cc5bc53586c2d4796fa04b5c540bc13d/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:203104ed3e428044fd943bc4bf45fa73c0730391f9621e37fe39ecf477b128cb", size = 202733, upload-time = "2026-04-02T09:26:30.5Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/6f/c0/7b1f943f7e87cc3db9626ba17807d042c38645f0a1d4415c7a14afb5591f/charset_normalizer-3.4.7-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:298930cec56029e05497a76988377cbd7457ba864beeea92ad7e844fe74cd1f1", size = 212652, upload-time = "2026-04-02T09:26:31.709Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/38/dd/5a9ab159fe45c6e72079398f277b7d2b523e7f716acc489726115a910097/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:708838739abf24b2ceb208d0e22403dd018faeef86ddac04319a62ae884c4f15", size = 211229, upload-time = "2026-04-02T09:26:33.282Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d5/ff/531a1cad5ca855d1c1a8b69cb71abfd6d85c0291580146fda7c82857caa1/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:0f7eb884681e3938906ed0434f20c63046eacd0111c4ba96f27b76084cd679f5", size = 203552, upload-time = "2026-04-02T09:26:34.845Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/c1/4c/a5fb52d528a8ca41f7598cb619409ece30a169fbdf9cdce592e53b46c3a6/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:4dc1e73c36828f982bfe79fadf5919923f8a6f4df2860804db9a98c48824ce8d", size = 230806, upload-time = "2026-04-02T09:26:36.152Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/59/7a/071feed8124111a32b316b33ae4de83d36923039ef8cf48120266844285b/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7", size = 212316, upload-time = "2026-04-02T09:26:37.672Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/fd/35/f7dba3994312d7ba508e041eaac39a36b120f32d4c8662b8814dab876431/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464", size = 227274, upload-time = "2026-04-02T09:26:38.93Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/8a/2d/a572df5c9204ab7688ec1edc895a73ebded3b023bb07364710b05dd1c9be/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49", size = 218468, upload-time = "2026-04-02T09:26:40.17Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/86/eb/890922a8b03a568ca2f336c36585a4713c55d4d67bf0f0c78924be6315ca/charset_normalizer-3.4.7-cp312-cp312-win32.whl", hash = "sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c", size = 148460, upload-time = "2026-04-02T09:26:41.416Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/35/d9/0e7dffa06c5ab081f75b1b786f0aefc88365825dfcd0ac544bdb7b2b6853/charset_normalizer-3.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6", size = 159330, upload-time = "2026-04-02T09:26:42.554Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/9e/5d/481bcc2a7c88ea6b0878c299547843b2521ccbc40980cb406267088bc701/charset_normalizer-3.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d", size = 147828, upload-time = "2026-04-02T09:26:44.075Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958, upload-time = "2026-04-02T09:28:37.794Z" },
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "48.0.0"
|
||||
source = { registry = "https://pypi.org/simple" }
|
||||
dependencies = [
|
||||
{ name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
|
||||
]
|
||||
sdist = { url = "https://files.pythonhosted.org/packages/9f/a9/db8f313fdcd85d767d4973515e1db101f9c71f95fced83233de224673757/cryptography-48.0.0.tar.gz", hash = "sha256:5c3932f4436d1cccb036cb0eaef46e6e2db91035166f1ad6505c3c9d5a635920", size = 832984, upload-time = "2026-05-04T22:59:38.133Z" }
|
||||
wheels = [
|
||||
{ url = "https://files.pythonhosted.org/packages/df/3d/01f6dd9190170a5a241e0e98c2d04be3664a9e6f5b9b872cde63aff1c3dd/cryptography-48.0.0-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:0c558d2cdffd8f4bbb30fc7134c74d2ca9a476f830bb053074498fbc86f41ed6", size = 8001587, upload-time = "2026-05-04T22:57:36.803Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/b2/6e/e90527eef33f309beb811cf7c982c3aeffcce8e3edb178baa4ca3ae4a6fa/cryptography-48.0.0-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f5333311663ea94f75dd408665686aaf426563556bb5283554a3539177e03b8c", size = 4690433, upload-time = "2026-05-04T22:57:40.373Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/90/04/673510ed51ddff56575f306cf1617d80411ee76831ccd3097599140efdfe/cryptography-48.0.0-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7995ef305d7165c3f11ae07f2517e5a4f1d5c18da1376a0a9ed496336b69e5f3", size = 4710620, upload-time = "2026-05-04T22:57:42.935Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/14/d5/e9c4ef932c8d800490c34d8bd589d64a31d5890e27ec9e9ad532be893294/cryptography-48.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:40ba1f85eaa6959837b1d51c9767e230e14612eea4ef110ee8854ada22da1bf5", size = 4696283, upload-time = "2026-05-04T22:57:45.294Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/0c/29/174b9dfb60b12d59ecfc6cfa04bc88c21b42a54f01b8aae09bb6e51e4c7f/cryptography-48.0.0-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:369a6348999f94bbd53435c894377b20ab95f25a9065c283570e70150d8abc3c", size = 5296573, upload-time = "2026-05-04T22:57:47.933Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/95/38/0d29a6fd7d0d1373f0c0c88a04ba20e359b257753ac497564cd660fc1d55/cryptography-48.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a0e692c683f4df67815a2d258b324e66f4738bd7a96a218c826dce4f4bd05d8f", size = 4743677, upload-time = "2026-05-04T22:57:50.067Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/30/be/eef653013d5c63b6a490529e0316f9ac14a37602965d4903efed1399f32b/cryptography-48.0.0-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:18349bbc56f4743c8b12dc32e2bccb2cf83ee8b69a3bba74ef8ae857e26b3d25", size = 4330808, upload-time = "2026-05-04T22:57:52.301Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/84/9e/500463e87abb7a0a0f9f256ec21123ecde0a7b5541a15e840ea54551fd81/cryptography-48.0.0-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:7e8eac43dfca5c4cccc6dad9a80504436fca53bb9bc3100a2386d730fbe6b602", size = 4695941, upload-time = "2026-05-04T22:57:54.603Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/e3/dc/7303087450c2ec9e7fbb750e17c2abfbc658f23cbd0e54009509b7cc4091/cryptography-48.0.0-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:9ccdac7d40688ecb5a3b4a604b8a88c8002e3442d6c60aead1db2a89a041560c", size = 5252579, upload-time = "2026-05-04T22:57:57.207Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/d0/c0/7101d3b7215edcdc90c45da544961fd8ed2d6448f77577460fa75a8443f7/cryptography-48.0.0-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:bd72e68b06bb1e96913f97dd4901119bc17f39d4586a5adf2d3e47bc2b9d58b5", size = 4743326, upload-time = "2026-05-04T22:57:59.535Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/ac/d8/5b833bad13016f562ab9d063d68199a4bd121d18458e439515601d3357ec/cryptography-48.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:59baa2cb386c4f0b9905bd6eb4c2a79a69a128408fd31d32ca4d7102d4156321", size = 4826672, upload-time = "2026-05-04T22:58:01.996Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/98/e1/7074eb8bf3c135558c73fc2bcf0f5633f912e6fb87e868a55c454080ef09/cryptography-48.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:9249e3cd978541d665967ac2cb2787fd6a62bddf1e75b3e347a594d7dacf4f74", size = 4972574, upload-time = "2026-05-04T22:58:03.968Z" },
|
||||
{ url = "https://files.pythonhosted.org/packages/04/70/e5a1b41d325f797f39427aa44ef8baf0be500065ab6d8e10369d850d4a4f/cryptography-48.0.0-cp311-abi3-win32.whl", hash = "sha256:9c459db21422be75e2809370b829a87eb37f74cd785fc4aa9ea1e5f43b47cda4", size = 3294868, upload-time = "2026-05-04T22:58:06.467Z" },
{ url = "https://files.pythonhosted.org/packages/f4/ac/8ac51b4a5fc5932eb7ee5c517ba7dc8cd834f0048962b6b352f00f41ebf9/cryptography-48.0.0-cp311-abi3-win_amd64.whl", hash = "sha256:5b012212e08b8dd5edc78ef54da83dd9892fd9105323b3993eff6bea65dc21d7", size = 3817107, upload-time = "2026-05-04T22:58:08.845Z" },
{ url = "https://files.pythonhosted.org/packages/f2/63/61d4a4e1c6b6bab6ce1e213cd36a24c415d90e76d78c5eb8577c5541d2e8/cryptography-48.0.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:58d00498e8933e4a194f3076aee1b4a97dfec1a6da444535755822fe5d8b0b86", size = 7983482, upload-time = "2026-05-04T22:58:43.769Z" },
{ url = "https://files.pythonhosted.org/packages/d5/ac/f5b5995b87770c693e2596559ffafe195b4033a57f14a82268a2842953f3/cryptography-48.0.0-cp39-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:614d0949f4790582d2cc25553abd09dd723025f0c0e7c67376a1d77196743d6e", size = 4683266, upload-time = "2026-05-04T22:58:46.064Z" },
{ url = "https://files.pythonhosted.org/packages/ec/c6/8b14f67e18338fbc4adb76f66c001f5c3610b3e2d1837f268f47a347dbbb/cryptography-48.0.0-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7ce4bfae76319a532a2dc68f82cc32f5676ee792a983187dac07183690e5c66f", size = 4696228, upload-time = "2026-05-04T22:58:48.22Z" },
{ url = "https://files.pythonhosted.org/packages/ea/73/f808fbae9514bd91b47875b003f13e284c8c6bdfd904b7944e803937eec1/cryptography-48.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:2eb992bbd4661238c5a397594c83f5b4dc2bc5b848c365c8f991b6780efcc5c7", size = 4689097, upload-time = "2026-05-04T22:58:50.9Z" },
{ url = "https://files.pythonhosted.org/packages/93/01/d86632d7d28db8ae83221995752eeb6639ffb374c2d22955648cf8d52797/cryptography-48.0.0-cp39-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:22a5cb272895dce158b2cacdfdc3debd299019659f42947dbdac6f32d68fe832", size = 5283582, upload-time = "2026-05-04T22:58:53.017Z" },
{ url = "https://files.pythonhosted.org/packages/02/e1/50edc7a50334807cc4791fc4a0ce7468b4a1416d9138eab358bfc9a3d70b/cryptography-48.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2b4d59804e8408e2fea7d1fbaf218e5ec984325221db76e6a241a9abd6cdd95c", size = 4730479, upload-time = "2026-05-04T22:58:55.611Z" },
{ url = "https://files.pythonhosted.org/packages/6f/af/99a582b1b1641ff5911ac559beb45097cf79efd4ead4657f578ef1af2d47/cryptography-48.0.0-cp39-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:984a20b0f62a26f48a3396c72e4bc34c66e356d356bf370053066b3b6d54634a", size = 4326481, upload-time = "2026-05-04T22:58:57.607Z" },
{ url = "https://files.pythonhosted.org/packages/90/ee/89aa26a06ef0a7d7611788ffd571a7c50e368cc6a4d5eef8b4884e866edb/cryptography-48.0.0-cp39-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:5a5ed8fde7a1d09376ca0b40e68cd59c69fe23b1f9768bd5824f54681626032a", size = 4688713, upload-time = "2026-05-04T22:59:00.077Z" },
{ url = "https://files.pythonhosted.org/packages/70/ba/bcb1b0bb7a33d4c7c0c4d4c7874b4a62ae4f56113a5f4baefa362dfb1f0f/cryptography-48.0.0-cp39-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:8cd666227ef7af430aa5914a9910e0ddd703e75f039cef0825cd0da71b6b711a", size = 5238165, upload-time = "2026-05-04T22:59:02.317Z" },
{ url = "https://files.pythonhosted.org/packages/c9/70/ca4003b1ce5ca3dc3186ada51908c8a9b9ff7d5cab83cc0d43ee14ec144f/cryptography-48.0.0-cp39-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:9071196d81abc88b3516ac8cdfad32e2b66dd4a5393a8e68a961e9161ddc6239", size = 4729947, upload-time = "2026-05-04T22:59:05.255Z" },
{ url = "https://files.pythonhosted.org/packages/44/a0/4ec7cf774207905aef1a8d11c3750d5a1db805eb380ee4e16df317870128/cryptography-48.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1e2d54c8be6152856a36f0882ab231e70f8ec7f14e93cf87db8a2ed056bf160c", size = 4822059, upload-time = "2026-05-04T22:59:07.802Z" },
{ url = "https://files.pythonhosted.org/packages/1e/75/a2e55f99c16fcac7b5d6c1eb19ad8e00799854d6be5ca845f9259eae1681/cryptography-48.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a5da777e32ffed6f85a7b2b3f7c5cbc88c146bfcd0a1d7baf5fcc6c52ee35dd4", size = 4960575, upload-time = "2026-05-04T22:59:09.851Z" },
{ url = "https://files.pythonhosted.org/packages/b8/23/6e6f32143ab5d8b36ca848a502c4bcd477ae75b9e1677e3530d669062578/cryptography-48.0.0-cp39-abi3-win32.whl", hash = "sha256:77a2ccbbe917f6710e05ba9adaa25fb5075620bf3ea6fb751997875aff4ae4bd", size = 3279117, upload-time = "2026-05-04T22:59:12.019Z" },
{ url = "https://files.pythonhosted.org/packages/9d/9a/0fea98a70cf1749d41d738836f6349d97945f7c89433a259a6c2642eefeb/cryptography-48.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:16cd65b9330583e4619939b3a3843eec1e6e789744bb01e7c7e2e62e33c239c8", size = 3792100, upload-time = "2026-05-04T22:59:14.884Z" },
]

[[package]]
name = "dnspython"
version = "2.8.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/8c/8b/57666417c0f90f08bcafa776861060426765fdb422eb10212086fb811d26/dnspython-2.8.0.tar.gz", hash = "sha256:181d3c6996452cb1189c4046c61599b84a5a86e099562ffde77d26984ff26d0f", size = 368251, upload-time = "2025-09-07T18:58:00.022Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ba/5a/18ad964b0086c6e62e2e7500f7edc89e3faa45033c71c1893d34eed2b2de/dnspython-2.8.0-py3-none-any.whl", hash = "sha256:01d9bbc4a2d76bf0db7c1f729812ded6d912bd318d3b1cf81d30c0f845dbf3af", size = 331094, upload-time = "2025-09-07T18:57:58.071Z" },
]

[[package]]
name = "durationpy"
version = "0.10"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/9d/a4/e44218c2b394e31a6dd0d6b095c4e1f32d0be54c2a4b250032d717647bab/durationpy-0.10.tar.gz", hash = "sha256:1fa6893409a6e739c9c72334fc65cca1f355dbdd93405d30f726deb5bde42fba", size = 3335, upload-time = "2025-05-17T13:52:37.26Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b0/0d/9feae160378a3553fa9a339b0e9c1a048e147a4127210e286ef18b730f03/durationpy-0.10-py3-none-any.whl", hash = "sha256:3b41e1b601234296b4fb368338fdcd3e13e0b4fb5b67345948f4f2bf9868b286", size = 3922, upload-time = "2025-05-17T13:52:36.463Z" },
]

[[package]]
name = "idna"
version = "3.13"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/ce/cc/762dfb036166873f0059f3b7de4565e1b5bc3d6f28a414c13da27e442f99/idna-3.13.tar.gz", hash = "sha256:585ea8fe5d69b9181ec1afba340451fba6ba764af97026f92a91d4eef164a242", size = 194210, upload-time = "2026-04-22T16:42:42.314Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5d/13/ad7d7ca3808a898b4612b6fe93cde56b53f3034dcde235acb1f0e1df24c6/idna-3.13-py3-none-any.whl", hash = "sha256:892ea0cde124a99ce773decba204c5552b69c3c67ffd5f232eb7696135bc8bb3", size = 68629, upload-time = "2026-04-22T16:42:40.909Z" },
]

[[package]]
name = "jinja2"
version = "3.1.6"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "markupsafe" },
]
sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115, upload-time = "2025-03-05T20:05:02.478Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" },
]

[[package]]
name = "jmespath"
version = "1.1.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" },
]

[[package]]
name = "kubernetes"
version = "35.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "durationpy" },
{ name = "python-dateutil" },
{ name = "pyyaml" },
{ name = "requests" },
{ name = "requests-oauthlib" },
{ name = "six" },
{ name = "urllib3" },
{ name = "websocket-client" },
]
sdist = { url = "https://files.pythonhosted.org/packages/2c/8f/85bf51ad4150f64e8c665daf0d9dfe9787ae92005efb9a4d1cba592bd79d/kubernetes-35.0.0.tar.gz", hash = "sha256:3d00d344944239821458b9efd484d6df9f011da367ecb155dadf9513f05f09ee", size = 1094642, upload-time = "2026-01-16T01:05:27.76Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/70/05b685ea2dffcb2adbf3cdcea5d8865b7bc66f67249084cf845012a0ff13/kubernetes-35.0.0-py2.py3-none-any.whl", hash = "sha256:39e2b33b46e5834ef6c3985ebfe2047ab39135d41de51ce7641a7ca5b372a13d", size = 2017602, upload-time = "2026-01-16T01:05:25.991Z" },
]

[[package]]
name = "markupsafe"
version = "3.0.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" },
{ url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" },
{ url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" },
{ url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947, upload-time = "2025-09-27T18:36:33.86Z" },
{ url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962, upload-time = "2025-09-27T18:36:35.099Z" },
{ url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760, upload-time = "2025-09-27T18:36:36.001Z" },
{ url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529, upload-time = "2025-09-27T18:36:36.906Z" },
{ url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015, upload-time = "2025-09-27T18:36:37.868Z" },
{ url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540, upload-time = "2025-09-27T18:36:38.761Z" },
{ url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105, upload-time = "2025-09-27T18:36:39.701Z" },
{ url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906, upload-time = "2025-09-27T18:36:40.689Z" },
]

[[package]]
name = "oauthlib"
version = "3.3.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/0b/5f/19930f824ffeb0ad4372da4812c50edbd1434f678c90c2733e1188edfc63/oauthlib-3.3.1.tar.gz", hash = "sha256:0f0f8aa759826a193cf66c12ea1af1637f87b9b4622d46e866952bb022e538c9", size = 185918, upload-time = "2025-06-19T22:48:08.269Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/be/9c/92789c596b8df838baa98fa71844d84283302f7604ed565dafe5a6b5041a/oauthlib-3.3.1-py3-none-any.whl", hash = "sha256:88119c938d2b8fb88561af5f6ee0eec8cc8d552b7bb1f712743136eb7523b7a1", size = 160065, upload-time = "2025-06-19T22:48:06.508Z" },
]

[[package]]
name = "packaging"
version = "26.2"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/d7/f1/e7a6dd94a8d4a5626c03e4e99c87f241ba9e350cd9e6d75123f992427270/packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661", size = 228134, upload-time = "2026-04-24T20:15:23.917Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/df/b2/87e62e8c3e2f4b32e5fe99e0b86d576da1312593b39f47d8ceef365e95ed/packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e", size = 100195, upload-time = "2026-04-24T20:15:22.081Z" },
]

[[package]]
name = "pycparser"
version = "3.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 48172, upload-time = "2026-01-21T14:26:50.693Z" },
]

[[package]]
name = "python-dateutil"
version = "2.9.0.post0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "six" },
]
sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432, upload-time = "2024-03-01T18:36:20.211Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
]

[[package]]
name = "pyyaml"
version = "6.0.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" },
{ url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" },
{ url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" },
{ url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011, upload-time = "2025-09-25T21:32:15.21Z" },
{ url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870, upload-time = "2025-09-25T21:32:16.431Z" },
{ url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089, upload-time = "2025-09-25T21:32:17.56Z" },
{ url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181, upload-time = "2025-09-25T21:32:18.834Z" },
{ url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658, upload-time = "2025-09-25T21:32:20.209Z" },
{ url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003, upload-time = "2025-09-25T21:32:21.167Z" },
{ url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344, upload-time = "2025-09-25T21:32:22.617Z" },
]

[[package]]
name = "requests"
version = "2.33.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "charset-normalizer" },
{ name = "idna" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/5f/a4/98b9c7c6428a668bf7e42ebb7c79d576a1c3c1e3ae2d47e674b468388871/requests-2.33.1.tar.gz", hash = "sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517", size = 134120, upload-time = "2026-03-30T16:09:15.531Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/d7/8e/7540e8a2036f79a125c1d2ebadf69ed7901608859186c856fa0388ef4197/requests-2.33.1-py3-none-any.whl", hash = "sha256:4e6d1ef462f3626a1f0a0a9c42dd93c63bad33f9f1c1937509b8c5c8718ab56a", size = 64947, upload-time = "2026-03-30T16:09:13.83Z" },
]

[[package]]
name = "requests-oauthlib"
version = "2.0.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "oauthlib" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/42/f2/05f29bc3913aea15eb670be136045bf5c5bbf4b99ecb839da9b422bb2c85/requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9", size = 55650, upload-time = "2024-03-22T20:32:29.939Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3b/5d/63d4ae3b9daea098d5d6f5da83984853c1bbacd5dc826764b249fe119d24/requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36", size = 24179, upload-time = "2024-03-22T20:32:28.055Z" },
]

[[package]]
name = "resolvelib"
version = "1.2.1"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/1d/14/4669927e06631070edb968c78fdb6ce8992e27c9ab2cde4b3993e22ac7af/resolvelib-1.2.1.tar.gz", hash = "sha256:7d08a2022f6e16ce405d60b68c390f054efcfd0477d4b9bd019cc941c28fad1c", size = 24575, upload-time = "2025-10-11T01:07:44.582Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/e2/23/c941a0d0353681ca138489983c4309e0f5095dfd902e1357004f2357ddf2/resolvelib-1.2.1-py3-none-any.whl", hash = "sha256:fb06b66c8da04172d9e72a21d7d06186d8919e32ae5ab5cdf5b9d920be805ac2", size = 18737, upload-time = "2025-10-11T01:07:43.081Z" },
]

[[package]]
name = "six"
version = "1.17.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/94/e7/b2c673351809dca68a0e064b6af791aa332cf192da575fd474ed7d6f16a2/six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81", size = 34031, upload-time = "2024-12-04T17:35:28.174Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/b7/ce/149a00dd41f10bc29e5921b496af8b574d8413afcd5e30dfa0ed46c2cc5e/six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274", size = 11050, upload-time = "2024-12-04T17:35:26.475Z" },
]

[[package]]
name = "urllib3"
version = "2.6.3"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/c7/24/5f1b3bdffd70275f6661c76461e25f024d5a38a46f04aaca912426a2b1d3/urllib3-2.6.3.tar.gz", hash = "sha256:1b62b6884944a57dbe321509ab94fd4d3b307075e0c2eae991ac71ee15ad38ed", size = 435556, upload-time = "2026-01-07T16:24:43.925Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
]

[[package]]
name = "websocket-client"
version = "1.9.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/2c/41/aa4bf9664e4cda14c3b39865b12251e8e7d239f4cd0e3cc1b6c2ccde25c1/websocket_client-1.9.0.tar.gz", hash = "sha256:9e813624b6eb619999a97dc7958469217c3176312b3a16a4bd1bc7e08a46ec98", size = 70576, upload-time = "2025-10-07T21:16:36.495Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/34/db/b10e48aa8fff7407e67470363eac595018441cf32d5e1001567a7aeba5d2/websocket_client-1.9.0-py3-none-any.whl", hash = "sha256:af248a825037ef591efbf6ed20cc5faa03d3b47b9e5a2230a529eeee1c1fc3ef", size = 82616, upload-time = "2025-10-07T21:16:34.951Z" },
]