From a3f0586c77e6310c66e25626c6cde169a63891ca Mon Sep 17 00:00:00 2001 From: Gabriel Radureau Date: Tue, 30 Jun 2026 15:53:13 +0200 Subject: [PATCH] feat(backup): skip-if-unchanged + scheduled CronJob in the chart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Builds on the dedicated backup (erp#31). Skip-if-unchanged: each half (DB / documents) carries a content fingerprint at $PREFIX/.fp-{db,docs} and is dumped+uploaded only if it differs from the last run — a quiet ERP day re-uploads nothing. Fingerprint = durable BUSINESS content only: DB = count+max(tms) over tms tables EXCEPT volatile churn (llx_const, llx_user, session/cron); docs EXCLUDE */temp/* (Dolibarr stats cache) — from both the fingerprint and the tar. Proven live: 1st run uploads both, immediate 2nd run skips both (uploaded=0). Automation: the in-container logic moves to chart/files/backup-job.sh (single source of truth, read by the orchestrator AND the chart). New chart/templates/backup-cronjob.yaml renders a daily CronJob + ConfigMap + VaultStaticSecret, gated by backup.enabled (default false). Helm-verified: off by default (0 CronJobs), on renders correctly, env-aware (PREFIX erp/prod vs erp/sandbox), script embedded. Activation (documented): store GCS HMAC creds at the kvv2 path set by backup.vaultS3Path (default erp/backup), grant the erp `auth` Vault role read on it (tools change), set backup.enabled=true. Until then the orchestrator runs on demand. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- chart/files/backup-job.sh | 87 +++++++++++++++++++++++++++++ chart/templates/backup-cronjob.yaml | 77 +++++++++++++++++++++++++ chart/values.yaml | 12 ++++ ops/backup/README.md | 35 ++++++++---- ops/backup/backup-job.sh | 56 ------------------- ops/backup/dolibarr-backup.sh | 2 +- 6 files changed, 202 insertions(+), 67 deletions(-) create mode 100755 chart/files/backup-job.sh create mode 100644 chart/templates/backup-cronjob.yaml delete mode 100755 ops/backup/backup-job.sh diff --git a/chart/files/backup-job.sh b/chart/files/backup-job.sh new file mode 100755 index 0000000..22aff2d --- /dev/null +++ b/chart/files/backup-job.sh @@ -0,0 +1,87 @@ +#!/bin/sh +# In-container backup logic for Dolibarr — the single source of truth shared by the +# manual orchestrator (ops/backup/dolibarr-backup.sh) and the scheduled CronJob +# (chart/templates/backup-cronjob.yaml). Driven entirely by environment: +# BUCKET PREFIX DB PGHOST (config) +# PGUSER PGPASSWORD (DB creds, from vso-db-credentials) +# AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_ENDPOINTS (S3 creds) +# Dumps the DB (pg_dump -Fc) + tars the documents mounted at /docs, pushes both to +# s3://$BUCKET/$PREFIX/{db,docs}/, then prunes to a tiered retention. +# +# Skip-if-unchanged: each half carries a content fingerprint at $PREFIX/.fp-{db,docs}; +# a half is dumped+uploaded ONLY if its fingerprint differs from the last run, so a +# quiet ERP day re-uploads nothing. DB fingerprint = count + max(tms) over every +# tms-bearing table not on the deny-list (catches insert/update/delete); docs = path|size|mtime per non-temp file. 
+set -eu +apk add --no-cache aws-cli tar gzip findutils >/dev/null 2>&1 || { echo "ABORT apk add"; exit 1; } +: "${BUCKET:?}"; : "${PREFIX:?}"; : "${DB:?}"; : "${PGHOST:?}"; : "${PGUSER?}"; : "${PGPASSWORD?}"; : "${AWS_ENDPOINTS:?}" # fail fast and by name: otherwise an unset AWS_ENDPOINTS first trips set -u inside a 2>/dev/null command substitution and the job dies without a message +export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" +# GCS / S3-compatible stores reject aws-cli v2.23+ default integrity checksums +# ("SignatureDoesNotMatch / Invalid argument") — only sign/validate when required. +export AWS_REQUEST_CHECKSUM_CALCULATION=when_required +export AWS_RESPONSE_CHECKSUM_VALIDATION=when_required +S3() { aws --endpoint-url "$AWS_ENDPOINTS" s3 "$@"; } # `aws s3 <args>` against the endpoint in $AWS_ENDPOINTS +PSQL() { PGPASSWORD="$PGPASSWORD" psql -h "$PGHOST" -U "$PGUSER" -d "$DB" -tAc "$1"; } # run the single SQL string $1; -tA = tuples only, unaligned + +TS=$(date -u +%Y-%m-%dT%H-%M-%SZ) +echo "timestamp=$TS db=$DB -> s3://$BUCKET/$PREFIX" + +# --- fingerprints: has either half changed since last time? --- +# Restrict to durable BUSINESS content — ignore volatile noise that changes every +# cron tick / page view (else a quiet ERP would never skip): +# DB: exclude llx_const, llx_user (login/counter churn), session/cron tables +# docs: exclude */temp/* (Dolibarr stats cache regenerated constantly) +# The docs exclude applies to the fingerprint AND the tar. The DB deny-list only affects +# the fingerprint: pg_dump still dumps every table, so a change confined to deny-listed tables is not re-uploaded on its own. 
+DENY="'llx_const','llx_user','llx_session','llx_cronjob','llx_user_param'" +GEN=$(PSQL "select coalesce(string_agg(format('select count(*) c, coalesce(max(tms)::text,''0'') m from %I', table_name), ' union all '), 'select 0 c, ''0'' m') from information_schema.columns where column_name='tms' and table_schema='public' and table_name not in ($DENY)") +FP_DB_ROWS=$(PSQL "$GEN"); FP_DB=$(printf '%s\n' "$FP_DB_ROWS" | sort | md5sum | cut -d' ' -f1) # rows are captured first so set -e aborts on a psql failure: plain sh has no pipefail, and hashing empty output would store a constant fingerprint that then matches (and skips the DB backup) for as long as the query keeps failing +FP_DOCS=$(find /docs -type f -not -path '*/temp/*' -printf '%p|%s|%T@\n' 2>/dev/null | sort | md5sum | cut -d' ' -f1) +LAST_DB=$(S3 cp "s3://$BUCKET/$PREFIX/.fp-db" - 2>/dev/null || true) +LAST_DOCS=$(S3 cp "s3://$BUCKET/$PREFIX/.fp-docs" - 2>/dev/null || true) + +uploaded=0 +if [ "$FP_DB" != "$LAST_DB" ]; then + pg_dump -h "$PGHOST" -U "$PGUSER" -d "$DB" -Fc -f /tmp/db.dump + S3 cp /tmp/db.dump "s3://$BUCKET/$PREFIX/db/$TS.dump" + printf '%s' "$FP_DB" | S3 cp - "s3://$BUCKET/$PREFIX/.fp-db" + echo "db: backed up ($(wc -c < /tmp/db.dump) bytes)"; uploaded=1 +else + echo "db: unchanged — skipped" +fi +if [ "$FP_DOCS" != "$LAST_DOCS" ]; then + tar -C /docs --exclude='*/temp/*' -czf /tmp/docs.tar.gz . 
2>/dev/null + S3 cp /tmp/docs.tar.gz "s3://$BUCKET/$PREFIX/docs/$TS.tar.gz" + printf '%s' "$FP_DOCS" | S3 cp - "s3://$BUCKET/$PREFIX/.fp-docs" + echo "docs: backed up ($(wc -c < /tmp/docs.tar.gz) bytes)"; uploaded=1 +else + echo "docs: unchanged — skipped" +fi + +# --- tiered retention prune (daily 30d / monthly 12m / yearly ~10y; newest object always kept); always runs --- +cat > /tmp/prune.py <<'PY' +import sys, datetime +keys=[k.strip() for k in open(sys.argv[1]) if k.strip()] +now=datetime.datetime.strptime(sys.argv[2][:10], "%Y-%m-%d").date() +def d(k): + try: return datetime.datetime.strptime(k[:10], "%Y-%m-%d").date() + except Exception: return None +dated=sorted([(d(k),k) for k in keys if d(k)], key=lambda x:x[0]) +keep=set(); bymonth={}; byyear={} +for dt,k in dated: + age=(now-dt).days + if age <= 30: keep.add(k) + elif age <= 365: bymonth[(dt.year,dt.month)]=k + elif age <= 3660: byyear[dt.year]=k +keep |= set(bymonth.values()) | set(byyear.values()) | {k for _,k in dated[-1:]} # never prune the newest object: uploads are skipped while the fingerprint is unchanged, so it may be the only copy of the current state however old it is +for dt,k in dated: + if k not in keep: print(k) +PY +for SUB in db docs; do + S3 ls "s3://$BUCKET/$PREFIX/$SUB/" | awk '{print $4}' > /tmp/keys.$SUB || true + python3 /tmp/prune.py "/tmp/keys.$SUB" "$TS" > /tmp/del.$SUB || true + while read -r DK; do + [ -n "$DK" ] && S3 rm "s3://$BUCKET/$PREFIX/$SUB/$DK" && echo "pruned $SUB/$DK" + done < /tmp/del.$SUB +done +echo "DONE (uploaded=$uploaded)." diff --git a/chart/templates/backup-cronjob.yaml b/chart/templates/backup-cronjob.yaml new file mode 100644 index 0000000..01a389e --- /dev/null +++ b/chart/templates/backup-cronjob.yaml @@ -0,0 +1,77 @@ +{{- if .Values.backup.enabled }} +# Dedicated Dolibarr backup (ops/backup/README.md): DB + documents -> offsite GCS, +# tiered retention, skip-if-unchanged. Disabled by default — enable once the S3 +# creds VaultStaticSecret below resolves (the `auth` Vault role must be allowed to +# read kvv2/{{ .Values.backup.vaultS3Path }}). +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "erp.fullname" . 
}}-backup-job + labels: + {{- include "erp.labels" . | nindent 4 }} +data: + backup-job.sh: | + {{- .Files.Get "files/backup-job.sh" | nindent 4 }} +--- +apiVersion: secrets.hashicorp.com/v1beta1 +kind: VaultStaticSecret +metadata: + name: {{ include "erp.fullname" . }}-backup-s3 + namespace: {{ .Release.Namespace }} +spec: + type: kv-v2 + mount: kvv2 + # The kvv2 secret at the path below must hold AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_ENDPOINTS + # (the GCS HMAC creds — same shape as longhorn-gcs-backup-credentials). + path: {{ .Values.backup.vaultS3Path }} + destination: + name: dolibarr-backup-s3 # same Secret name the CronJob container loads through envFrom + create: true + refreshAfter: 24h + vaultAuthRef: auth +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ include "erp.fullname" . }}-backup + labels: + {{- include "erp.labels" . | nindent 4 }} +spec: + schedule: {{ .Values.backup.schedule | quote }} # no spec.timeZone is set here, so the schedule follows the controller-manager's time zone + concurrencyPolicy: Forbid # a new run is skipped while the previous one is still active + successfulJobsHistoryLimit: 3 + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 1 # at most one retry of a failed backup pod + template: + spec: + restartPolicy: Never + volumes: + - name: docs + persistentVolumeClaim: + claimName: {{ include "erp.fullname" . }} + readOnly: true + - name: job + configMap: + name: {{ include "erp.fullname" . 
}}-backup-job + containers: + - name: backup + image: {{ .Values.backup.image | quote }} + envFrom: + - secretRef: + name: dolibarr-backup-s3 + env: + - { name: BUCKET, value: {{ .Values.backup.bucket | quote }} } + - { name: PREFIX, value: {{ printf "erp/%s" .Values.env | quote }} } + - { name: DB, value: {{ .Values.db.name | quote }} } + - { name: PGHOST, value: {{ .Values.backup.pgHost | quote }} } + - name: PGUSER + valueFrom: { secretKeyRef: { name: vso-db-credentials, key: username } } + - name: PGPASSWORD + valueFrom: { secretKeyRef: { name: vso-db-credentials, key: password } } + volumeMounts: + - { name: docs, mountPath: /docs, readOnly: true } + - { name: job, mountPath: /job } + command: ["/bin/sh", "/job/backup-job.sh"] +{{- end }} diff --git a/chart/values.yaml b/chart/values.yaml index 7259ff0..e0ba995 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -132,3 +132,15 @@ nodeSelector: {} tolerations: [] affinity: {} + +# Dedicated offsite backup of the Dolibarr DB + documents (see ops/backup/README.md). +# DISABLED by default — enable once the S3 creds VaultStaticSecret resolves (the +# `auth` Vault role must be granted read on the kvv2 path set by vaultS3Path below). The manual +# orchestrator ops/backup/dolibarr-backup.sh works today without this. +backup: + enabled: false + schedule: "0 3 * * *" # daily 03:00 — UTC only if the controller-manager runs in UTC (the CronJob template sets no timeZone) + bucket: arcodange-backup + pgHost: "192.168.1.202" # direct Postgres host (matches ops/sandbox + ops/backup) + image: postgres:16-alpine # must stay Alpine-based: backup-job.sh installs aws-cli/tar/gzip/findutils with apk at start-up + vaultS3Path: erp/backup # path under the kvv2 mount holding AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_ENDPOINTS diff --git a/ops/backup/README.md b/ops/backup/README.md index 6f55ac4..ffc0931 100644 --- a/ops/backup/README.md +++ b/ops/backup/README.md @@ -25,6 +25,13 @@ This tool backs up **both halves** of Dolibarr state to the existing object stor then prunes to a **tiered retention**: daily for 30 days, monthly for 12 months, yearly for ~10 years. 
+**Skip-if-unchanged:** each half carries a content fingerprint at `$PREFIX/.fp-{db,docs}` +and is dumped+uploaded only if it **differs** from the last run — so a quiet ERP day +re-uploads nothing. The fingerprint is over **durable business content only**: the DB +side is `count + max(tms)` over every `tms` table *except* volatile ones (`llx_const`, +`llx_user`, sessions/cron), and the documents side excludes `*/temp/*` (Dolibarr's +constantly-regenerated stats cache) — from both the fingerprint *and* the tar. + ## Safety (mirrors `ops/sandbox/sandbox-lifecycle.sh`) - **prod is read-only**: `pg_dump` and `tar` only read; the only writes go to the @@ -45,9 +52,9 @@ ops/backup/dolibarr-backup.sh backup --env sandbox ops/backup/dolibarr-backup.sh list --env prod ``` -`backup-job.sh` is the in-container logic (env-driven: `BUCKET PREFIX DB PGHOST` + -the mounted DB/S3 creds) — the single source of truth, also intended for the -scheduled CronJob (see "Automation" below). +`chart/files/backup-job.sh` is the in-container logic (env-driven: `BUCKET PREFIX +DB PGHOST` + the mounted DB/S3 creds) — the single source of truth shared by this +orchestrator and the scheduled CronJob (see "Automation" below). **Status:** the first real prod backup was taken 2026-06-30 (`erp/prod/db/…` 1.2 MB, `erp/prod/docs/…` 12.5 MB). Proven end-to-end live on the @@ -62,14 +69,22 @@ sandbox (dump + tar + GCS upload + retention prune). The sandbox iso-prod refresh (`ops/sandbox/sandbox-lifecycle.sh`) is the natural restore-drill bench. A `restore` subcommand is wired next. -## Automation (next step — gated on creds) +## Automation — the CronJob (gated on creds) -The recurring form is a k8s **CronJob** (ArgoCD-managed, in the chart) running the -same `backup-job.sh` daily. 
It needs its **own** S3 creds rather than borrowing the -Longhorn secret cross-namespace: a `VaultStaticSecret` in the erp namespace reading -the GCS backup creds, which requires the `erp` Vault role to be granted read on that -path (a `tools` change). Until that lands, run the orchestrator above on demand / -from a host cron — it works today by borrowing the Longhorn creds transiently. +The recurring form ships in the chart (`chart/templates/backup-cronjob.yaml`, +`backup.enabled=false` by default): a daily **CronJob** (ConfigMap-mounted +`backup-job.sh`) with its **own** S3 creds via a `VaultStaticSecret` — no +cross-namespace borrowing of the Longhorn secret. To activate: + +1. store the GCS HMAC creds (`AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` / + `AWS_ENDPOINTS`, same shape as `longhorn-gcs-backup-credentials`) at + the `kvv2` path set by `backup.vaultS3Path` (default `erp/backup`); +2. grant the erp `auth` Vault role read on that path (a `tools` change) if its + policy doesn't already cover it; +3. set `backup.enabled: true` (+ tune `schedule`). + +Until then, run the orchestrator above on demand / from a host cron — it works +today by borrowing the Longhorn creds transiently. > The generic Longhorn gap (the orphaned `default` group) should be fixed too, as a > platform concern — but this dedicated, offsite, 10-year-retention backup is the diff --git a/ops/backup/backup-job.sh b/ops/backup/backup-job.sh deleted file mode 100755 index 2055910..0000000 --- a/ops/backup/backup-job.sh +++ /dev/null @@ -1,56 +0,0 @@ -#!/bin/sh -# In-container backup logic for Dolibarr — the single source of truth shared by the -# manual orchestrator (ops/backup/dolibarr-backup.sh) and the scheduled CronJob -# (chart/templates/backup-cronjob.yaml). 
Driven entirely by environment: -# BUCKET PREFIX DB PGHOST (config) -# PGUSER PGPASSWORD (DB creds, from vso-db-credentials) -# AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_ENDPOINTS (S3 creds) -# It dumps the DB (pg_dump -Fc) + tars the documents mounted at /docs, pushes both -# to s3://$BUCKET/$PREFIX/{db,docs}/, then prunes to a tiered retention. -set -eu -apk add --no-cache aws-cli tar gzip >/dev/null 2>&1 || { echo "ABORT apk add"; exit 1; } -: "${BUCKET:?}"; : "${PREFIX:?}"; : "${DB:?}"; : "${PGHOST:?}" -export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}" -# GCS / S3-compatible stores reject aws-cli v2.23+ default integrity checksums -# ("SignatureDoesNotMatch / Invalid argument") — only sign/validate when required. -export AWS_REQUEST_CHECKSUM_CALCULATION=when_required -export AWS_RESPONSE_CHECKSUM_VALIDATION=when_required -S3() { aws --endpoint-url "$AWS_ENDPOINTS" s3 "$@"; } - -TS=$(date -u +%Y-%m-%dT%H-%M-%SZ) -echo "timestamp=$TS db=$DB -> s3://$BUCKET/$PREFIX" -pg_dump -h "$PGHOST" -U "$PGUSER" -d "$DB" -Fc -f /tmp/db.dump -echo "db.dump $(wc -c < /tmp/db.dump) bytes" -tar -C /docs -czf /tmp/docs.tar.gz . 
2>/dev/null -echo "docs.tar.gz $(wc -c < /tmp/docs.tar.gz) bytes" -S3 cp /tmp/db.dump "s3://$BUCKET/$PREFIX/db/$TS.dump" -S3 cp /tmp/docs.tar.gz "s3://$BUCKET/$PREFIX/docs/$TS.tar.gz" -echo "uploaded to s3://$BUCKET/$PREFIX/{db,docs}/$TS.*" - -# tiered retention: daily 30d / monthly 12m (latest per month) / yearly ~10y -cat > /tmp/prune.py <<'PY' -import sys, datetime -keys=[k.strip() for k in open(sys.argv[1]) if k.strip()] -now=datetime.datetime.strptime(sys.argv[2][:10], "%Y-%m-%d").date() -def d(k): - try: return datetime.datetime.strptime(k[:10], "%Y-%m-%d").date() - except Exception: return None -dated=sorted([(d(k),k) for k in keys if d(k)], key=lambda x:x[0]) -keep=set(); bymonth={}; byyear={} -for dt,k in dated: - age=(now-dt).days - if age <= 30: keep.add(k) - elif age <= 365: bymonth[(dt.year,dt.month)]=k - elif age <= 3660: byyear[dt.year]=k -keep |= set(bymonth.values()) | set(byyear.values()) -for dt,k in dated: - if k not in keep: print(k) -PY -for SUB in db docs; do - S3 ls "s3://$BUCKET/$PREFIX/$SUB/" | awk '{print $4}' > /tmp/keys.$SUB || true - python3 /tmp/prune.py "/tmp/keys.$SUB" "$TS" > /tmp/del.$SUB || true - while read -r DK; do - [ -n "$DK" ] && S3 rm "s3://$BUCKET/$PREFIX/$SUB/$DK" && echo "pruned $SUB/$DK" - done < /tmp/del.$SUB -done -echo "DONE." diff --git a/ops/backup/dolibarr-backup.sh b/ops/backup/dolibarr-backup.sh index 2712079..2fa33b9 100755 --- a/ops/backup/dolibarr-backup.sh +++ b/ops/backup/dolibarr-backup.sh @@ -86,7 +86,7 @@ run_backup() { log "Copying GCS creds into a transient secret in $NS (values stay base64)" copy_s3_secret log "Backup ${ENV}: DB=$DB PVC=$PVC -> s3://$BUCKET/$PREFIX/{db,docs}/" - local B64; B64="$(b64 "$(cat "${SCRIPT_DIR}/backup-job.sh")")" + local B64; B64="$(b64 "$(cat "${SCRIPT_DIR}/../../chart/files/backup-job.sh")")" kubectl delete job dolibarr-backup -n "$NS" --ignore-not-found >/dev/null 2>&1 || true kubectl apply -f - >/dev/null <