feat(backup): skip-if-unchanged + scheduled CronJob in the chart
Builds on the dedicated backup (erp#31).
Skip-if-unchanged: each half (DB / documents) carries a content fingerprint at
erp/<env>/.fp-{db,docs} and is dumped+uploaded only if it differs from the last
run — a quiet ERP day re-uploads nothing. Fingerprint = durable BUSINESS content
only: DB = count+max(tms) over tms tables EXCEPT volatile churn (llx_const,
llx_user, session/cron); docs EXCLUDE */temp/* (Dolibarr stats cache) — from both
the fingerprint and the tar. Proven live: 1st run uploads both, immediate 2nd run
skips both (uploaded=0).
Automation: the in-container logic moves to chart/files/backup-job.sh (single
source of truth, read by the orchestrator AND the chart). New
chart/templates/backup-cronjob.yaml renders a daily CronJob + ConfigMap +
VaultStaticSecret, gated by backup.enabled (default false). Helm-verified: off by
default (0 CronJobs), on renders correctly, env-aware (PREFIX erp/prod vs
erp/sandbox), script embedded.
Activation (documented): store GCS HMAC creds at kvv2/<backup.vaultS3Path>
(default erp/backup), grant the erp `auth` Vault role read on it (tools change),
set backup.enabled=true. Until then the orchestrator runs on demand.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
87
chart/files/backup-job.sh
Executable file
87
chart/files/backup-job.sh
Executable file
@@ -0,0 +1,87 @@
|
||||
#!/bin/sh
# In-container backup logic for Dolibarr — the single source of truth shared by the
# manual orchestrator (ops/backup/dolibarr-backup.sh) and the scheduled CronJob
# (chart/templates/backup-cronjob.yaml). Driven entirely by environment:
#   BUCKET PREFIX DB PGHOST                                 (config)
#   PGUSER PGPASSWORD                                       (DB creds, from vso-db-credentials)
#   AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_ENDPOINTS   (S3 creds)
# Dumps the DB (pg_dump -Fc) + tars the documents mounted at /docs, pushes both to
# s3://$BUCKET/$PREFIX/{db,docs}/, then prunes to a tiered retention.
#
# Skip-if-unchanged: each half carries a content fingerprint at $PREFIX/.fp-{db,docs};
# a half is dumped+uploaded ONLY if its fingerprint differs from the last run, so a
# quiet ERP day re-uploads nothing. DB fingerprint = count + max(tms) over every
# tms-bearing table except the volatile ones listed in $DENY (catches
# insert/update/delete); docs = path|size|mtime per file outside */temp/*.
set -eu

# Fail fast on missing configuration, before installing packages or touching the
# network. AWS_ENDPOINTS is first expanded inside an `S3 ... 2>/dev/null` call, so
# relying on `set -u` alone would abort the run with its diagnostic discarded.
# PGPASSWORD only has to be set (it may be empty), matching what `set -u` required.
: "${BUCKET:?}"; : "${PREFIX:?}"; : "${DB:?}"; : "${PGHOST:?}"
: "${PGUSER:?}"; : "${PGPASSWORD?}"; : "${AWS_ENDPOINTS:?}"

apk add --no-cache aws-cli tar gzip findutils >/dev/null 2>&1 || { echo "ABORT apk add"; exit 1; }

export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
# GCS / S3-compatible stores reject aws-cli v2.23+ default integrity checksums
# ("SignatureDoesNotMatch / Invalid argument") — only sign/validate when required.
export AWS_REQUEST_CHECKSUM_CALCULATION=when_required
export AWS_RESPONSE_CHECKSUM_VALIDATION=when_required
|
||||
# Run an `aws s3` subcommand against the S3-compatible endpoint named by
# $AWS_ENDPOINTS; every argument is forwarded untouched.
S3() {
  aws --endpoint-url "$AWS_ENDPOINTS" s3 "$@"
}
|
||||
# Run one SQL statement ($1) against $DB on $PGHOST as $PGUSER and print the
# result unaligned and tuples-only (-tA), i.e. bare rows suitable for piping.
PSQL() {
  PGPASSWORD="$PGPASSWORD" \
    psql -h "$PGHOST" -U "$PGUSER" -d "$DB" -tAc "$1"
}
|
||||
|
||||
# UTC run timestamp; it names the uploaded objects and its leading YYYY-MM-DD is
# what the retention prune parses. Colons are avoided in favour of dashes.
TS="$(date -u '+%Y-%m-%dT%H-%M-%SZ')"
printf 'timestamp=%s db=%s -> s3://%s/%s\n' "$TS" "$DB" "$BUCKET" "$PREFIX"
|
||||
|
||||
# --- fingerprints: has either half changed since last time? ---
# Restrict to durable BUSINESS content — ignore volatile noise that changes every
# cron tick / page view (else a quiet ERP would never skip):
#   DB:   exclude llx_const, llx_user (login/counter churn), session/cron tables
#   docs: exclude */temp/* (Dolibarr stats cache regenerated constantly)
# Excludes are identical for the fingerprint AND the upload, so "unchanged" means
# "the backed-up set is unchanged".
DENY="'llx_const','llx_user','llx_session','llx_cronjob','llx_user_param'"
# Generate one "count(*), max(tms)" select per table that has a tms column
# (minus $DENY), glued with UNION ALL; falls back to a constant row if none match.
GEN=$(PSQL "select coalesce(string_agg(format('select count(*) c, coalesce(max(tms)::text,''0'') m from %I', table_name), ' union all '), 'select 0 c, ''0'' m') from information_schema.columns where column_name='tms' and table_schema='public' and table_name not in ($DENY)")
# Capture the rows BEFORE hashing: plain sh has no pipefail, so piping psql
# straight into md5sum would hash empty output when the query fails. That bogus
# hash is identical on every failing run, so after one upload every later run
# would be reported "unchanged" and skipped. Abort loudly instead.
FP_DB_ROWS=$(PSQL "$GEN") || { echo "ABORT db fingerprint query" >&2; exit 1; }
# printf restores the single trailing newline that $(...) stripped, so the hash
# matches fingerprints stored by earlier runs that piped psql output directly.
FP_DB=$(printf '%s\n' "$FP_DB_ROWS" | sort | md5sum | cut -d' ' -f1)
# One "path|size|mtime" line per file outside */temp/*, order-normalised by sort.
FP_DOCS=$(find /docs -type f -not -path '*/temp/*' -printf '%p|%s|%T@\n' 2>/dev/null | sort | md5sum | cut -d' ' -f1)
# A missing or unreadable stored fingerprint yields an empty string, which never
# equals an md5 digest, so that half is treated as changed and backed up.
LAST_DB=$(S3 cp "s3://$BUCKET/$PREFIX/.fp-db" - 2>/dev/null || true)
LAST_DOCS=$(S3 cp "s3://$BUCKET/$PREFIX/.fp-docs" - 2>/dev/null || true)
|
||||
|
||||
# Dump + upload each half whose fingerprint differs from the stored one. The new
# fingerprint is written only AFTER its artifact is uploaded, so a failed upload
# is retried on the next run. `uploaded` records whether anything was pushed.
uploaded=0
if [ "$FP_DB" != "$LAST_DB" ]; then
  pg_dump -h "$PGHOST" -U "$PGUSER" -d "$DB" -Fc -f /tmp/db.dump
  S3 cp /tmp/db.dump "s3://$BUCKET/$PREFIX/db/$TS.dump"
  printf '%s' "$FP_DB" | S3 cp - "s3://$BUCKET/$PREFIX/.fp-db"
  echo "db: backed up ($(wc -c < /tmp/db.dump) bytes)"; uploaded=1
else
  echo "db: unchanged — skipped"
fi
if [ "$FP_DOCS" != "$LAST_DOCS" ]; then
  # tar's stderr is no longer discarded: under `set -e` a failing tar used to end
  # the run with no diagnostic at all. GNU tar exits 1 when a file changed while
  # it was being read (the archive is still written) and 2 on fatal errors.
  tar_rc=0
  tar -C /docs --exclude='*/temp/*' -czf /tmp/docs.tar.gz . || tar_rc=$?
  if [ "$tar_rc" -gt 1 ]; then
    echo "ABORT tar /docs (exit $tar_rc)" >&2
    exit 1
  fi
  S3 cp /tmp/docs.tar.gz "s3://$BUCKET/$PREFIX/docs/$TS.tar.gz"
  if [ "$tar_rc" -eq 0 ]; then
    printf '%s' "$FP_DOCS" | S3 cp - "s3://$BUCKET/$PREFIX/.fp-docs"
  else
    # Archive may not match the fingerprinted state: keep the upload but leave the
    # stored fingerprint alone so the next run takes a clean archive.
    echo "docs: files changed while archiving — fingerprint not recorded, next run re-archives" >&2
  fi
  echo "docs: backed up ($(wc -c < /tmp/docs.tar.gz) bytes)"; uploaded=1
else
  echo "docs: unchanged — skipped"
fi
|
||||
|
||||
# --- tiered retention prune (daily 30d / monthly 12m / yearly ~10y); always runs ---
# Writes /tmp/prune.py. Usage: prune.py <keys-file> <now-timestamp>
#   keys-file: one object name per line; the first 10 chars must be YYYY-MM-DD,
#              names that do not parse are ignored (never printed for deletion).
#   output:    the names to DELETE, oldest first, one per line.
cat > /tmp/prune.py <<'PY'
import sys, datetime


def key_date(key):
    """Return the date encoded in the first 10 chars of key, or None."""
    try:
        return datetime.datetime.strptime(key[:10], "%Y-%m-%d").date()
    except ValueError:
        return None


with open(sys.argv[1]) as fh:
    keys = [line.strip() for line in fh if line.strip()]
now = datetime.datetime.strptime(sys.argv[2][:10], "%Y-%m-%d").date()

dated = []
for key in keys:
    dt = key_date(key)
    if dt is not None:
        dated.append((dt, key))
dated.sort(key=lambda pair: pair[0])  # oldest first; stable for equal dates

keep = set()
by_month = {}
by_year = {}
for dt, key in dated:
    age = (now - dt).days
    if age <= 30:
        keep.add(key)                        # daily tier
    elif age <= 365:
        by_month[(dt.year, dt.month)] = key  # ascending order => latest per month wins
    elif age <= 3660:
        by_year[dt.year] = key               # latest per year wins
keep |= set(by_month.values()) | set(by_year.values())

# With skip-if-unchanged a run may upload nothing, so the newest object can be
# arbitrarily old. Never prune it: its stored fingerprint would keep suppressing
# a re-upload, leaving that half with no backup at all.
if dated:
    keep.add(dated[-1][1])

for dt, key in dated:
    if key not in keep:
        print(key)
PY
|
||||
# Apply the retention policy to each half in turn: list the object names under
# $PREFIX/<half>/, let prune.py choose the ones to drop, then delete them one by
# one. Listing and selection are best-effort (`|| true`): if either fails the
# delete list is empty or short and nothing extra is removed.
for SUB in db docs; do
  KEYS_FILE="/tmp/keys.$SUB"
  DEL_FILE="/tmp/del.$SUB"
  # Field 4 of `aws s3 ls` output is taken as the object name.
  S3 ls "s3://$BUCKET/$PREFIX/$SUB/" | awk '{print $4}' > "$KEYS_FILE" || true
  python3 /tmp/prune.py "$KEYS_FILE" "$TS" > "$DEL_FILE" || true
  while read -r DK; do
    if [ -n "$DK" ]; then
      # A failed delete is tolerated; only successful ones are reported.
      if S3 rm "s3://$BUCKET/$PREFIX/$SUB/$DK"; then
        echo "pruned $SUB/$DK"
      fi
    fi
  done < "$DEL_FILE"
done
echo "DONE (uploaded=$uploaded)."
|
||||
77
chart/templates/backup-cronjob.yaml
Normal file
77
chart/templates/backup-cronjob.yaml
Normal file
@@ -0,0 +1,77 @@
|
||||
{{- if .Values.backup.enabled }}
# Dedicated Dolibarr backup (ops/backup/README.md): DB + documents -> offsite GCS,
# tiered retention, skip-if-unchanged. Disabled by default — enable once the S3
# creds VaultStaticSecret below resolves (the `auth` Vault role must be allowed to
# read kvv2/{{ .Values.backup.vaultS3Path }}).
#
# Renders three objects:
#   1. ConfigMap          — carries chart/files/backup-job.sh verbatim
#   2. VaultStaticSecret  — syncs the S3 creds into the Secret dolibarr-backup-s3
#   3. CronJob            — runs the script with the documents PVC mounted read-only
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "erp.fullname" . }}-backup-job
  labels:
    {{- include "erp.labels" . | nindent 4 }}
data:
  # The script is embedded from the chart's files/ directory.
  backup-job.sh: |
    {{- .Files.Get "files/backup-job.sh" | nindent 4 }}
---
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultStaticSecret
metadata:
  name: {{ include "erp.fullname" . }}-backup-s3
  namespace: {{ .Release.Namespace }}
spec:
  type: kv-v2
  mount: kvv2
  # kvv2/<path> must hold AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_ENDPOINTS
  # (the GCS HMAC creds — same shape as longhorn-gcs-backup-credentials).
  path: {{ .Values.backup.vaultS3Path }}
  destination:
    # Must match the secretRef consumed by the CronJob's envFrom below.
    name: dolibarr-backup-s3
    create: true
  refreshAfter: 24h
  vaultAuthRef: auth
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ include "erp.fullname" . }}-backup
  labels:
    {{- include "erp.labels" . | nindent 4 }}
spec:
  schedule: {{ .Values.backup.schedule | quote }}
  # Never start a run while the previous one is still active.
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 1
      template:
        spec:
          restartPolicy: Never
          volumes:
            - name: docs
              persistentVolumeClaim:
                claimName: {{ include "erp.fullname" . }}
                readOnly: true
            - name: job
              configMap:
                name: {{ include "erp.fullname" . }}-backup-job
          containers:
            - name: backup
              image: {{ .Values.backup.image | quote }}
              # S3 creds: every key of the synced Secret becomes an env var.
              envFrom:
                - secretRef:
                    name: dolibarr-backup-s3
              env:
                - { name: BUCKET, value: {{ .Values.backup.bucket | quote }} }
                # Per-environment object prefix: erp/<env>.
                - { name: PREFIX, value: {{ printf "erp/%s" .Values.env | quote }} }
                - { name: DB, value: {{ .Values.db.name | quote }} }
                - { name: PGHOST, value: {{ .Values.backup.pgHost | quote }} }
                - name: PGUSER
                  valueFrom: { secretKeyRef: { name: vso-db-credentials, key: username } }
                - name: PGPASSWORD
                  valueFrom: { secretKeyRef: { name: vso-db-credentials, key: password } }
              volumeMounts:
                - { name: docs, mountPath: /docs, readOnly: true }
                - { name: job, mountPath: /job }
              command: ["/bin/sh", "/job/backup-job.sh"]
{{- end }}
|
||||
@@ -132,3 +132,15 @@ nodeSelector: {}
|
||||
tolerations: []
|
||||
|
||||
affinity: {}
|
||||
|
||||
# Dedicated offsite backup of the Dolibarr DB + documents (see ops/backup/README.md).
# DISABLED by default — enable once the S3 creds VaultStaticSecret resolves (the
# `auth` Vault role must be granted read on kvv2/<vaultS3Path>). The manual
# orchestrator ops/backup/dolibarr-backup.sh works today without this.
# All keys below are read by chart/templates/backup-cronjob.yaml as .Values.backup.*
backup:
  enabled: false
  schedule: "0 3 * * *"        # daily 03:00 UTC
  bucket: arcodange-backup
  pgHost: "192.168.1.202"      # direct Postgres host (matches ops/sandbox + ops/backup)
  image: postgres:16-alpine
  vaultS3Path: erp/backup      # kvv2/<this> → AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_ENDPOINTS
|
||||
|
||||
@@ -25,6 +25,13 @@ This tool backs up **both halves** of Dolibarr state to the existing object stor
|
||||
then prunes to a **tiered retention**: daily for 30 days, monthly for 12 months,
|
||||
yearly for ~10 years.
|
||||
|
||||
**Skip-if-unchanged:** each half carries a content fingerprint at `erp/<env>/.fp-{db,docs}`
|
||||
and is dumped+uploaded only if it **differs** from the last run — so a quiet ERP day
|
||||
re-uploads nothing. The fingerprint is over **durable business content only**: the DB
|
||||
side is `count + max(tms)` over every `tms` table *except* volatile ones (`llx_const`,
|
||||
`llx_user`, sessions/cron), and the documents side excludes `*/temp/*` (Dolibarr's
|
||||
constantly-regenerated stats cache) — from both the fingerprint *and* the tar.
|
||||
|
||||
## Safety (mirrors `ops/sandbox/sandbox-lifecycle.sh`)
|
||||
|
||||
- **prod is read-only**: `pg_dump` and `tar` only read; the only writes go to the
|
||||
@@ -45,9 +52,9 @@ ops/backup/dolibarr-backup.sh backup --env sandbox
|
||||
ops/backup/dolibarr-backup.sh list --env prod
|
||||
```
|
||||
|
||||
`backup-job.sh` is the in-container logic (env-driven: `BUCKET PREFIX DB PGHOST` +
|
||||
the mounted DB/S3 creds) — the single source of truth, also intended for the
|
||||
scheduled CronJob (see "Automation" below).
|
||||
`chart/files/backup-job.sh` is the in-container logic (env-driven: `BUCKET PREFIX
|
||||
DB PGHOST` + the mounted DB/S3 creds) — the single source of truth shared by this
|
||||
orchestrator and the scheduled CronJob (see "Automation" below).
|
||||
|
||||
**Status:** the first real prod backup was taken 2026-06-30
|
||||
(`erp/prod/db/…` 1.2 MB, `erp/prod/docs/…` 12.5 MB). Proven end-to-end live on the
|
||||
@@ -62,14 +69,22 @@ sandbox (dump + tar + GCS upload + retention prune).
|
||||
The sandbox iso-prod refresh (`ops/sandbox/sandbox-lifecycle.sh`) is the natural
|
||||
restore-drill bench. A `restore` subcommand is wired next.
|
||||
|
||||
## Automation (next step — gated on creds)
|
||||
## Automation — the CronJob (gated on creds)
|
||||
|
||||
The recurring form is a k8s **CronJob** (ArgoCD-managed, in the chart) running the
|
||||
same `backup-job.sh` daily. It needs its **own** S3 creds rather than borrowing the
|
||||
Longhorn secret cross-namespace: a `VaultStaticSecret` in the erp namespace reading
|
||||
the GCS backup creds, which requires the `erp` Vault role to be granted read on that
|
||||
path (a `tools` change). Until that lands, run the orchestrator above on demand /
|
||||
from a host cron — it works today by borrowing the Longhorn creds transiently.
|
||||
The recurring form ships in the chart (`chart/templates/backup-cronjob.yaml`,
|
||||
`backup.enabled=false` by default): a daily **CronJob** (ConfigMap-mounted
|
||||
`backup-job.sh`) with its **own** S3 creds via a `VaultStaticSecret` — no
|
||||
cross-namespace borrowing of the Longhorn secret. To activate:
|
||||
|
||||
1. store the GCS HMAC creds (`AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` /
|
||||
`AWS_ENDPOINTS`, same shape as `longhorn-gcs-backup-credentials`) at
|
||||
`kvv2/<backup.vaultS3Path>` (default `erp/backup`);
|
||||
2. grant the erp `auth` Vault role read on that path (a `tools` change) if its
|
||||
policy doesn't already cover it;
|
||||
3. set `backup.enabled: true` (+ tune `schedule`).
|
||||
|
||||
Until then, run the orchestrator above on demand / from a host cron — it works
|
||||
today by borrowing the Longhorn creds transiently.
|
||||
|
||||
> The generic Longhorn gap (the orphaned `default` group) should be fixed too, as a
|
||||
> platform concern — but this dedicated, offsite, 10-year-retention backup is the
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
#!/bin/sh
# In-container backup logic for Dolibarr — the single source of truth shared by the
# manual orchestrator (ops/backup/dolibarr-backup.sh) and the scheduled CronJob
# (chart/templates/backup-cronjob.yaml). Driven entirely by environment:
#   BUCKET PREFIX DB PGHOST                                 (config)
#   PGUSER PGPASSWORD                                       (DB creds, from vso-db-credentials)
#   AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_ENDPOINTS   (S3 creds)
# It dumps the DB (pg_dump -Fc) + tars the documents mounted at /docs, pushes both
# to s3://$BUCKET/$PREFIX/{db,docs}/, then prunes to a tiered retention.
set -eu
apk add --no-cache aws-cli tar gzip >/dev/null 2>&1 || { echo "ABORT apk add"; exit 1; }
: "${BUCKET:?}"; : "${PREFIX:?}"; : "${DB:?}"; : "${PGHOST:?}"
export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
# GCS / S3-compatible stores reject aws-cli v2.23+ default integrity checksums
# ("SignatureDoesNotMatch / Invalid argument") — only sign/validate when required.
export AWS_REQUEST_CHECKSUM_CALCULATION=when_required
export AWS_RESPONSE_CHECKSUM_VALIDATION=when_required
# Run an `aws s3` subcommand against the endpoint in $AWS_ENDPOINTS.
S3() { aws --endpoint-url "$AWS_ENDPOINTS" s3 "$@"; }

# UTC timestamp; names the uploaded objects and is parsed (first 10 chars) by the prune.
TS=$(date -u +%Y-%m-%dT%H-%M-%SZ)
echo "timestamp=$TS db=$DB -> s3://$BUCKET/$PREFIX"
pg_dump -h "$PGHOST" -U "$PGUSER" -d "$DB" -Fc -f /tmp/db.dump
echo "db.dump $(wc -c < /tmp/db.dump) bytes"
# tar's stderr is no longer discarded: under `set -e` a failing tar used to end the
# run with no diagnostic. GNU tar exits 1 when a file changed while being read (the
# archive is still written) and 2 on fatal errors — only the latter aborts.
TAR_RC=0
tar -C /docs -czf /tmp/docs.tar.gz . || TAR_RC=$?
if [ "$TAR_RC" -gt 1 ]; then
  echo "ABORT tar /docs (exit $TAR_RC)" >&2
  exit 1
fi
if [ "$TAR_RC" -eq 1 ]; then
  echo "WARN tar: files changed while archiving /docs" >&2
fi
echo "docs.tar.gz $(wc -c < /tmp/docs.tar.gz) bytes"
S3 cp /tmp/db.dump "s3://$BUCKET/$PREFIX/db/$TS.dump"
S3 cp /tmp/docs.tar.gz "s3://$BUCKET/$PREFIX/docs/$TS.tar.gz"
echo "uploaded to s3://$BUCKET/$PREFIX/{db,docs}/$TS.*"

# tiered retention: daily 30d / monthly 12m (latest per month) / yearly ~10y
# prune.py <keys-file> <now-timestamp> prints the object names to delete.
cat > /tmp/prune.py <<'PY'
import sys, datetime
keys=[k.strip() for k in open(sys.argv[1]) if k.strip()]
now=datetime.datetime.strptime(sys.argv[2][:10], "%Y-%m-%d").date()
def d(k):
    # Date encoded in the first 10 chars of the key, or None if it does not parse.
    try: return datetime.datetime.strptime(k[:10], "%Y-%m-%d").date()
    except Exception: return None
dated=sorted([(d(k),k) for k in keys if d(k)], key=lambda x:x[0])
keep=set(); bymonth={}; byyear={}
for dt,k in dated:
    age=(now-dt).days
    if age <= 30: keep.add(k)
    elif age <= 365: bymonth[(dt.year,dt.month)]=k   # ascending order => latest per month
    elif age <= 3660: byyear[dt.year]=k              # latest per year
keep |= set(bymonth.values()) | set(byyear.values())
for dt,k in dated:
    if k not in keep: print(k)
PY
# Listing and selection are best-effort (`|| true`): on failure nothing is pruned.
for SUB in db docs; do
  S3 ls "s3://$BUCKET/$PREFIX/$SUB/" | awk '{print $4}' > /tmp/keys.$SUB || true
  python3 /tmp/prune.py "/tmp/keys.$SUB" "$TS" > /tmp/del.$SUB || true
  while read -r DK; do
    [ -n "$DK" ] && S3 rm "s3://$BUCKET/$PREFIX/$SUB/$DK" && echo "pruned $SUB/$DK"
  done < /tmp/del.$SUB
done
echo "DONE."
|
||||
@@ -86,7 +86,7 @@ run_backup() {
|
||||
log "Copying GCS creds into a transient secret in $NS (values stay base64)"
|
||||
copy_s3_secret
|
||||
log "Backup ${ENV}: DB=$DB PVC=$PVC -> s3://$BUCKET/$PREFIX/{db,docs}/"
|
||||
local B64; B64="$(b64 "$(cat "${SCRIPT_DIR}/backup-job.sh")")"
|
||||
local B64; B64="$(b64 "$(cat "${SCRIPT_DIR}/../../chart/files/backup-job.sh")")"
|
||||
kubectl delete job dolibarr-backup -n "$NS" --ignore-not-found >/dev/null 2>&1 || true
|
||||
kubectl apply -f - >/dev/null <<EOF
|
||||
apiVersion: batch/v1
|
||||
|
||||
Reference in New Issue
Block a user