From 434be7488ddf6af235bf9b03922584d7ad2095a8 Mon Sep 17 00:00:00 2001 From: Gabriel Radureau Date: Tue, 30 Jun 2026 06:59:39 +0200 Subject: [PATCH] fix(ops): sandbox refresh-from-prod actually restores now (pg_restore -U + self-heal pause) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit refresh-from-prod was structurally broken and silently no-op'd the restore: 1. pg_restore lacked -U, so the postgres image connected as its OS user `root` and auth-failed. The failure was swallowed by `|| echo "ignorable warnings"`, so the script reported success while the DROP OWNED had already emptied the DB. E2's original seed was a manual process, so this path had never really run. Fix: pass `-h $PGHOST -U $SB_PGUSER`; don't trust pg_restore's exit code (it returns non-zero on the harmless "schema public already exists" notice) — verify by counting restored llx_* tables and FAIL the Job if < 250. 2. erp-sandbox is ArgoCD-managed with self-heal ON, which reverts the `kubectl scale --replicas=0` within seconds — so the seed ran with Dolibarr still connected. Fix: pause self-heal for the duration, re-arm it after; app restore + self-heal restoration + secret cleanup are guarded by an EXIT trap so an interrupt can't strand the sandbox at replicas=0 / self-heal off. Validated end-to-end on the live sandbox: 295 llx tables, company=Arcodange, owner=erp_sandbox_role, self-heal re-armed, pod 1/1. README documents the self-heal pause and the iso-prod consequence (ai_agent_sandbox is wiped → re-provision). Co-Authored-By: Claude Opus 4.7 (1M context) --- ops/sandbox/README.md | 28 +++++++++++++++++++------ ops/sandbox/sandbox-lifecycle.sh | 35 ++++++++++++++++++++++++-------- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/ops/sandbox/README.md b/ops/sandbox/README.md index a86339f..1159fc4 100644 --- a/ops/sandbox/README.md +++ b/ops/sandbox/README.md @@ -27,12 +27,28 @@ in `factory postgres/iac/providers.tf`, used **only** in the human-gated ./sandbox-lifecycle.sh refresh # both, in order ``` -`refresh-from-prod` scales the sandbox pod to 0, dumps the full prod `public` -schema (read-only), wipes the sandbox's app objects, restores, and scales back -up. It dumps the **whole** schema (not just `llx_*`) so app helper functions and -their triggers (e.g. `update_modified_column_tms()`) come over; it filters out -the provisioner-owned `user_lookup` pgbouncer function from the restore TOC -because that object already exists per-environment and is not app data. +`refresh-from-prod` pauses ArgoCD self-heal on the `erp-sandbox` Application (else +self-heal reverts the scale-down within seconds and the seed would run with the app +still connected), scales the sandbox pod to 0, dumps the full prod `public` schema +(read-only), wipes the sandbox's app objects, restores, scales back up and re-arms +self-heal. App restore and self-heal restoration are guarded by an EXIT trap, so an +interrupt can't leave the sandbox scaled to 0 with self-heal off. It dumps the +**whole** schema (not just `llx_*`) so app helper functions and their triggers (e.g. +`update_modified_column_tms()`) come over; it filters out the provisioner-owned +`user_lookup` pgbouncer function from the restore TOC because that object already +exists per-environment and is not app data. + +> Note: `pg_restore` runs with an explicit `-U` (the sandbox role) — without it the +> postgres image connects as its OS user `root` and auth-fails. Its exit code is not +> trusted (it returns non-zero on the harmless "schema public already exists" +> notice); success is verified by counting restored `llx_*` tables. + +> Heads-up: a full refresh is **iso-prod**, so it overwrites `llx_user` with prod's +> and wipes the `ai_agent_sandbox` write user + its API key (and resets +> `DOLI_INSTANCE_UNIQUE_ID` to prod's, invalidating any prior key). After a refresh, +> re-run `test/provisionSandbox.ts` to recreate the agent (it re-grants its rights, +> incl. `banque lire`) and refresh the `dolibarr-sandbox-write` skill `.env` from the +> new key file. ## Two fidelity caveats (by design — see ADR-0003) diff --git a/ops/sandbox/sandbox-lifecycle.sh b/ops/sandbox/sandbox-lifecycle.sh index 044119a..f17f624 100755 --- a/ops/sandbox/sandbox-lifecycle.sh +++ b/ops/sandbox/sandbox-lifecycle.sh @@ -43,9 +43,22 @@ prod_pod() { kubectl get pod -n "$PROD_NS" -l app.kubernetes.io/instance=erp -o cleanup_secret() { kubectl delete secret "$TMP_PROD_SECRET" -n "$SB_NS" --ignore-not-found >/dev/null 2>&1 || true; } +# erp-sandbox is ArgoCD-managed with self-heal ON, which reverts `kubectl scale +# --replicas=0` within seconds — so without pausing it the seed runs while Dolibarr +# is still connected, and the restore collides with the app re-creating tables. +# Pause self-heal for the duration so the scale-down holds; always re-arm it. +ARGOCD_NS="argocd"; ARGOCD_APP="erp-sandbox" +set_selfheal() { kubectl patch application "$ARGOCD_APP" -n "$ARGOCD_NS" --type merge \ + -p "{\"spec\":{\"syncPolicy\":{\"automated\":{\"selfHeal\":$1,\"prune\":true}}}}" >/dev/null 2>&1 || true; } +# Safety net (EXIT trap): whatever happens, bring the app back, re-arm self-heal, drop the secret. +restore_state() { set_selfheal true; kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null 2>&1 || true; cleanup_secret; } + refresh_from_prod() { command -v python3 >/dev/null || die "python3 required to copy the prod secret without exposing it" - trap cleanup_secret EXIT + trap restore_state EXIT + + log "Pausing ArgoCD self-heal so the scale-to-0 holds (else it is reverted in seconds)" + set_selfheal false log "Copying prod DB creds into a transient, read-only-intent secret in $SB_NS (values stay base64)" kubectl get secret vso-db-credentials -n "$PROD_NS" -o json \ @@ -95,13 +108,18 @@ spec: # 2. wipe sandbox app objects (everything owned by the app role; infra untouched) PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -v ON_ERROR_STOP=1 \\ -c "DROP OWNED BY $SB_ROLE CASCADE;" - # 3. restore golden, owned by the sandbox role - PGPASSWORD=\$SB_PGPASSWORD \\ - pg_restore -L /tmp/golden.toc --no-owner --role=$SB_ROLE -d $SB_DB /tmp/golden.dump \\ - && echo "restore: clean" || echo "restore: completed with ignorable warnings" - # 4. verify + # 3. restore golden, owned by the sandbox role. MUST pass -U: without it + # pg_restore connects as the container's OS user (root) and auth-fails. + # pg_restore also exits non-zero on the harmless "schema public already + # exists" notice, so its exit code is NOT trustworthy — verify by count below. Q() { PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -tAc "\$1"; } - echo "llx tables=\$(Q "select count(*) from pg_tables where schemaname='public' and tablename like 'llx_%'") company=\$(Q "select value from llx_const where name='MAIN_INFO_SOCIETE_NOM'") lang=\$(Q "select value from llx_const where name='MAIN_LANG_DEFAULT'") owner=\$(Q "select tableowner from pg_tables where tablename='llx_societe'")" + PGPASSWORD=\$SB_PGPASSWORD \\ + pg_restore -h "\$PGHOST" -U "\$SB_PGUSER" -L /tmp/golden.toc --no-owner --role=$SB_ROLE -d $SB_DB /tmp/golden.dump 2>/tmp/restore.err \\ + && echo "restore: clean" || echo "restore: pg_restore rc=\$? — verifying by table count, not exit code" + # 4. verify — FAIL the Job if the restore did not actually populate the schema + N=\$(Q "select count(*) from pg_tables where schemaname='public' and tablename like 'llx_%'") + [ "\$N" -ge 250 ] || { echo "ABORT: only \$N llx tables after restore — restore failed. Last errors:"; tail -5 /tmp/restore.err; exit 1; } + echo "llx tables=\$N company=\$(Q "select value from llx_const where name='MAIN_INFO_SOCIETE_NOM'") lang=\$(Q "select value from llx_const where name='MAIN_LANG_DEFAULT'") owner=\$(Q "select tableowner from pg_tables where tablename='llx_societe'")" echo "DONE." EOF kubectl wait --for=condition=complete job/sandbox-seed -n "$SB_NS" --timeout=300s >/dev/null 2>&1 \ @@ -109,7 +127,8 @@ EOF kubectl logs -n "$SB_NS" job/sandbox-seed | sed 's/^/ /' kubectl delete job sandbox-seed -n "$SB_NS" --ignore-not-found >/dev/null 2>&1 || true - log "Scaling erp-sandbox back to 1" + log "Restoring app (replicas=1) + re-arming ArgoCD self-heal" + set_selfheal true kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null cleanup_secret; trap - EXIT log "Refresh complete. Run 'sync-documents' to also copy the company logo + uploads."