fix(ops): sandbox refresh-from-prod actually restores now (pg_restore -U + self-heal pause)

refresh-from-prod was structurally broken and silently no-op'd the restore:

1. pg_restore lacked -U, so the postgres image connected as its OS user `root`
   and auth-failed. The failure was swallowed by `|| echo "ignorable warnings"`,
   so the script reported success while the DROP OWNED had already emptied the DB.
   E2's original seed was a manual process, so this path had never really run.
   Fix: pass `-h $PGHOST -U $SB_PGUSER`; don't trust pg_restore's exit code (it
   returns non-zero on the harmless "schema public already exists" notice) — verify
   by counting restored llx_* tables and FAIL the Job if < 250.

2. erp-sandbox is ArgoCD-managed with self-heal ON, which reverts the
   `kubectl scale --replicas=0` within seconds — so the seed ran with Dolibarr
   still connected. Fix: pause self-heal for the duration, re-arm it after; app
   restore + self-heal restoration + secret cleanup are guarded by an EXIT trap so
   an interrupt can't strand the sandbox at replicas=0 / self-heal off.

Validated end-to-end on the live sandbox: 295 llx tables, company=Arcodange,
owner=erp_sandbox_role, self-heal re-armed, pod 1/1. README documents the self-heal
pause and the iso-prod consequence (ai_agent_sandbox is wiped → re-provision).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-30 06:59:39 +02:00
parent 0688e3d7fd
commit 434be7488d
2 changed files with 49 additions and 14 deletions

View File

@@ -43,9 +43,22 @@ prod_pod() { kubectl get pod -n "$PROD_NS" -l app.kubernetes.io/instance=erp -o
# cleanup_secret
# Best-effort delete of the secret named by $TMP_PROD_SECRET in namespace $SB_NS.
# --ignore-not-found makes a missing secret a non-error; any other kubectl
# failure is deliberately ignored and all kubectl output is discarded, so this
# is safe to call from a trap or more than once.
# Globals: TMP_PROD_SECRET (read), SB_NS (read)
# Returns: always 0
cleanup_secret() {
  if ! kubectl delete secret "$TMP_PROD_SECRET" -n "$SB_NS" --ignore-not-found >/dev/null 2>&1; then
    # Intentional: cleanup must never abort the caller.
    return 0
  fi
  return 0
}
# erp-sandbox is ArgoCD-managed with self-heal ON, which reverts `kubectl scale
# --replicas=0` within seconds — so without pausing it the seed runs while Dolibarr
# is still connected, and the restore collides with the app re-creating tables.
# Pause self-heal for the duration so the scale-down holds; always re-arm it.
ARGOCD_NS="argocd"; ARGOCD_APP="erp-sandbox"

# set_selfheal <true|false>
# Merge-patches spec.syncPolicy.automated.selfHeal on the ArgoCD Application
# $ARGOCD_APP in namespace $ARGOCD_NS.
# NOTE(review): the patch also sets automated.prune to true on every call —
# confirm that matches the Application's declared sync policy.
# Globals:   ARGOCD_APP (read), ARGOCD_NS (read)
# Arguments: $1 - literal "true" or "false"
# Outputs:   a warning on stderr if the patch fails; kubectl output is discarded
# Returns:   0 when the patch was attempted (best-effort, even if it failed);
#            2 when $1 is not "true"/"false" (nothing is patched)
set_selfheal() {
  # $1 is spliced verbatim into the JSON patch, so only accept the two literals.
  case "${1-}" in
    true|false) ;;
    *)
      printf 'set_selfheal: expected "true" or "false", got "%s"\n' "${1-}" >&2
      return 2
      ;;
  esac
  if ! kubectl patch application "$ARGOCD_APP" -n "$ARGOCD_NS" --type merge \
      -p "{\"spec\":{\"syncPolicy\":{\"automated\":{\"selfHeal\":$1,\"prune\":true}}}}" >/dev/null 2>&1; then
    # Stay best-effort (this runs from the EXIT trap too), but do not fail
    # silently: a missed pause lets ArgoCD undo the scale-down, and a missed
    # re-arm leaves the sandbox with self-heal off.
    printf 'WARN: could not set ArgoCD selfHeal=%s on application %s/%s\n' \
      "$1" "$ARGOCD_NS" "$ARGOCD_APP" >&2
  fi
  return 0
}
# restore_state
# Safety net, installed as the EXIT trap by refresh_from_prod. In order:
#   1. re-arm ArgoCD self-heal,
#   2. scale the erp-sandbox deployment in $SB_NS back to 1 replica,
#   3. delete the transient secret.
# The scale step is best-effort: its output is discarded and a failure does
# not stop the secret cleanup from running.
# Globals: SB_NS (read)
restore_state() {
  set_selfheal true
  if ! kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null 2>&1; then
    :  # intentional: keep going so the secret is still removed
  fi
  cleanup_secret
}
refresh_from_prod() {
command -v python3 >/dev/null || die "python3 required to copy the prod secret without exposing it"
trap cleanup_secret EXIT
trap restore_state EXIT
log "Pausing ArgoCD self-heal so the scale-to-0 holds (else it is reverted in seconds)"
set_selfheal false
log "Copying prod DB creds into a transient, read-only-intent secret in $SB_NS (values stay base64)"
kubectl get secret vso-db-credentials -n "$PROD_NS" -o json \
@@ -95,13 +108,18 @@ spec:
# 2. wipe sandbox app objects (everything owned by the app role; infra untouched)
PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -v ON_ERROR_STOP=1 \\
-c "DROP OWNED BY $SB_ROLE CASCADE;"
# 3. restore golden, owned by the sandbox role
PGPASSWORD=\$SB_PGPASSWORD \\
pg_restore -L /tmp/golden.toc --no-owner --role=$SB_ROLE -d $SB_DB /tmp/golden.dump \\
&& echo "restore: clean" || echo "restore: completed with ignorable warnings"
# 4. verify
# 3. restore golden, owned by the sandbox role. MUST pass -U: without it
# pg_restore connects as the container's OS user (root) and auth-fails.
# pg_restore also exits non-zero on the harmless "schema public already
# exists" notice, so its exit code is NOT trustworthy — verify by count below.
Q() { PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -tAc "\$1"; }
echo "llx tables=\$(Q "select count(*) from pg_tables where schemaname='public' and tablename like 'llx_%'") company=\$(Q "select value from llx_const where name='MAIN_INFO_SOCIETE_NOM'") lang=\$(Q "select value from llx_const where name='MAIN_LANG_DEFAULT'") owner=\$(Q "select tableowner from pg_tables where tablename='llx_societe'")"
PGPASSWORD=\$SB_PGPASSWORD \\
pg_restore -h "\$PGHOST" -U "\$SB_PGUSER" -L /tmp/golden.toc --no-owner --role=$SB_ROLE -d $SB_DB /tmp/golden.dump 2>/tmp/restore.err \\
&& echo "restore: clean" || echo "restore: pg_restore rc=\$? — verifying by table count, not exit code"
# 4. verify — FAIL the Job if the restore did not actually populate the schema
N=\$(Q "select count(*) from pg_tables where schemaname='public' and tablename like 'llx_%'")
[ "\$N" -ge 250 ] || { echo "ABORT: only \$N llx tables after restore — restore failed. Last errors:"; tail -5 /tmp/restore.err; exit 1; }
echo "llx tables=\$N company=\$(Q "select value from llx_const where name='MAIN_INFO_SOCIETE_NOM'") lang=\$(Q "select value from llx_const where name='MAIN_LANG_DEFAULT'") owner=\$(Q "select tableowner from pg_tables where tablename='llx_societe'")"
echo "DONE."
EOF
kubectl wait --for=condition=complete job/sandbox-seed -n "$SB_NS" --timeout=300s >/dev/null 2>&1 \
@@ -109,7 +127,8 @@ EOF
kubectl logs -n "$SB_NS" job/sandbox-seed | sed 's/^/ /'
kubectl delete job sandbox-seed -n "$SB_NS" --ignore-not-found >/dev/null 2>&1 || true
log "Scaling erp-sandbox back to 1"
log "Restoring app (replicas=1) + re-arming ArgoCD self-heal"
set_selfheal true
kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null
cleanup_secret; trap - EXIT
log "Refresh complete. Run 'sync-documents' to also copy the company logo + uploads."