Merge pull request 'fix(ops): sandbox refresh-from-prod actually restores (pg_restore -U + self-heal pause)' (#29) from claude/sandbox-lifecycle-restore-fix into main
This commit was merged in pull request #29.
This commit is contained in:
@@ -27,12 +27,28 @@ in `factory postgres/iac/providers.tf`, used **only** in the human-gated
|
|||||||
./sandbox-lifecycle.sh refresh # both, in order
|
./sandbox-lifecycle.sh refresh # both, in order
|
||||||
```
|
```
|
||||||
|
|
||||||
`refresh-from-prod` scales the sandbox pod to 0, dumps the full prod `public`
|
`refresh-from-prod` pauses ArgoCD self-heal on the `erp-sandbox` Application (else
|
||||||
schema (read-only), wipes the sandbox's app objects, restores, and scales back
|
self-heal reverts the scale-down within seconds and the seed would run with the app
|
||||||
up. It dumps the **whole** schema (not just `llx_*`) so app helper functions and
|
still connected), scales the sandbox pod to 0, dumps the full prod `public` schema
|
||||||
their triggers (e.g. `update_modified_column_tms()`) come over; it filters out
|
(read-only), wipes the sandbox's app objects, restores, scales back up and re-arms
|
||||||
the provisioner-owned `user_lookup` pgbouncer function from the restore TOC
|
self-heal. Scaling the app back up and re-arming self-heal are guarded by an EXIT trap, so an
|
||||||
because that object already exists per-environment and is not app data.
|
interrupt can't leave the sandbox scaled to 0 with self-heal off. It dumps the
|
||||||
|
**whole** schema (not just `llx_*`) so app helper functions and their triggers (e.g.
|
||||||
|
`update_modified_column_tms()`) come over; it filters out the provisioner-owned
|
||||||
|
`user_lookup` pgbouncer function from the restore TOC because that object already
|
||||||
|
exists per-environment and is not app data.
|
||||||
|
|
||||||
|
> Note: `pg_restore` runs with an explicit `-U` (the sandbox role) — without it the
|
||||||
|
> postgres image connects as its OS user `root` and auth-fails. Its exit code is not
|
||||||
|
> trusted (it returns non-zero on the harmless "schema public already exists"
|
||||||
|
> notice); success is verified by requiring at least 250 restored `llx_*` tables, and the seed Job fails otherwise.
|
||||||
|
|
||||||
|
> Heads-up: a full refresh is **iso-prod**, so it overwrites `llx_user` with prod's
|
||||||
|
> and wipes the `ai_agent_sandbox` write user + its API key (and resets
|
||||||
|
> `DOLI_INSTANCE_UNIQUE_ID` to prod's, invalidating any prior key). After a refresh,
|
||||||
|
> re-run `test/provisionSandbox.ts` to recreate the agent (it re-grants its rights,
|
||||||
|
> incl. `banque lire`) and refresh the `dolibarr-sandbox-write` skill `.env` from the
|
||||||
|
> new key file.
|
||||||
|
|
||||||
## Two fidelity caveats (by design — see ADR-0003)
|
## Two fidelity caveats (by design — see ADR-0003)
|
||||||
|
|
||||||
|
|||||||
@@ -43,9 +43,22 @@ prod_pod() { kubectl get pod -n "$PROD_NS" -l app.kubernetes.io/instance=erp -o
|
|||||||
|
|
||||||
# Delete the transient prod-credentials secret from the sandbox namespace.
# Best-effort: all output is discarded and a failed delete never propagates.
cleanup_secret() {
  kubectl delete secret "$TMP_PROD_SECRET" -n "$SB_NS" \
    --ignore-not-found >/dev/null 2>&1 || true
}
|
# Delete the transient prod-credentials secret from the sandbox namespace.
# Best-effort: all output is discarded and a failed delete never propagates.
cleanup_secret() {
  kubectl delete secret "$TMP_PROD_SECRET" -n "$SB_NS" \
    --ignore-not-found >/dev/null 2>&1 || true
}
|
||||||
|
|
||||||
|
# erp-sandbox is ArgoCD-managed with self-heal ON, which reverts `kubectl scale
|
||||||
|
# --replicas=0` within seconds — so without pausing it the seed runs while Dolibarr
|
||||||
|
# is still connected, and the restore collides with the app re-creating tables.
|
||||||
|
# Pause self-heal for the duration so the scale-down holds; always re-arm it.
|
||||||
|
ARGOCD_NS="argocd"; ARGOCD_APP="erp-sandbox"
|
||||||
|
# Toggle ArgoCD automated self-heal on the "$ARGOCD_APP" Application.
#   $1 - literal JSON boolean: "true" (re-arm) or "false" (pause).
# Best-effort by design: always returns 0 so it is safe to call from the EXIT
# trap. A bad argument or a failed patch is reported on stderr rather than
# swallowed, because a silent failure to pause lets self-heal revert the
# scale-to-0 while the seed is running.
# NOTE(review): the merge patch also sets "prune": true on every call; confirm
# that matches the Application's intended sync policy.
set_selfheal() {
  case "${1:-}" in
    true|false) ;;
    *)
      # Anything else would be spliced into the JSON and produce an invalid patch.
      echo "set_selfheal: expected 'true' or 'false', got '${1:-}'" >&2
      return 0
      ;;
  esac
  kubectl patch application "$ARGOCD_APP" -n "$ARGOCD_NS" --type merge \
    -p "{\"spec\":{\"syncPolicy\":{\"automated\":{\"selfHeal\":$1,\"prune\":true}}}}" >/dev/null 2>&1 \
    || echo "WARNING: could not set ArgoCD selfHeal=$1 on $ARGOCD_APP (namespace $ARGOCD_NS)" >&2
  return 0
}
|
||||||
|
# Safety net (EXIT trap): whatever happens, bring the app back, re-arm self-heal, drop the secret.
|
||||||
|
# Runs in order: re-arm self-heal, scale the sandbox deployment back to one
# replica (best-effort, output discarded), then delete the transient secret.
restore_state() {
  set_selfheal true
  kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null 2>&1 || true
  cleanup_secret
}
|
||||||
|
|
||||||
refresh_from_prod() {
|
refresh_from_prod() {
|
||||||
command -v python3 >/dev/null || die "python3 required to copy the prod secret without exposing it"
|
command -v python3 >/dev/null || die "python3 required to copy the prod secret without exposing it"
|
||||||
trap cleanup_secret EXIT
|
trap restore_state EXIT
|
||||||
|
|
||||||
|
log "Pausing ArgoCD self-heal so the scale-to-0 holds (else it is reverted in seconds)"
|
||||||
|
set_selfheal false
|
||||||
|
|
||||||
log "Copying prod DB creds into a transient, read-only-intent secret in $SB_NS (values stay base64)"
|
log "Copying prod DB creds into a transient, read-only-intent secret in $SB_NS (values stay base64)"
|
||||||
kubectl get secret vso-db-credentials -n "$PROD_NS" -o json \
|
kubectl get secret vso-db-credentials -n "$PROD_NS" -o json \
|
||||||
@@ -95,13 +108,18 @@ spec:
|
|||||||
# 2. wipe sandbox app objects (everything owned by the app role; infra untouched)
|
# 2. wipe sandbox app objects (everything owned by the app role; infra untouched)
|
||||||
PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -v ON_ERROR_STOP=1 \\
|
PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -v ON_ERROR_STOP=1 \\
|
||||||
-c "DROP OWNED BY $SB_ROLE CASCADE;"
|
-c "DROP OWNED BY $SB_ROLE CASCADE;"
|
||||||
# 3. restore golden, owned by the sandbox role
|
# 3. restore golden, owned by the sandbox role. MUST pass -U: without it
|
||||||
PGPASSWORD=\$SB_PGPASSWORD \\
|
# pg_restore connects as the container's OS user (root) and auth-fails.
|
||||||
pg_restore -L /tmp/golden.toc --no-owner --role=$SB_ROLE -d $SB_DB /tmp/golden.dump \\
|
# pg_restore also exits non-zero on the harmless "schema public already
|
||||||
&& echo "restore: clean" || echo "restore: completed with ignorable warnings"
|
# exists" notice, so its exit code is NOT trustworthy — verify by count below.
|
||||||
# 4. verify
|
|
||||||
Q() { PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -tAc "\$1"; }
|
Q() { PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -tAc "\$1"; }
|
||||||
echo "llx tables=\$(Q "select count(*) from pg_tables where schemaname='public' and tablename like 'llx_%'") company=\$(Q "select value from llx_const where name='MAIN_INFO_SOCIETE_NOM'") lang=\$(Q "select value from llx_const where name='MAIN_LANG_DEFAULT'") owner=\$(Q "select tableowner from pg_tables where tablename='llx_societe'")"
|
PGPASSWORD=\$SB_PGPASSWORD \\
|
||||||
|
pg_restore -h "\$PGHOST" -U "\$SB_PGUSER" -L /tmp/golden.toc --no-owner --role=$SB_ROLE -d $SB_DB /tmp/golden.dump 2>/tmp/restore.err \\
|
||||||
|
&& echo "restore: clean" || echo "restore: pg_restore rc=\$? — verifying by table count, not exit code"
|
||||||
|
# 4. verify — FAIL the Job if the restore did not actually populate the schema
|
||||||
|
N=\$(Q "select count(*) from pg_tables where schemaname='public' and tablename like 'llx_%'")
|
||||||
|
[ "\$N" -ge 250 ] || { echo "ABORT: only \$N llx tables after restore — restore failed. Last errors:"; tail -5 /tmp/restore.err; exit 1; }
|
||||||
|
echo "llx tables=\$N company=\$(Q "select value from llx_const where name='MAIN_INFO_SOCIETE_NOM'") lang=\$(Q "select value from llx_const where name='MAIN_LANG_DEFAULT'") owner=\$(Q "select tableowner from pg_tables where tablename='llx_societe'")"
|
||||||
echo "DONE."
|
echo "DONE."
|
||||||
EOF
|
EOF
|
||||||
kubectl wait --for=condition=complete job/sandbox-seed -n "$SB_NS" --timeout=300s >/dev/null 2>&1 \
|
kubectl wait --for=condition=complete job/sandbox-seed -n "$SB_NS" --timeout=300s >/dev/null 2>&1 \
|
||||||
@@ -109,7 +127,8 @@ EOF
|
|||||||
kubectl logs -n "$SB_NS" job/sandbox-seed | sed 's/^/ /'
|
kubectl logs -n "$SB_NS" job/sandbox-seed | sed 's/^/ /'
|
||||||
kubectl delete job sandbox-seed -n "$SB_NS" --ignore-not-found >/dev/null 2>&1 || true
|
kubectl delete job sandbox-seed -n "$SB_NS" --ignore-not-found >/dev/null 2>&1 || true
|
||||||
|
|
||||||
log "Scaling erp-sandbox back to 1"
|
log "Restoring app (replicas=1) + re-arming ArgoCD self-heal"
|
||||||
|
set_selfheal true
|
||||||
kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null
|
kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null
|
||||||
cleanup_secret; trap - EXIT
|
cleanup_secret; trap - EXIT
|
||||||
log "Refresh complete. Run 'sync-documents' to also copy the company logo + uploads."
|
log "Refresh complete. Run 'sync-documents' to also copy the company logo + uploads."
|
||||||
|
|||||||
Reference in New Issue
Block a user