erp/ops/sandbox/sandbox-lifecycle.sh

#!/usr/bin/env bash
#
# sandbox-lifecycle.sh — seed / refresh the erp-sandbox Dolibarr from prod, and
# sync its uploaded documents, with prod integrity guaranteed structurally.
#
# Implements ADR-0003 (factory vibe/ADR/0003-sandbox-state-lifecycle.md):
#   - prod is read ONLY (pg_dump runs in a default_transaction_read_only session);
#   - the restore writes ONLY to erp-sandbox, using the sandbox's own dynamic
#     credentials (a member of erp_sandbox_role, which owns only the sandbox DB),
#     so it is structurally incapable of touching prod 'erp' (owned by erp_role);
#   - no DROP/CREATE DATABASE, no CREATEDB, no superuser — wipe is
#     `DROP OWNED BY erp_sandbox_role CASCADE`, reload is `pg_restore`.
#
# The only prod-capable credential on the platform is the superuser provider in
# factory postgres/iac, exercised solely in the human-gated postgres.yaml CI —
# this script never uses it.
#
# Requires: kubectl (context on the lab cluster), python3 (used by
# refresh-from-prod to copy the prod secret), and a postgres:16 image
# reachable by the cluster. Run from anywhere.
#
# Usage:
#   ./sandbox-lifecycle.sh refresh-from-prod   # iso-prod seed: prod DB -> erp-sandbox
#   ./sandbox-lifecycle.sh sync-documents      # copy mycompany/ uploads (logo, PDFs)
#   ./sandbox-lifecycle.sh refresh             # refresh-from-prod + sync-documents
#
# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Fixed deployment coordinates. They are declared readonly so that no later code
# path can accidentally repoint the prod/sandbox identifiers the safety of this
# script depends on.
readonly PROD_NS="erp"
readonly SB_NS="erp-sandbox"
readonly PROD_DB="erp"
readonly SB_DB="erp-sandbox"
readonly SB_ROLE="erp_sandbox_role"          # snake-case owner role (ADR-0002 elision rule)
readonly PGHOST="192.168.1.202"              # direct Postgres (NOT pgbouncer — pooler breaks pg_dump)
readonly PG_IMAGE="postgres:16-alpine"
readonly DOC_ROOT="/var/www/documents"       # dolibarr_main_data_root
readonly TMP_PROD_SECRET="prod-db-ro-temp"   # transient copy of prod creds, deleted on exit

# Print a progress line on stdout: a bold-cyan "==>" marker followed by all
# arguments joined into one message.
log() {
  local message="$*"
  printf '\033[1;36m==>\033[0m %s\n' "$message"
}
# Print a bold-red "ABORT:" line with the given message on stderr, then
# terminate the shell with exit status 1.
die() {
  local reason="$*"
  printf '\033[1;31mABORT:\033[0m %s\n' "$reason" >&2
  exit 1
}

# Print the first pod (in `-o name` form, i.e. pod/<name>) listed for the
# erp-sandbox instance label in $SB_NS; prints nothing when none is listed.
# kubectl's stderr is discarded.
sb_pod() {
  kubectl get pod -n "$SB_NS" -l app.kubernetes.io/instance=erp-sandbox -o name 2>/dev/null \
    | head -n 1
}
# Print the first pod (in `-o name` form, i.e. pod/<name>) listed for the
# erp instance label in $PROD_NS; prints nothing when none is listed.
# kubectl's stderr is discarded.
prod_pod() {
  kubectl get pod -n "$PROD_NS" -l app.kubernetes.io/instance=erp -o name 2>/dev/null \
    | head -n 1
}

# Delete the transient copy of the prod credentials from the sandbox namespace.
# Best-effort: all kubectl output is discarded and the function always succeeds,
# so it is safe to call from an EXIT trap.
cleanup_secret() {
  kubectl delete secret "$TMP_PROD_SECRET" -n "$SB_NS" --ignore-not-found \
    >/dev/null 2>&1 || :
}

# ArgoCD manages erp-sandbox with self-heal enabled, and self-heal undoes a
# `kubectl scale --replicas=0` within seconds. If it is left on, the seed runs
# while Dolibarr is still connected and the restore collides with the app
# re-creating its tables. Self-heal is therefore switched off for the duration
# of the seed (so the scale-down sticks) and must always be switched back on.
ARGOCD_NS="argocd"
ARGOCD_APP="erp-sandbox"

# set_selfheal <true|false>
# Merge-patch the ArgoCD Application's automated sync policy, setting selfHeal
# to $1 (prune is always written as true). Best-effort: kubectl output is
# discarded and a failed patch is not reported to the caller.
set_selfheal() {
  local patch_json="{\"spec\":{\"syncPolicy\":{\"automated\":{\"selfHeal\":$1,\"prune\":true}}}}"
  kubectl patch application "$ARGOCD_APP" -n "$ARGOCD_NS" --type merge -p "$patch_json" \
    >/dev/null 2>&1 || true
}
# Safety net, installed as the EXIT trap during a refresh: no matter how the
# script ends, re-arm ArgoCD self-heal, scale the app back to one replica and
# delete the transient prod-credentials secret. The scale is best-effort
# (output discarded, failure ignored) so the later steps always run.
restore_state() {
  set_selfheal true
  kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null 2>&1 || true
  cleanup_secret
}

# refresh_from_prod — reseed the erp-sandbox database from a dump of prod.
#
# Steps, in order:
#   1. pause ArgoCD self-heal and confirm the pause actually took effect;
#   2. copy the prod DB username/password into a transient secret in $SB_NS;
#   3. scale erp-sandbox to 0 and wait for its pods to go away;
#   4. run the `sandbox-seed` Job (pg_dump prod in a read-only session ->
#      DROP OWNED BY $SB_ROLE -> pg_restore into $SB_DB -> verify table count);
#   5. print the Job log, re-arm self-heal, scale back to 1, drop the secret.
#
# Globals read: PROD_NS, SB_NS, PROD_DB, SB_DB, SB_ROLE, PGHOST, PG_IMAGE,
#               TMP_PROD_SECRET, ARGOCD_NS, ARGOCD_APP.
# Exits (via die) if python3 is missing, self-heal cannot be confirmed paused,
# or the seed Job does not reach condition=complete within 300s. While the
# function runs, restore_state is installed as the EXIT trap so every exit path
# brings the app back, re-arms self-heal and removes the transient secret.
refresh_from_prod() {
  command -v python3 >/dev/null || die "python3 required to copy the prod secret without exposing it"
  trap restore_state EXIT

  log "Pausing ArgoCD self-heal so the scale-to-0 holds (else it is reverted in seconds)"
  set_selfheal false
  # set_selfheal ignores kubectl failures, so a failed pause would otherwise go
  # unnoticed and the wipe/restore would run while the app can be rescaled under
  # it. Read the flag back and refuse to continue unless it is really off.
  local selfheal_now
  selfheal_now=$(kubectl get application "$ARGOCD_APP" -n "$ARGOCD_NS" \
    -o jsonpath='{.spec.syncPolicy.automated.selfHeal}' 2>/dev/null) || selfheal_now=""
  [ "$selfheal_now" = "false" ] \
    || die "ArgoCD self-heal on $ARGOCD_APP could not be confirmed paused (selfHeal='$selfheal_now') — refusing to wipe the sandbox while the app can be rescaled"

  log "Copying prod DB creds into a transient, read-only-intent secret in $SB_NS (values stay base64)"
  # The JSON is rewritten in-process (new name/namespace, only username/password
  # kept); the values are passed through still base64-encoded.
  kubectl get secret vso-db-credentials -n "$PROD_NS" -o json \
    | python3 -c "import json,sys; d=json.load(sys.stdin); d['metadata']={'name':'$TMP_PROD_SECRET','namespace':'$SB_NS'}; d.pop('status',None); d['data']={k:d['data'][k] for k in ('username','password')}; print(json.dumps(d))" \
    | kubectl apply -f - >/dev/null

  log "Scaling erp-sandbox to 0 (exclusive DB access for the restore)"
  kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=0 >/dev/null
  # Best-effort wait: its failure (or timeout) is deliberately ignored.
  kubectl wait --for=delete pod -l app.kubernetes.io/instance=erp-sandbox -n "$SB_NS" --timeout=120s >/dev/null 2>&1 || true

  log "Running the seed Job (pg_dump prod read-only -> DROP OWNED -> pg_restore into sandbox)"
  # Remove any leftover Job from a previous run so the apply below creates a fresh one.
  kubectl delete job sandbox-seed -n "$SB_NS" --ignore-not-found >/dev/null 2>&1 || true
  # Unquoted heredoc: $VARS are expanded here by this script, \$VARS are left
  # for the shell inside the Job container.
  kubectl apply -f - >/dev/null <<EOF
apiVersion: batch/v1
kind: Job
metadata: { name: sandbox-seed, namespace: $SB_NS }
spec:
  backoffLimit: 0
  ttlSecondsAfterFinished: 900
  template:
    spec:
      restartPolicy: Never
      containers:
        - name: seed
          image: $PG_IMAGE
          env:
            - { name: PROD_PGUSER,    valueFrom: { secretKeyRef: { name: $TMP_PROD_SECRET,   key: username } } }
            - { name: PROD_PGPASSWORD, valueFrom: { secretKeyRef: { name: $TMP_PROD_SECRET,  key: password } } }
            - { name: SB_PGUSER,      valueFrom: { secretKeyRef: { name: vso-db-credentials, key: username } } }
            - { name: SB_PGPASSWORD,  valueFrom: { secretKeyRef: { name: vso-db-credentials, key: password } } }
            - { name: PGHOST,   value: "$PGHOST" }
            - { name: PGSSLMODE, value: "disable" }
          command: ["/bin/sh","-c"]
          args:
            - |
              set -eu
              SBDB=\$(PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -tAc 'select current_database()')
              [ "\$SBDB" = "$SB_DB" ] || { echo "ABORT: target is '\$SBDB' not $SB_DB"; exit 1; }
              echo "source=$PROD_DB (read-only)  target=$SB_DB  ok"
              # 1. dump prod — full public schema (incl. helper functions + triggers), read-only session
              PGPASSWORD=\$PROD_PGPASSWORD PGOPTIONS='-c default_transaction_read_only=on' \\
                pg_dump -h "\$PGHOST" -U "\$PROD_PGUSER" -d $PROD_DB -n public -Fc -f /tmp/golden.dump
              # drop provisioner-owned infra (pgbouncer user_lookup) from the TOC: it already
              # exists in the sandbox and is not app data, so restoring it conflicts.
              pg_restore -l /tmp/golden.dump | grep -vi 'user_lookup' > /tmp/golden.toc
              echo "dump: \$(ls -l /tmp/golden.dump | awk '{print \$5}') bytes, tables=\$(grep -c 'TABLE DATA ' /tmp/golden.toc)"
              # 2. wipe sandbox app objects (everything owned by the app role; infra untouched)
              PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -v ON_ERROR_STOP=1 \\
                -c "DROP OWNED BY $SB_ROLE CASCADE;"
              # 3. restore golden, owned by the sandbox role. MUST pass -U: without it
              # pg_restore connects as the container's OS user (root) and auth-fails.
              # pg_restore also exits non-zero on the harmless "schema public already
              # exists" notice, so its exit code is NOT trustworthy — verify by count below.
              Q() { PGPASSWORD=\$SB_PGPASSWORD psql -h "\$PGHOST" -U "\$SB_PGUSER" -d $SB_DB -tAc "\$1"; }
              PGPASSWORD=\$SB_PGPASSWORD \\
                pg_restore -h "\$PGHOST" -U "\$SB_PGUSER" -L /tmp/golden.toc --no-owner --role=$SB_ROLE -d $SB_DB /tmp/golden.dump 2>/tmp/restore.err \\
                && echo "restore: clean" || echo "restore: pg_restore rc=\$? — verifying by table count, not exit code"
              # 4. verify — FAIL the Job if the restore did not actually populate the schema
              N=\$(Q "select count(*) from pg_tables where schemaname='public' and tablename like 'llx_%'")
              [ "\$N" -ge 250 ] || { echo "ABORT: only \$N llx tables after restore — restore failed. Last errors:"; tail -5 /tmp/restore.err; exit 1; }
              echo "llx tables=\$N  company=\$(Q "select value from llx_const where name='MAIN_INFO_SOCIETE_NOM'")  lang=\$(Q "select value from llx_const where name='MAIN_LANG_DEFAULT'")  owner=\$(Q "select tableowner from pg_tables where tablename='llx_societe'")"
              echo "DONE."
EOF
  if ! kubectl wait --for=condition=complete job/sandbox-seed -n "$SB_NS" --timeout=300s >/dev/null 2>&1; then
    # Show the Job's own output (its ABORT line, tool errors) before bailing, so
    # the operator sees the cause without a second command. Best-effort: the Job
    # pod may never have started.
    kubectl logs -n "$SB_NS" job/sandbox-seed 2>&1 | sed 's/^/    /' >&2 || true
    die "seed Job did not complete — see: kubectl logs -n $SB_NS job/sandbox-seed"
  fi
  kubectl logs -n "$SB_NS" job/sandbox-seed | sed 's/^/    /'
  kubectl delete job sandbox-seed -n "$SB_NS" --ignore-not-found >/dev/null 2>&1 || true

  log "Restoring app (replicas=1) + re-arming ArgoCD self-heal"
  set_selfheal true
  kubectl scale deploy erp-sandbox -n "$SB_NS" --replicas=1 >/dev/null
  # Everything the trap would do has been done explicitly; disarm it.
  cleanup_secret; trap - EXIT
  log "Refresh complete. Run 'sync-documents' to also copy the company logo + uploads."
}

# sync_documents — copy $DOC_ROOT/mycompany from the prod pod into the sandbox
# pod by piping `tar` between two `kubectl exec` sessions.
#
# Globals read: PROD_NS, SB_NS, DOC_ROOT.
# Exits (via die) when no prod pod is found, the erp-sandbox deployment does not
# become ready within 300s, no sandbox pod is found, or the tar pipe fails.
sync_documents() {
  local pp sp

  # With `set -o pipefail`, a kubectl failure inside the pod helper makes the
  # command substitution fail, and `set -e` would then end the script with no
  # message at all. Map that to "no pod" so the explicit die below fires.
  pp=$(prod_pod) || pp=""
  [ -n "$pp" ] || die "no prod erp pod found"

  # In `refresh` mode this runs right after the sandbox was scaled back from 0,
  # so its pod may not exist or be ready yet. Wait for the rollout first; this
  # returns immediately when the deployment is already up.
  log "Waiting for the erp-sandbox rollout to be ready"
  kubectl rollout status deploy/erp-sandbox -n "$SB_NS" --timeout=300s >/dev/null 2>&1 \
    || die "erp-sandbox deployment is not ready — cannot sync documents"

  sp=$(sb_pod) || sp=""
  [ -n "$sp" ] || die "no erp-sandbox pod found"

  log "Syncing $DOC_ROOT/mycompany (logo + uploads) ${pp##*/} -> ${sp##*/} via tar pipe"
  # The producer's stderr is discarded, so without the explicit die a failed
  # pipe would end the script silently.
  kubectl exec -n "$PROD_NS" "${pp#pod/}" -- tar -C "$DOC_ROOT" -cf - mycompany 2>/dev/null \
    | kubectl exec -i -n "$SB_NS" "${sp#pod/}" -- tar -C "$DOC_ROOT" -xf - \
    || die "document sync failed (tar pipe ${pp##*/} -> ${sp##*/}) — sandbox documents may be incomplete"
  log "Documents synced. (For a one-shot logo only, scope the tar to mycompany/logos.)"
}

# Command-line entry point: route the first argument to its sub-command.
# Anything else (including no argument) prints the usage line on stderr and
# exits with status 2.
if [[ "${1:-}" == "refresh-from-prod" ]]; then
  refresh_from_prod
elif [[ "${1:-}" == "sync-documents" ]]; then
  sync_documents
elif [[ "${1:-}" == "refresh" ]]; then
  refresh_from_prod
  sync_documents
else
  echo "usage: $0 {refresh-from-prod|sync-documents|refresh}" >&2
  exit 2
fi