erp/.claude/skills/arcodange-email-ingest/scripts/email-inspect.sh

#!/usr/bin/env bash
# Inspect one email by id and propose a Dolibarr supplier-invoice draft.
#
# Usage:
#   email-inspect.sh <messageId> [--folder PATH]    # default folder: /Inbox/books
#                                [--save-pdf DIR]   # save PDF attachments under DIR/
#                                [--json]           # emit a single JSON object on stdout
#
# Pipeline (read-only):
#   1. Find the message (in the given folder, default /Inbox/books).
#   2. List attachments via /attachmentinfo.
#   3. For each PDF attachment: download, run pdftotext, extract supplier-side
#      heuristics (name, totals, dates, ref).
#   4. Emit a draft "Dolibarr-ready" record per attachment so the operator can
#      hand-create the supplier invoice in the Dolibarr UI.
#
# This skill DOES NOT write to Dolibarr. Auto-creation of supplier invoices is
# a V9 candidate.

set -euo pipefail

# Sibling helper used for every JSON API call.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ZOHO_CURL="${SCRIPT_DIR}/zoho-curl.sh"

# --- Argument parsing ---------------------------------------------------------
# Positional: <messageId>. Options: --folder PATH, --save-pdf DIR, --json, -h.
if [[ $# -lt 1 ]]; then
  echo "email-inspect.sh: missing <messageId>" >&2
  echo "  Hint: bin/arcodange email list  to see candidate ids." >&2
  exit 2
fi
# Honour -h/--help even when it is given in place of the message id; otherwise
# it would be taken as the id and trigger API calls.
case "$1" in
  -h|--help) sed -n '2,18p' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;;
esac
MID="$1"; shift
FOLDER="/Inbox/books"; SAVE_PDF_DIR=""; FMT="text"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --folder|--save-pdf)
      # Under `set -u` a missing value would abort with an "unbound variable"
      # message; report it as a usage error instead.
      if [[ $# -lt 2 ]]; then
        echo "email-inspect.sh: $1 requires a value" >&2
        exit 2
      fi
      if [[ "$1" == "--folder" ]]; then
        FOLDER="$2"
      else
        SAVE_PDF_DIR="$2"
      fi
      shift 2 ;;
    --json)     FMT="json"; shift ;;
    -h|--help)  sed -n '2,18p' "$0" | sed 's/^# \{0,1\}//'; exit 0 ;;
    *) echo "email-inspect.sh: unknown arg: $1" >&2; exit 2 ;;
  esac
done

# pdftotext is required for the text-extraction step.
command -v pdftotext >/dev/null || { echo "email-inspect.sh: pdftotext not found (brew install poppler)" >&2; exit 2; }

# Private scratch directory, removed on every exit path.
WORK="$(mktemp -d -t emailinspect.XXXXXX)"
trap 'rm -rf "${WORK}"' EXIT

# 1. accountId + folderId
# The first entry of /accounts provides the account id; the folder id is the
# entry of /accounts/<id>/folders whose 'path' equals ${FOLDER}.
"${ZOHO_CURL}" /accounts > "${WORK}/accounts.json"
AID=$(python3 -c "import json,sys; d=json.load(open(sys.argv[1])); print((d.get('data') or [{}])[0].get('accountId',''))" "${WORK}/accounts.json")
# Stop here rather than querying a malformed '/accounts//folders' URL.
if [[ -z "${AID}" ]]; then
  echo "email-inspect.sh: no accountId returned by /accounts" >&2
  exit 2
fi
"${ZOHO_CURL}" "/accounts/${AID}/folders" > "${WORK}/folders.json"
FID=$(python3 -c "
import json, sys
d = json.load(open(sys.argv[1]))
target = sys.argv[2]
for f in (d.get('data') or []):
    if f.get('path') == target:
        print(f.get('folderId')); break" "${WORK}/folders.json" "${FOLDER}")
if [[ -z "${FID}" ]]; then
  echo "email-inspect.sh: folder '${FOLDER}' not found" >&2
  exit 2
fi

# 2. Locate the message in the folder listing and keep its metadata record
#    (subject, from, date) as meta.json. Only the first 100 entries of the
#    listing are requested; a missing id aborts the script (non-zero exit).
"${ZOHO_CURL}" "/accounts/${AID}/messages/view?folderId=${FID}&limit=100&sortorder=false&start=1" \
  > "${WORK}/folder_msgs.json"
python3 - "${WORK}/folder_msgs.json" "${MID}" <<'PY' > "${WORK}/meta.json"
import json
import sys

listing_path, wanted = sys.argv[1], sys.argv[2]
with open(listing_path) as fh:
    listing = json.load(fh)

# First entry whose messageId (compared as a string) equals the requested id.
hit = next(
    (msg for msg in (listing.get("data") or []) if str(msg.get("messageId")) == wanted),
    None,
)
if hit is None:
    sys.exit(f"messageId {wanted} not found in this folder")
json.dump(hit, sys.stdout)
PY

# 3. Attachment metadata
"${ZOHO_CURL}" "/accounts/${AID}/folders/${FID}/messages/${MID}/attachmentinfo" \
  > "${WORK}/attachinfo.json"

# 4. Prepare direct downloads. Attachments must be fetched as raw bytes
# (Accept: */*), so the download does not go through zoho-curl.sh; it reuses
# the access token found in the per-user cache file instead.
set -a
source "${SCRIPT_DIR}/../../dolibarr/.env"
set +a
# Data-centre suffix of the mail host; falls back to "eu" when unset or empty.
: "${ZOHO_DC:=eu}"
TOKEN_CACHE="${TMPDIR:-/tmp}/zoho-access-$(whoami)"
[[ -s "${TOKEN_CACHE}" ]] || {
  echo "email-inspect.sh: missing access token cache — run any zoho-curl call first to populate it" >&2
  exit 2
}
ACCESS_TOKEN=$(cat "${TOKEN_CACHE}")
MAIL_BASE="https://mail.zoho.${ZOHO_DC}/api"

# Download every attachment into ${WORK}/atts and, for PDFs, write the
# extracted text to ${WORK}/text/<stem>.txt.
mkdir -p "${WORK}/atts" "${WORK}/text"
# One "attachmentId|attachmentName" line per attachment.
ATT_IDS=$(python3 -c "
import json, sys
d = json.load(open(sys.argv[1]))
data = d.get('data') or {}
for a in (data.get('attachments') or []):
    print(f\"{a.get('attachmentId')}|{a.get('attachmentName','-')}\")" "${WORK}/attachinfo.json")
while IFS='|' read -r aid aname; do
  [[ -z "${aid}" ]] && continue
  # The attachment name is chosen by the sender of the email: keep only its
  # last path component so that '../' or '/' cannot make us write outside
  # ${WORK}/atts.
  safe_name="${aname##*/}"
  case "${safe_name}" in
    ""|.|..) safe_name="attachment-${aid}" ;;
  esac
  outpath="${WORK}/atts/${safe_name}"
  # --fail: on an HTTP error, do not store the error body as if it were the
  # attachment. A failed download is reported and skipped (best effort).
  if ! curl -sS --fail \
    -H "Authorization: Zoho-oauthtoken ${ACCESS_TOKEN}" \
    -H "Accept: */*" \
    --max-time 60 \
    -o "${outpath}" \
    "${MAIL_BASE}/accounts/${AID}/folders/${FID}/messages/${MID}/attachments/${aid}"; then
    echo "email-inspect.sh: download failed for attachment '${aname}' (id ${aid}) — skipped" >&2
    rm -f -- "${outpath}"
    continue
  fi
  # If pdf, extract text (bash 3.2 compatible — no ${var,,})
  name_lc=$(printf '%s' "${safe_name}" | tr '[:upper:]' '[:lower:]')
  if [[ "${name_lc}" == *.pdf ]]; then
    # Strip the extension whatever its case, so "X.PDF" yields "X.txt" just
    # like "X.pdf" does. Extraction itself stays best effort.
    pdftotext -layout "${outpath}" "${WORK}/text/${safe_name%.*}.txt" 2>/dev/null || true
  fi
done <<< "${ATT_IDS}"

# Optional save: copy every downloaded PDF (extension matched
# case-insensitively, like the PDF detection in the download step) to
# SAVE_PDF_DIR.
if [[ -n "${SAVE_PDF_DIR}" ]]; then
  mkdir -p -- "${SAVE_PDF_DIR}"
  for att_path in "${WORK}/atts/"*; do
    # With no attachment the glob stays literal; -f filters that case out.
    [[ -f "${att_path}" ]] || continue
    att_lc=$(printf '%s' "${att_path##*/}" | tr '[:upper:]' '[:lower:]')
    [[ "${att_lc}" == *.pdf ]] || continue
    # A copy failure is reported but does not abort the inspection.
    cp -- "${att_path}" "${SAVE_PDF_DIR}/" \
      || echo "email-inspect.sh: could not save '${att_path##*/}' to '${SAVE_PDF_DIR}'" >&2
  done
fi

# 5. Heuristic extraction + rendering: the Python program below reads the
#    scratch directory (argv[1]) and prints the report in format argv[2].
python3 - "${WORK}" "${FMT}" <<'PY'
import datetime
import glob
import json
import os
import re
import sys

work, fmt = sys.argv[1], sys.argv[2]

# Metadata record of the message, saved earlier as meta.json.
with open(os.path.join(work, "meta.json")) as meta_fh:
    meta = json.load(meta_fh)

# The timestamp is treated as epoch milliseconds and rendered as a date in
# the local time zone of this machine.
raw_ms = meta.get("sentDateInGMT") or meta.get("receivedTime") or 0
ts = int(raw_ms) // 1000
if ts:
    mail_date = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d")
else:
    mail_date = None

# Sender with angle brackets removed, whether HTML-escaped or literal.
mail_from = meta.get("fromAddress") or meta.get("sender") or "-"
for old, new in (("&lt;", "<"), ("&gt;", ">"), ("<", ""), (">", "")):
    mail_from = mail_from.replace(old, new)

mail_subject = meta.get("subject") or "-"

# Heuristics on PDF text
def extract(text):
    """Guess supplier-invoice fields from the plain text of one PDF.

    Args:
        text: text extracted from a single attachment (may be empty).

    Returns:
        dict with the keys pdf_top_line, total_ht, total_tva, total_ttc,
        invoice_ref, invoice_date_raw and vat_rate_pct. Amounts are strings
        with two decimals, vat_rate_pct is a string such as "20" or "5.5";
        any field that could not be guessed is None.
    """
    out = {}
    # First non-empty line is often the supplier name (or the address block first line)
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    out["pdf_top_line"] = lines[0] if lines else None

    # Amounts are searched on a copy where (narrow) no-break spaces, commonly
    # used as thousands separators, are plain spaces; otherwise "1<NBSP>234,56"
    # would be read as "1".
    amount_lines = [l.replace("\u00a0", " ").replace("\u202f", " ") for l in lines]

    def parse_amount(s):
        """Convert a captured amount to float; None if unparsable or implausible."""
        if not s:
            return None
        clean = s.replace(" ", "")
        if "," in clean and "." in clean:
            # Both separators present: the right-most one is the decimal mark,
            # the other one groups thousands ("1.234,56" or "1,234.56").
            if clean.rfind(",") > clean.rfind("."):
                clean = clean.replace(".", "").replace(",", ".")
            else:
                clean = clean.replace(",", "")
        else:
            clean = clean.replace(",", ".")
        try:
            v = float(clean)
        except ValueError:
            return None
        # Money amounts < 1M EUR; filters out VAT-number false positives (FR12345678901)
        return v if 0 <= v < 1_000_000 else None

    def first_amount(*patterns):
        """Return the first parsable amount matched by any pattern, as 'x.xx'."""
        for p in patterns:
            for line in amount_lines:
                m = re.search(p, line, re.IGNORECASE)
                if m:
                    v = parse_amount(m.group(1))
                    if v is not None:
                        return f"{v:.2f}"
        return None

    # Total TTC / HT / TVA — French and English labels
    out["total_ht"]  = first_amount(r'(?:total\s*ht|montant\s*ht|net\s*amount|subtotal)[^\d-]*([\d \.,]+)')
    # TVA: require currency suffix to avoid matching VAT-number digits.
    # The word boundary applies to "eur" only: "€" is not a word character, so
    # a boundary after it would reject "20,00 €" at end of line.
    out["total_tva"] = first_amount(r'(?:tva|vat)[^\d-]*([\d \.,]+)\s*(?:€|eur\b)')
    out["total_ttc"] = first_amount(r'(?:total\s*ttc|amount\s*due|total\s*due|grand\s*total|montant\s*total|amount\s*paid)[^\d-]*([\d \.,]+)')

    # Invoice ref — must contain a digit (filters "Invoice", "Thank", etc.).
    # "number" is tried before "n"/"no"/"n°" so that "Invoice number 123" does
    # not capture "umber"; every labelled occurrence is examined in turn.
    out["invoice_ref"] = None
    labelled = r'(?:facture|invoice|receipt|reçu)\s*(?:number|n[°o]?|#|:)\s*([A-Za-z0-9][\w/-]{2,})'
    for m in re.finditer(labelled, text, re.IGNORECASE):
        if any(c.isdigit() for c in m.group(1)):
            out["invoice_ref"] = m.group(1)
            break
    if out["invoice_ref"] is None:
        # Fallback: first ref-shaped token (2+ capitals followed by digits) anywhere
        m = re.search(r'\b([A-Z]{2,}[-/]?\d[\w\d/-]{2,})\b', text)
        out["invoice_ref"] = m.group(1) if m else None

    # Invoice date — ISO first, then labelled numeric dates, then labelled long form
    out["invoice_date_raw"] = None
    for p in (
        r'\b(\d{4}-\d{2}-\d{2})\b',
        r'(?:date|émise\s*le|invoice\s*date|date\s*de\s*facturation)[:\s]*(\d{1,2}[\s/.-]\d{1,2}[\s/.-]\d{2,4})',
        r'(?:date|émise\s*le|invoice\s*date)[:\s]*(\d{1,2}\s+\w{3,9}\.?\s+\d{4})',
    ):
        m = re.search(p, text, re.IGNORECASE)
        if m:
            out["invoice_date_raw"] = m.group(1).strip()
            break

    # VAT rate (e.g. "20%") — restrict to 0-25% so "100%" / page footers don't match.
    vrate = None
    for line in lines:
        m = re.search(r'\b(\d{1,2}([.,]\d+)?)\s*%', line)
        if m:
            v = float(m.group(1).replace(",", "."))
            if 0 <= v <= 25:
                vrate = m.group(1).replace(",", ".")
                break
    out["vat_rate_pct"] = vrate

    return out

# Collect one heuristic record per PDF attachment. The extension test is
# case-insensitive so that names such as "Invoice.Pdf" are reported too.
atts_dir = os.path.join(work, "atts")
text_dir = os.path.join(work, "text")
att_names = sorted(os.listdir(atts_dir)) if os.path.isdir(atts_dir) else []

pdfs = []
for name in att_names:
    pdf = os.path.join(atts_dir, name)
    if not (name.lower().endswith(".pdf") and os.path.isfile(pdf)):
        continue
    # The extracted text may be stored as "<stem>.txt" or, when the extension
    # was not stripped, as "<full name>.txt"; accept either.
    text = ""
    for candidate in (os.path.splitext(name)[0] + ".txt", name + ".txt"):
        txt_path = os.path.join(text_dir, candidate)
        if os.path.isfile(txt_path):
            # Decode explicitly as UTF-8 instead of relying on the locale, and
            # never fail the whole report on a stray byte.
            with open(txt_path, encoding="utf-8", errors="replace") as fh:
                text = fh.read()
            break
    h = extract(text)
    h["attachment_name"] = name
    h["pdf_size_bytes"] = os.path.getsize(pdf)
    h["pdf_text_len"] = len(text)
    pdfs.append(h)

# Full result: email summary, per-attachment heuristics, and one draft
# suggestion per PDF for manual entry.
result = {
    "email": {
        "messageId": meta.get("messageId"),
        "subject":   mail_subject,
        "from":      mail_from,
        "date":      mail_date,
        "hasAttachment": str(meta.get("hasAttachment","")) == "1",
    },
    "attachments": pdfs,
    "dolibarr_draft_suggestions": [
        {
            "supplier_hint":  p.get("pdf_top_line"),
            "invoice_ref":    p.get("invoice_ref"),
            "invoice_date":   p.get("invoice_date_raw"),
            "total_ht":       p.get("total_ht"),
            "total_tva":      p.get("total_tva"),
            "total_ttc":      p.get("total_ttc"),
            "vat_rate_pct":   p.get("vat_rate_pct"),
            "source_email":   meta.get("messageId"),
            "source_attachment": p.get("attachment_name"),
        } for p in pdfs
    ]
}

# --json: a single JSON object on stdout, nothing else.
if fmt == "json":
    print(json.dumps(result, indent=2, ensure_ascii=False))
    sys.exit(0)

# Human-readable report.
print("=" * 80)
print(f" Email {meta.get('messageId')}")
print("=" * 80)
print(f"  subject   : {mail_subject}")
print(f"  from      : {mail_from}")
print(f"  date      : {mail_date}")
print(f"  attached  : {result['email']['hasAttachment']}")
print()
if not pdfs:
    print("  (no PDF attachments — try inspecting body or other types)")
for i, p in enumerate(pdfs, 1):
    print(f"  -- Attachment {i}: {p['attachment_name']} ({p['pdf_size_bytes']} bytes, {p['pdf_text_len']} chars extracted) --")
    for k in ("pdf_top_line","invoice_ref","invoice_date_raw","total_ht","total_tva","total_ttc","vat_rate_pct"):
        v = p.get(k)
        print(f"    {k:<16} = {v!r}")
    print()

print("  Suggested Dolibarr supplier-invoice draft entries:")
print(json.dumps(result["dolibarr_draft_suggestions"], indent=4, ensure_ascii=False))
PY