#!/usr/bin/env bash
# Inspect one email by id and propose a Dolibarr supplier-invoice draft.
#
# Usage:
#   email-inspect.sh <messageId> [--folder PATH]   # default folder: /Inbox/books
#                                [--save-pdf DIR]  # save PDF attachments under DIR/
#                                [--json]          # emit a single JSON object on stdout
#
# Pipeline (read-only):
#   1. Find the message (in the given folder, default /Inbox/books).
#   2. List attachments via /attachmentinfo.
#   3. For each PDF attachment: download, run pdftotext, extract supplier-side
#      heuristics (name, totals, dates, ref).
#   4. Emit a draft "Dolibarr-ready" record per attachment so the operator can
#      hand-create the supplier invoice in the Dolibarr UI.
#
# This skill DOES NOT write to Dolibarr. Auto-creation of supplier invoices is
# a V9 candidate.

# Maintainer notes (not part of --help, which stops at the first non-comment
# line above):
#   * Exit status is 2 for usage/environment errors raised by this script.
#   * Diagnostics and warnings go to stderr so that --json output stays clean.
#   * Kept bash 3.2 compatible (no ${var,,}, no mapfile).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ZOHO_CURL="${SCRIPT_DIR}/zoho-curl.sh"

# Print an error prefixed with the script name and exit with status 2.
die() {
  printf 'email-inspect.sh: %s\n' "$*" >&2
  exit 2
}

# Print a non-fatal warning on stderr.
warn() {
  printf 'email-inspect.sh: warning: %s\n' "$*" >&2
}

# Print the leading comment header of this file (minus the shebang) as help.
usage() {
  awk 'NR == 1 { next } !/^#/ { exit } { sub(/^# ?/, ""); print }' "$0"
}

# Help must be recognised before the first positional is taken as the id.
case "${1:-}" in
  -h|--help) usage; exit 0 ;;
esac

if [[ $# -lt 1 ]]; then
  echo "email-inspect.sh: missing <messageId>" >&2
  echo " Hint: bin/arcodange email list to see candidate ids." >&2
  exit 2
fi
MID="$1"; shift || true
if [[ "${MID}" == -* ]]; then
  die "missing <messageId> (got option '${MID}' instead)"
fi

FOLDER="/Inbox/books"; SAVE_PDF_DIR=""; FMT="text"
while [[ $# -gt 0 ]]; do
  case "$1" in
    --folder)
      [[ $# -ge 2 ]] || die "--folder requires a PATH argument"
      FOLDER="$2"; shift 2 ;;
    --save-pdf)
      [[ $# -ge 2 ]] || die "--save-pdf requires a DIR argument"
      SAVE_PDF_DIR="$2"; shift 2 ;;
    --json)
      FMT="json"; shift ;;
    -h|--help)
      usage; exit 0 ;;
    *)
      echo "email-inspect.sh: unknown arg: $1" >&2; exit 2 ;;
  esac
done

command -v pdftotext >/dev/null || { echo "email-inspect.sh: pdftotext not found (brew install poppler)" >&2; exit 2; }

WORK="$(mktemp -d -t emailinspect.XXXXXX)"
trap 'rm -rf "${WORK}"' EXIT

# 1. accountId + folderId
"${ZOHO_CURL}" /accounts > "${WORK}/accounts.json"
AID=$(python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    doc = json.load(fh)
print((doc.get("data") or [{}])[0].get("accountId", ""))
' "${WORK}/accounts.json")
[[ -n "${AID}" ]] || die "no accountId found in the /accounts response"

"${ZOHO_CURL}" "/accounts/${AID}/folders" > "${WORK}/folders.json"
FID=$(python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    doc = json.load(fh)
target = sys.argv[2]
for folder in (doc.get("data") or []):
    if folder.get("path") == target:
        print(folder.get("folderId"))
        break
' "${WORK}/folders.json" "${FOLDER}")
[[ -z "${FID}" ]] && { echo "email-inspect.sh: folder '${FOLDER}' not found" >&2; exit 2; }

# 2. Find the message in the folder listing (to grab metadata: subject, from, date).
#    Only the first page of the listing (limit=100) is searched.
"${ZOHO_CURL}" "/accounts/${AID}/messages/view?folderId=${FID}&limit=100&sortorder=false&start=1" > "${WORK}/folder_msgs.json"
python3 - "${WORK}/folder_msgs.json" "${MID}" > "${WORK}/meta.json" <<'PY'
import json, sys

with open(sys.argv[1]) as fh:
    d = json.load(fh)
mid = sys.argv[2]
for m in (d.get("data") or []):
    if str(m.get("messageId")) == mid:
        json.dump(m, sys.stdout)
        sys.exit(0)
sys.exit(f"messageId {mid} not found in this folder")
PY

# 3. Attachment metadata
"${ZOHO_CURL}" "/accounts/${AID}/folders/${FID}/messages/${MID}/attachmentinfo" > "${WORK}/attachinfo.json"

# 4. Download each attachment — needs raw bytes (Accept: */*), not the JSON
#    wrapper's default. We bypass zoho-curl.sh for the attachment download but
#    reuse the cached access_token it wrote.
ENV_FILE="${SCRIPT_DIR}/../../dolibarr/.env"
[[ -r "${ENV_FILE}" ]] || die "cannot read ${ENV_FILE}"
# shellcheck source=/dev/null
set -a; source "${ENV_FILE}"; set +a
: "${ZOHO_DC:=eu}"
TOKEN_CACHE="${TMPDIR:-/tmp}/zoho-access-$(whoami)"
if [[ ! -s "${TOKEN_CACHE}" ]]; then
  echo "email-inspect.sh: missing access token cache — run any zoho-curl call first to populate it" >&2
  exit 2
fi
ACCESS_TOKEN=$(cat "${TOKEN_CACHE}")
MAIL_BASE="https://mail.zoho.${ZOHO_DC}/api"

mkdir -p "${WORK}/atts" "${WORK}/text"

# One "attachmentId|attachmentName" record per line.
ATT_IDS=$(python3 -c '
import json, sys
with open(sys.argv[1]) as fh:
    doc = json.load(fh)
data = doc.get("data") or {}
for att in (data.get("attachments") or []):
    att_id = att.get("attachmentId")
    att_name = att.get("attachmentName", "-")
    print(f"{att_id}|{att_name}")
' "${WORK}/attachinfo.json")

while IFS='|' read -r aid aname; do
  [[ -z "${aid}" ]] && continue

  # The attachment name comes from the email and is untrusted: keep only the
  # final path component so it cannot escape ${WORK}/atts (e.g. "../../x").
  safe_name="${aname##*/}"
  case "${safe_name}" in
    ''|.|..) safe_name="attachment-${aid}" ;;
  esac
  outpath="${WORK}/atts/${safe_name}"

  # --fail: on an HTTP error do not store the error body as if it were the
  # attachment. Download stays best-effort: warn and move on.
  # NOTE(review): the token is passed on curl's argv and is visible in `ps`.
  if ! curl -sS --fail \
      -H "Authorization: Zoho-oauthtoken ${ACCESS_TOKEN}" \
      -H "Accept: */*" \
      --max-time 60 \
      -o "${outpath}" \
      "${MAIL_BASE}/accounts/${AID}/folders/${FID}/messages/${MID}/attachments/${aid}"; then
    warn "could not download attachment '${safe_name}' (id ${aid})"
    rm -f -- "${outpath}"
    continue
  fi

  # If pdf, extract text (bash 3.2 compatible — no ${var,,})
  aname_lc=$(printf '%s' "${safe_name}" | tr '[:upper:]' '[:lower:]')
  if [[ "${aname_lc}" == *.pdf ]]; then
    # Strip the extension whatever its case, so the .txt name matches what the
    # render step derives with os.path.splitext().
    pdftotext -layout "${outpath}" "${WORK}/text/${safe_name%.*}.txt" 2>/dev/null \
      || warn "pdftotext could not extract text from '${safe_name}'"
  fi
done <<< "${ATT_IDS}"

# Optional save (PDF attachments only, any extension case)
if [[ -n "${SAVE_PDF_DIR}" ]]; then
  mkdir -p "${SAVE_PDF_DIR}"
  for saved in "${WORK}/atts/"*; do
    [[ -f "${saved}" ]] || continue
    saved_lc=$(printf '%s' "${saved##*/}" | tr '[:upper:]' '[:lower:]')
    [[ "${saved_lc}" == *.pdf ]] || continue
    cp -- "${saved}" "${SAVE_PDF_DIR}/" \
      || warn "could not copy '${saved##*/}' to ${SAVE_PDF_DIR}"
  done
fi

# 5. Heuristic extract + render
python3 - "${WORK}" "${FMT}" <<'PY'
import json, sys, os, re, datetime

work, fmt = sys.argv[1:3]
with open(os.path.join(work, "meta.json")) as fh:
    meta = json.load(fh)

# Timestamps are epoch milliseconds; the date is rendered in the local timezone.
ts = int(meta.get("sentDateInGMT") or meta.get("receivedTime") or 0) // 1000
mail_date = datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d") if ts else None

# Drop angle brackets around the sender address, raw or HTML-escaped.
# NOTE(review): the escaped forms are assumed to be what the API returns — confirm.
mail_from = meta.get("fromAddress") or meta.get("sender") or "-"
for token in ("&lt;", "&gt;", "<", ">"):
    mail_from = mail_from.replace(token, "")
mail_subject = meta.get("subject") or "-"


# Heuristics on PDF text
def extract(text):
    """Return a dict of best-effort invoice fields guessed from pdftotext output.

    Keys: pdf_top_line, total_ht, total_tva, total_ttc, invoice_ref,
    invoice_date_raw, vat_rate_pct. Any field that cannot be guessed is None.
    """
    out = {}
    # First non-empty line is often the supplier name (or the address block first line)
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    out["pdf_top_line"] = lines[0] if lines else None

    def parse_amount(s):
        """Parse a captured amount such as '1 234,56' or '1,234.56'; None if unusable."""
        if not s:
            return None
        clean = re.sub(r'\s+', '', s).rstrip('.,')
        if ',' in clean and '.' in clean:
            # Both separators present: the right-most one is the decimal mark,
            # the other one groups thousands.
            if clean.rfind(',') > clean.rfind('.'):
                clean = clean.replace('.', '').replace(',', '.')
            else:
                clean = clean.replace(',', '')
        else:
            clean = clean.replace(',', '.')
        try:
            v = float(clean)
        except ValueError:
            return None
        # Money amounts < 1M EUR; filters out VAT-number false positives (FR12345678901)
        return v if 0 <= v < 1_000_000 else None

    def first_amount(*patterns):
        """Return the first parsable amount ("%.2f" string) captured by any pattern."""
        for p in patterns:
            for line in lines:
                m = re.search(p, line, re.IGNORECASE)
                if m:
                    v = parse_amount(m.group(1))
                    if v is not None:
                        return f"{v:.2f}"
        return None

    # Total TTC / HT / TVA — French/English labels. \s in the capture class also
    # accepts the non-breaking spaces often used as thousands separators.
    out["total_ht"] = first_amount(r'(?:total\s*ht|montant\s*ht|net\s*amount|subtotal)[^\d-]*([\d\s.,]+)')
    # TVA: require currency suffix to avoid matching VAT-number digits.
    # \b applies to "eur" only: a word boundary can never follow "€" at end of
    # line or before a space.
    out["total_tva"] = first_amount(r'(?:tva|vat)[^\d-]*([\d\s.,]+)\s*(?:€|eur\b)')
    out["total_ttc"] = first_amount(r'(?:total\s*ttc|amount\s*due|total\s*due|grand\s*total|montant\s*total|amount\s*paid)[^\d-]*([\d\s.,]+)')

    # Invoice ref — must contain a digit (filters "Invoice", plain words, etc.).
    # "number" is tried before "n[°o]?" so "Invoice Number 123" does not capture
    # "umber"; every labelled occurrence is tried before falling back.
    ref_pattern = r'(?:facture|invoice|receipt|reçu)\s*(?:number|n[°o]?|#|:)\s*[:#]?\s*([A-Za-z0-9][\w/-]{2,})'
    ref = None
    for m in re.finditer(ref_pattern, text, re.IGNORECASE):
        if any(c.isdigit() for c in m.group(1)):
            ref = m.group(1)
            break
    if ref is None:
        # Fallback: first ref-shaped token anywhere (2+ capitals, then digits)
        m = re.search(r'\b([A-Z]{2,}[-/]?\d[\w\d/-]{2,})\b', text)
        ref = m.group(1) if m else None
    out["invoice_ref"] = ref

    # Invoice date — try ISO, then a labelled numeric date, then a labelled long form
    out["invoice_date_raw"] = None
    for p in (
        r'\b(\d{4}-\d{2}-\d{2})\b',
        r'(?:date|émise\s*le|invoice\s*date|date\s*de\s*facturation)[:\s]*(\d{1,2}[\s/.-]\d{1,2}[\s/.-]\d{2,4})',
        r'(?:date|émise\s*le|invoice\s*date)[:\s]*(\d{1,2}\s+\w{3,9}\.?\s+\d{4})',
    ):
        m = re.search(p, text, re.IGNORECASE)
        if m:
            out["invoice_date_raw"] = m.group(1).strip()
            break

    # VAT rate (e.g. "20%") — restrict to 0-25% so "100%" / page footers don't match.
    vrate = None
    for line in lines:
        m = re.search(r'\b(\d{1,2}([.,]\d+)?)\s*%', line)
        if m:
            v = float(m.group(1).replace(",", "."))
            if 0 <= v <= 25:
                vrate = m.group(1).replace(",", ".")
                break
    out["vat_rate_pct"] = vrate
    return out


# Downloaded PDF attachments, matched on extension regardless of case.
atts_dir = os.path.join(work, "atts")
pdf_names = sorted(
    n for n in os.listdir(atts_dir)
    if n.lower().endswith(".pdf") and not n.startswith(".")
)

pdfs = []
for name in pdf_names:
    pdf = os.path.join(atts_dir, name)
    txt_path = os.path.join(work, "text", os.path.splitext(name)[0] + ".txt")
    text = ""
    if os.path.isfile(txt_path):
        # Decode explicitly so a non-UTF-8 locale cannot make the read fail.
        with open(txt_path, encoding="utf-8", errors="replace") as fh:
            text = fh.read()
    h = extract(text)
    h["attachment_name"] = name
    h["pdf_size_bytes"] = os.path.getsize(pdf)
    h["pdf_text_len"] = len(text)
    pdfs.append(h)

result = {
    "email": {
        "messageId": meta.get("messageId"),
        "subject": mail_subject,
        "from": mail_from,
        "date": mail_date,
        "hasAttachment": str(meta.get("hasAttachment", "")) == "1",
    },
    "attachments": pdfs,
    "dolibarr_draft_suggestions": [
        {
            "supplier_hint": p.get("pdf_top_line"),
            "invoice_ref": p.get("invoice_ref"),
            "invoice_date": p.get("invoice_date_raw"),
            "total_ht": p.get("total_ht"),
            "total_tva": p.get("total_tva"),
            "total_ttc": p.get("total_ttc"),
            "vat_rate_pct": p.get("vat_rate_pct"),
            "source_email": meta.get("messageId"),
            "source_attachment": p.get("attachment_name"),
        }
        for p in pdfs
    ]
}

if fmt == "json":
    print(json.dumps(result, indent=2, ensure_ascii=False))
    sys.exit(0)

print("=" * 80)
print(f" Email {meta.get('messageId')}")
print("=" * 80)
print(f" subject : {mail_subject}")
print(f" from : {mail_from}")
print(f" date : {mail_date}")
print(f" attached : {result['email']['hasAttachment']}")
print()
if not pdfs:
    print(" (no PDF attachments — try inspecting body or other types)")
for i, p in enumerate(pdfs, 1):
    print(f" -- Attachment {i}: {p['attachment_name']} ({p['pdf_size_bytes']} bytes, {p['pdf_text_len']} chars extracted) --")
    for k in ("pdf_top_line","invoice_ref","invoice_date_raw","total_ht","total_tva","total_ttc","vat_rate_pct"):
        v = p.get(k)
        print(f" {k:<16} = {v!r}")
    print()
print(" Suggested Dolibarr supplier-invoice draft entries:")
print(json.dumps(result["dolibarr_draft_suggestions"], indent=4, ensure_ascii=False))
PY