Files
Gabriel Radureau 1d38f25c23 arcodange-email-ingest V8.1: filter calendar invites + newsletter senders
email-list.sh gains two hard-exclusion filters (applied before the
candidate test, regardless of attachments):

- EXCLUDE_PATTERN matches subjects starting with Invitation: / Updated
  invitation: / Canceled event: / Accepted: / Declined: / Tentative: /
  Maybe: (after stripping Re:/Fwd:/Tr: prefixes). Filters Google Calendar
  events that always carry an .ics attachment.
- EXCLUDE_SENDER matches updates.<domain>, noreply@*calendar, news@,
  newsletter@. Filters newsletter blast traffic.

Effect on --all-folders --candidates-only baseline: 27 noisy → 12
actionable (calendar invites + the staying-ahead.ai newsletter blast
removed). Real supplier docs intact: Darnis F1042 in /Notification, 3 Free
Mobile factures in /Inbox/abonnements, Mistral + Anthropic in /Inbox/books.

The originally-planned --mark-ingested feature is deferred to V8.2:
flag-set requires the Zoho OAuth scope ZohoMail.messages.UPDATE which our
read-only refresh_token doesn't have. Documented in SKILL.md: once the
user opts in to the wider scope, --mark-ingested becomes a one-line flag
on email-inspect.sh and is_candidate() learns to skip flag_info messages.

Captured the new --all-folders baseline at examples/email-list-all-folders.txt.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-31 15:18:31 +02:00

142 lines
5.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# List candidate supplier-invoice emails from the books@ Zoho mailbox.
#
# Usage:
# email-list.sh [--folder PATH] # default: /Inbox/books (the books@ alias-filtered folder)
# [--limit N] # default: 30
# [--candidates-only] # filter by subject pattern OR attachment
# [--all-folders] # scan every folder (slow, lots of API calls)
#
# Output: table with mid, date, from, subject, hasAttachment.
# A "candidate" is a message whose subject matches a supplier-like pattern
# (facture/invoice/receipt/reçu/payment/paiement/abonnement/subscription/order/
# commande/bill) OR has an attachment, minus calendar invites and newsletters.
set -euo pipefail

# Location of this script; the Zoho API helper is expected to sit next to it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ZOHO_CURL="${SCRIPT_DIR}/zoho-curl.sh"

# Defaults, overridable from the command line.
FOLDER="/Inbox/books"
LIMIT=30
CANDIDATES_ONLY=0
ALL_FOLDERS=0

#######################################
# Parse command-line options into the global settings.
# Globals:   FOLDER, LIMIT, CANDIDATES_ONLY, ALL_FOLDERS (written)
# Arguments: the script's command-line arguments
# Outputs:   usage text on stdout for -h/--help; diagnostics on stderr
# Returns:   0 on success; exits 0 after --help; exits 2 on an unknown
#            option, a missing option value, or a non-numeric --limit
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --folder)
        # Without this guard, `set -u` aborts on "$2" with a cryptic
        # "unbound variable" message instead of a usage error.
        if [[ $# -lt 2 ]]; then
          echo "email-list.sh: --folder requires a value" >&2
          exit 2
        fi
        FOLDER="$2"
        shift 2
        ;;
      --limit)
        if [[ $# -lt 2 ]]; then
          echo "email-list.sh: --limit requires a value" >&2
          exit 2
        fi
        # LIMIT is spliced into the request URL, so reject anything that is
        # not a plain positive integer up front.
        if [[ ! "$2" =~ ^[1-9][0-9]*$ ]]; then
          echo "email-list.sh: --limit expects a positive integer, got '$2'" >&2
          exit 2
        fi
        LIMIT="$2"
        shift 2
        ;;
      --candidates-only)
        CANDIDATES_ONLY=1
        shift
        ;;
      --all-folders)
        ALL_FOLDERS=1
        shift
        ;;
      -h|--help)
        # Print the header comment of this file with the leading "# " removed.
        sed -n '2,12p' "$0" | sed 's/^# \{0,1\}//'
        exit 0
        ;;
      *)
        echo "email-list.sh: unknown arg: $1" >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# Scratch directory holding the raw API responses; the EXIT trap removes it
# on every exit path.
WORK="$(mktemp -d -t emailist.XXXXXX)"
trap 'rm -rf "${WORK}"' EXIT

# 1. Discover accountId
# Take the accountId of the first entry in the /accounts response; an empty
# or missing "data" list yields an empty string, which is rejected below.
"${ZOHO_CURL}" /accounts > "${WORK}/accounts.json"
AID=$(
  python3 -c "
import json, sys
payload = json.load(open(sys.argv[1]))
accounts = payload.get('data') or [{}]
print(accounts[0].get('accountId', ''))
" "${WORK}/accounts.json"
)
if [[ -z "${AID}" ]]; then
  echo "email-list.sh: no accountId in /accounts response" >&2
  exit 1
fi
# 2. Resolve folder path → folderId
"${ZOHO_CURL}" "/accounts/${AID}/folders" > "${WORK}/folders.json"

# Build the list of folders to scan as newline-separated "folderId|path"
# records in FOLDER_IDS.
if [[ "${ALL_FOLDERS}" == "1" ]]; then
  # Every folder except the system folders listed below.
  FOLDER_IDS=$(python3 -c "
import json, sys
d = json.load(open(sys.argv[1]))
for f in (d.get('data') or []):
    fid = f.get('folderId'); path = f.get('path') or f.get('folderName','-')
    # Skip noisy system folders
    if path in ('/Drafts','/Templates','/Snoozed','/Sent','/Spam','/Trash','/Outbox'): continue
    print(f\"{fid}|{path}\")" "${WORK}/folders.json")
else
  # Only the folder whose path is exactly ${FOLDER}; first match wins.
  FOLDER_IDS=$(python3 -c "
import json, sys
d = json.load(open(sys.argv[1]))
target = sys.argv[2]
for f in (d.get('data') or []):
    if f.get('path') == target:
        print(f\"{f.get('folderId')}|{f.get('path')}\")
        break" "${WORK}/folders.json" "${FOLDER}")
  if [[ -z "${FOLDER_IDS}" ]]; then
    echo "email-list.sh: folder '${FOLDER}' not found. Available:" >&2
    # "or []" (rather than a .get default) also covers an explicit
    # "data": null, which would otherwise raise TypeError here and make the
    # script die with a traceback instead of reaching the exit 2 below.
    python3 -c "import json,sys; [print(f' {f.get(\"path\",\"-\")}') for f in (json.load(open(sys.argv[1])).get('data') or [])]" "${WORK}/folders.json" >&2
    exit 2
  fi
fi
# 3. Fetch messages per folder

#######################################
# Download the message listing of every folder named in FOLDER_IDS.
# For the Nth non-empty record, the API response is written to
# ${WORK}/msgs/NNN.json and the folder path to ${WORK}/msgs/NNN.json.path.
# Globals:   WORK, ZOHO_CURL, AID, LIMIT, FOLDER_IDS (read); COUNT (written)
# Arguments: none
# Outputs:   one warning per failed fetch on stderr
# Returns:   0; a failed fetch is downgraded to an empty listing
#######################################
fetch_folder_messages() {
  local fid fpath out
  mkdir -p "${WORK}/msgs"
  COUNT=0
  while IFS='|' read -r fid fpath; do
    [[ -z "${fid}" ]] && continue
    COUNT=$((COUNT + 1))
    out="${WORK}/msgs/$(printf '%03d' "${COUNT}").json"
    # Best effort: one unreachable folder must not abort the whole scan, but
    # the user should know that its (empty) listing is not trustworthy.
    if ! "${ZOHO_CURL}" "/accounts/${AID}/messages/view?folderId=${fid}&limit=${LIMIT}&sortorder=false&start=1" > "${out}" 2>/dev/null; then
      printf 'email-list.sh: warning: could not fetch messages for folder %s; treating it as empty\n' "${fpath}" >&2
      printf '%s\n' '{"data":[]}' > "${out}"
    fi
    printf '%s\n' "${fpath}" > "${out}.path"
  done <<< "${FOLDER_IDS}"
}
fetch_folder_messages
# 4. Render

#######################################
# Print the message table built from the fetched listings.
# Arguments: $1 - directory holding NNN.json responses and their NNN.json.path
#                 companions (one pair per scanned folder)
#            $2 - "1" to list only candidate messages, anything else for all
# Outputs:   the table on stdout; a warning on stderr for each listing that
#            cannot be read or parsed (that listing is skipped)
# Returns:   exit status of the embedded python3 program
#######################################
render_messages() {
  python3 - "$1" "$2" <<'PY'
import datetime, glob, json, os, re, sys

msgs_dir, candidates_only_str = sys.argv[1:3]
candidates_only = candidates_only_str == "1"

# Supplier-like subject keywords (French and English).
CANDIDATE_PATTERN = re.compile(
    r'facture|invoice|receipt|re[cç]u|payment|paiement|abonnement|subscription|order|commande|bill',
    re.IGNORECASE,
)
# Subjects that look like calendar invites / event updates / generic notifications
# get filtered out of --candidates-only — they always have a .ics attachment so
# the "has-attachment" heuristic alone catches them as false positives.
EXCLUDE_PATTERN = re.compile(
    r'^(?:re:\s*|fwd:\s*|tr:\s*)*'  # strip Re:/Fwd:/Tr: prefixes
    r'(?:invitation|updated\s+invitation|canceled\s+event|accepted|declined|tentative|maybe)\s*:',
    re.IGNORECASE,
)
# Senders that are pure noise — newsletter/marketing patterns.
EXCLUDE_SENDER = re.compile(
    r'(updates\.|noreply@.*calendar|@calendar\.|news@|newsletter@|@updates\.)',
    re.IGNORECASE,
)


def is_candidate(m):
    """Return True when message dict `m` looks like a supplier document.

    Excluded subjects and senders always lose; otherwise an attachment or a
    supplier-like subject keyword is enough.
    """
    subj = m.get("subject", "") or ""
    sender = m.get("fromAddress", "") or m.get("sender", "") or ""
    # Hard exclusions take precedence over inclusions
    if EXCLUDE_PATTERN.match(subj.strip()):
        return False
    if EXCLUDE_SENDER.search(sender):
        return False
    if str(m.get("hasAttachment", "")) == "1":
        return True
    return bool(CANDIDATE_PATTERN.search(subj))


def format_date(m):
    """Return the message date as YYYY-MM-DD (local time), or "-" if unusable.

    The timestamp fields are read as epoch milliseconds; a missing, zero,
    non-numeric or out-of-range value must not abort the whole listing.
    """
    raw = m.get("sentDateInGMT") or m.get("receivedTime") or 0
    try:
        ts = int(raw) // 1000
        return datetime.datetime.fromtimestamp(ts).strftime("%Y-%m-%d") if ts else "-"
    except (TypeError, ValueError, OverflowError, OSError):
        return "-"


rows = []
for f in sorted(glob.glob(os.path.join(msgs_dir, "*.json"))):
    try:
        # Explicit UTF-8: with the locale default, a non-UTF-8 locale would
        # fail to decode accented subjects and silently drop the folder.
        with open(f + ".path", encoding="utf-8") as fh:
            fpath = fh.read().strip()
        with open(f, encoding="utf-8") as fh:
            data = json.load(fh).get("data") or []
    except (OSError, ValueError, AttributeError) as exc:
        # Unreadable file, invalid JSON/encoding, or a non-object payload:
        # skip this folder but say so instead of swallowing every exception.
        print(f"email-list.sh: warning: skipping {os.path.basename(f)}: {exc}", file=sys.stderr)
        continue
    if not isinstance(data, list):
        continue
    for m in data:
        if not isinstance(m, dict):
            continue
        cand_hit = is_candidate(m)  # evaluated once, used for filter and marker
        if candidates_only and not cand_hit:
            continue
        dt = format_date(m)
        frm = (m.get("fromAddress") or m.get("sender") or "-").replace("&lt;","<").replace("&gt;",">").replace("<","").replace(">","")[:36]
        subj = (m.get("subject") or "-")[:55]
        has = "Y" if str(m.get("hasAttachment", "")) == "1" else " "
        cand = "*" if cand_hit else " "
        # str(... or "-") keeps the column formatting below from raising on a
        # null messageId.
        mid = str(m.get("messageId") or "-")
        rows.append((dt, fpath, cand, has, mid, frm, subj))

# Newest first; rows without a usable date ("-") sort last.
rows.sort(key=lambda r: r[0], reverse=True)
print(f"{'date':<10} {'cand':<4} {'att':<3} {'messageId':<22} {'folder':<22} {'from':<36} subject")
print("-" * 130)
for dt, fpath, cand, has, mid, frm, subj in rows:
    print(f"{dt:<10} [{cand}] [{has}] {mid:<22} {fpath[:22]:<22} {frm:<36} {subj}")
print("-" * 130)
print(f"# {len(rows)} message(s)" + (" (candidates only)" if candidates_only else ""))
PY
}
render_messages "${WORK}/msgs" "${CANDIDATES_ONLY}"