Loading...
Loading...
Guidance for processing financial documents (invoices, receipts, statements) with OCR and text extraction. This skill should be used when tasks involve extracting data from financial PDFs or images, generating summaries (CSV/JSON), or moving/organizing processed documents. Emphasizes data safety practices to prevent catastrophic data loss.
npx skill4agent add letta-ai/skills financial-document-processor
# CORRECT: Copy first, verify, then clean up
cp -r /source/documents/ /backup/documents/
# ... process files ...
# ... verify outputs match expectations ...
# Only after verification: rm -r /backup/documents/
# WRONG: Delete before moving (data loss risk)
rm -f /source/*.pdf && mv /source/* /dest/ # Files deleted before move!
which tesseract
which pdftotext
python3 -c "import pypdf"
# Create timestamped backup directory
BACKUP_DIR="/tmp/backup_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$BACKUP_DIR"
cp -r /path/to/source/documents/* "$BACKUP_DIR/"
echo "Backup created at: $BACKUP_DIR"
# Test extraction on a single file first
# Run the extractor on one document before processing a whole batch.
# NOTE(review): `extract_document` is not defined in this snippet; it is
# presumably the project's own extraction routine — confirm before use.
sample_file = "/path/to/sample_invoice.pdf"
extracted_data = extract_document(sample_file)
# Print the raw result so it can be compared against the document by eye.
print(f"Extracted: {extracted_data}")
# Manually verify these values match the document

import re


def parse_amount(text):
    """Parse a monetary amount written in US or European notation.

    Currency symbols ($, €, £) and all whitespace are stripped first. The
    remaining text is then interpreted as:

    * European notation -- comma as decimal separator with exactly two
      decimals, optional dot-separated thousands groups, optional sign
      (``1.234,56``, ``1234,56``, ``-12,50``).
    * US notation -- commas are thousands separators and are dropped
      (``1,234.56``, ``1,234``).

    Args:
        text: The raw amount string, or None.

    Returns:
        The amount as a float, or None when ``text`` is None, empty, or
        contains only currency symbols / whitespace.

    Raises:
        ValueError: If the cleaned text is not a number (e.g. ``"N/A"``).
            Deliberately not swallowed so the caller can record the failure.
    """
    if text is None:
        return None

    # Remove currency symbols and whitespace
    cleaned = re.sub(r'[$€£\s]', '', text)
    if not cleaned:
        return None

    # Detect European format (comma as decimal separator). The integer part
    # is either dot-grouped thousands or a plain digit run, so "1234,56" and
    # signed values are recognised instead of falling through to the US
    # branch, where dropping the comma would inflate the value 100-fold.
    european = r'[-+]?(?:\d{1,3}(?:\.\d{3})+|\d+),\d{2}'
    if re.fullmatch(european, cleaned):
        cleaned = cleaned.replace('.', '').replace(',', '.')
    # US format (comma as thousands separator)
    elif ',' in cleaned:
        cleaned = cleaned.replace(',', '')

    return float(cleaned)


results = []
# Failures are collected as (path, message) pairs so that one bad document
# does not abort the rest of the batch.
errors = []

for doc_path in document_paths:
    try:
        # Keep the try body to the one call that is expected to fail.
        data = extract_document(doc_path)
    except Exception as e:
        # Deliberately broad: this is the batch boundary, and every failure
        # is recorded and reported below rather than silently dropped.
        errors.append((doc_path, str(e)))
        print(f"✗ Failed: {doc_path} - {e}")
    else:
        results.append(data)
        print(f"✓ Processed: {doc_path}")

# Summarise all failures once the whole batch has been attempted.
if errors:
    print(f"\nWarning: {len(errors)} documents failed to process")
    for path, error in errors:
        print(f" - {path}: {error}")

# Verification checklist
import random

# Every input document must have produced exactly one record. An explicit
# raise (instead of `assert`) keeps this safety gate active under
# `python -O`, which strips assert statements.
if len(results) != len(document_paths):
    raise AssertionError("Record count mismatch")

# Spot-check a few values
for sample in random.sample(results, min(3, len(results))):
    print(f"Please verify: {sample['filename']} -> Total: {sample['total']}")

# Move files to destination (not delete!)
for file in /source/documents/*.pdf; do
mv "$file" /processed/
done
# Only remove backup after confirming processed files exist
ls /processed/*.pdf && rm -rf "$BACKUP_DIR"rmrm -f *.pdf && mv *.pdf /dest/def is_text_based_pdf(pdf_path):
"""Check if PDF contains extractable text."""
from pypdf import PdfReader
reader = PdfReader(pdf_path)
for page in reader.pages:
if page.extract_text().strip():
return True
return Falsecp -r "$BACKUP_DIR"/* /source/| File Type | Primary Tool | Fallback |
|---|---|---|
| Text-based PDF | pypdf, pdftotext | - |
| Scanned PDF | tesseract (after pdf2image) | pypdf |
| JPG/PNG images | tesseract | - |
| Mixed PDF (text + scans) | pypdf first, tesseract for image pages | - |
# System packages
apt-get install tesseract-ocr poppler-utils
# Python packages
pip install pypdf pytesseract pdf2image pillow