Loading...
Loading...
Convert documents (PDF, Word, Excel, PowerPoint, images, HTML) to Markdown using microsoft/markitdown. Use for document analysis, content extraction, preprocessing for LLMs, or batch document conversion. Supports images with OCR/LLM descriptions, audio transcription, and ZIP archives.
npx skill4agent add rysweet/amplihack markitdown

from markitdown import MarkItDown
md = MarkItDown()
result = md.convert("document.pdf")
print(result.text_content)

# Convert single file
markitdown document.pdf > output.md
markitdown document.pdf -o output.md
# Pipe input
cat document.pdf | markitdown

export OPENAI_API_KEY="sk-..."

# Basic PDF conversion
md = MarkItDown()
result = md.convert("report.pdf")
# With Azure Document Intelligence (better quality)
md = MarkItDown(docintel_endpoint="<your-endpoint>")
result = md.convert("report.pdf")

# Word documents - preserves structure
result = md.convert("document.docx")
# Excel - converts tables to markdown tables
result = md.convert("spreadsheet.xlsx")
# PowerPoint - extracts slide content
result = md.convert("presentation.pptx")

# ✅ SECURE: Using environment variables for API keys
import os
from openai import OpenAI
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not set")
client = OpenAI(api_key=api_key)
md = MarkItDown(llm_client=client, llm_model="gpt-4o")
result = md.convert("diagram.jpg")  # Gets AI-generated description

from pathlib import Path
md = MarkItDown()
documents = Path(".").glob("*.pdf")
for doc in documents:
    result = md.convert(str(doc))
    output_path = doc.with_suffix(".md")
    output_path.write_text(result.text_content)

# Full installation (all features)
pip install 'markitdown[all]'
# Selective features
pip install 'markitdown[pdf, docx, pptx]'

| File Type | Use Case | Command |
|---|---|---|
| PDF | Reports, papers | |
| Word | Documents | |
| Excel | Data tables | |
| PowerPoint | Presentations | |
| Images | Diagrams with OCR | |
| HTML | Web pages | |
| ZIP | Archives | |
# ❌ NEVER DO THIS
md = MarkItDown(llm_client=OpenAI(api_key="sk-hardcoded-key"))
# ✅ ALWAYS DO THIS
api_key = os.getenv("OPENAI_API_KEY")
md = MarkItDown(llm_client=OpenAI(api_key=api_key))

# ❌ Vulnerable to path traversal
user_input = "../../../etc/passwd"
md.convert(user_input)
# ✅ Validate and sanitize
from pathlib import Path
safe_path = Path(user_input).resolve()
if not safe_path.is_relative_to(allowed_dir):
    raise ValueError("Invalid path")
md.convert(str(safe_path))

# ❌ Can cause DoS
md.convert("huge_file.pdf") # No size check
# ✅ Check size first
max_size = 50 * 1024 * 1024 # 50MB
if Path("file.pdf").stat().st_size > max_size:
    raise ValueError("File too large")

pip install 'markitdown[all]'