batch-convert
Compare original and translation side by side
🇺🇸
Original
English
🇨🇳
Translation
Chinese
Batch Convert Skill
批量转换Skill
Overview
概述
This skill enables batch conversion of documents between multiple formats using a unified pipeline. Convert hundreds of files at once with consistent settings, automatic format detection, and parallel processing for maximum efficiency.
该Skill支持通过统一流程批量转换多种格式的文档。可一次性转换数百个文件,具备一致的设置、自动格式检测和并行处理功能,以实现最高效率。
How to Use
使用方法
- Specify the source folder or files
- Choose target format(s)
- Optionally configure conversion options
- I'll process all files with progress tracking
Example prompts:
- "Convert all PDFs in this folder to Word documents"
- "Batch convert these markdown files to PDF and HTML"
- "Process all Office files and convert to Markdown"
- "Convert this folder of images to a single PDF"
- 指定源文件夹或文件
- 选择目标格式
- (可选)配置转换选项
- 我将处理所有文件并跟踪进度
示例提示:
- "将此文件夹中的所有PDF转换为Word文档"
- "批量将这些Markdown文件转换为PDF和HTML"
- "处理所有Office文件并转换为Markdown"
- "将此文件夹中的图片转换为单个PDF"
Domain Knowledge
领域知识
Supported Format Matrix
支持的格式矩阵
| From | To: DOCX | To: PDF | To: MD | To: HTML | To: PPTX |
|---|---|---|---|---|---|
| DOCX | - | ✅ | ✅ | ✅ | - |
| PDF | ✅ | - | ✅ | ✅ | - |
| MD | ✅ | ✅ | - | ✅ | ✅ |
| HTML | ✅ | ✅ | ✅ | - | - |
| XLSX | - | ✅ | ✅ | ✅ | - |
| PPTX | - | ✅ | ✅ | ✅ | - |
| 源格式 | 目标:DOCX | 目标:PDF | 目标:MD | 目标:HTML | 目标:PPTX |
|---|---|---|---|---|---|
| DOCX | - | ✅ | ✅ | ✅ | - |
| PDF | ✅ | - | ✅ | ✅ | - |
| MD | ✅ | ✅ | - | ✅ | ✅ |
| HTML | ✅ | ✅ | ✅ | - | - |
| XLSX | - | ✅ | ✅ | ✅ | - |
| PPTX | - | ✅ | ✅ | ✅ | - |
Core Pipeline
核心流程
python
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import subprocess
import os
class DocumentConverter:
    """Unified document conversion pipeline.

    Keeps a table that maps ``(source_format, target_format)`` pairs to the
    method performing that conversion, and offers single-file and parallel
    batch conversion on top of that table.
    """

    # (source, target) -> name of the method implementing the conversion.
    # Names are resolved in __init__, so a converter that is not implemented
    # on the class shows up as an unsupported conversion instead of making
    # construction fail with AttributeError.
    _CONVERTER_METHODS = {
        ('md', 'docx'): '_md_to_docx',
        ('md', 'pdf'): '_md_to_pdf',
        ('md', 'html'): '_md_to_html',
        ('md', 'pptx'): '_md_to_pptx',
        ('docx', 'pdf'): '_docx_to_pdf',
        ('docx', 'md'): '_docx_to_md',
        ('pdf', 'docx'): '_pdf_to_docx',
        ('pdf', 'md'): '_pdf_to_md',
        ('xlsx', 'pdf'): '_xlsx_to_pdf',
        ('xlsx', 'md'): '_xlsx_to_md',
        ('pptx', 'pdf'): '_pptx_to_pdf',
        ('pptx', 'md'): '_pptx_to_md',
        ('html', 'md'): '_html_to_md',
        ('html', 'pdf'): '_html_to_pdf',
    }

    # File extensions batch_convert() accepts as conversion inputs.
    _SUPPORTED_EXT = frozenset(
        {'.md', '.docx', '.pdf', '.xlsx', '.pptx', '.html'}
    )

    def __init__(self, max_workers=4):
        """Build the converter table.

        Args:
            max_workers: Thread count used by :meth:`batch_convert`.
        """
        self.max_workers = max_workers
        self.converters = {}
        for key, method_name in self._CONVERTER_METHODS.items():
            method = getattr(self, method_name, None)
            if callable(method):
                self.converters[key] = method

    def convert(self, input_path, output_format, output_dir=None):
        """Convert a single file to the target format.

        Args:
            input_path: File to convert; its suffix selects the source format.
            output_format: Target format such as ``"pdf"``; a leading dot
                and upper case are accepted.
            output_dir: Directory for the result (created when missing).
                When falsy the result is written next to the input file.

        Returns:
            Whatever the selected converter returns (the output path for
            the converters registered in this table).

        Raises:
            ValueError: If no converter is registered for the format pair.
        """
        input_path = Path(input_path)
        input_format = input_path.suffix[1:].lower()
        output_format = str(output_format).lstrip('.').lower()

        # Validate first so an unsupported request leaves no directories behind.
        converter_key = (input_format, output_format)
        if converter_key not in self.converters:
            raise ValueError(f"Conversion not supported: {input_format} -> {output_format}")

        if output_dir:
            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)
            output_path = target_dir / f"{input_path.stem}.{output_format}"
        else:
            output_path = input_path.with_suffix(f".{output_format}")

        return self.converters[converter_key](input_path, output_path)

    def batch_convert(self, input_dir, output_format, output_dir=None,
                      file_pattern="*", recursive=False):
        """Convert every matching file under *input_dir* in parallel.

        Args:
            input_dir: Directory searched for input files.
            output_format: Target format passed to :meth:`convert`.
            output_dir: Destination directory; defaults to
                ``<input_dir>/converted``. Created with parents if missing.
            file_pattern: Glob pattern applied inside *input_dir*.
            recursive: Search sub-directories too when true.

        Returns:
            One dict per input file, in completion order: ``file`` and
            ``status`` plus ``output`` on success or ``error`` on failure.
        """
        input_path = Path(input_dir)
        output_path = Path(output_dir) if output_dir else input_path / "converted"
        output_path.mkdir(parents=True, exist_ok=True)

        # When the output directory lives inside the input tree, a recursive
        # search would otherwise re-submit files written by an earlier run.
        in_root = input_path.resolve()
        out_root = output_path.resolve()
        outputs_inside_input = in_root in out_root.parents

        finder = input_path.rglob if recursive else input_path.glob
        files = []
        for candidate in finder(file_pattern):
            if not candidate.is_file():
                continue  # e.g. a directory whose name ends in ".md"
            if candidate.suffix.lower() not in self._SUPPORTED_EXT:
                continue
            if outputs_inside_input and out_root in candidate.resolve().parents:
                continue
            files.append(candidate)

        # NOTE(review): all results land flat in output_path, so inputs that
        # share a stem (a/x.md and b/x.md) still write to the same target.
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_file = {
                executor.submit(self.convert, f, output_format, output_path): f
                for f in files
            }
            for future in as_completed(future_to_file):
                file = future_to_file[future]
                try:
                    result = future.result()
                except Exception as e:
                    # A failing file is reported, not allowed to stop the batch.
                    results.append({'file': str(file), 'status': 'error', 'error': str(e)})
                else:
                    results.append({'file': str(file), 'status': 'success', 'output': str(result)})
        return results
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import subprocess
import os
class DocumentConverter:
    """Unified document conversion pipeline.

    Keeps a table that maps ``(source_format, target_format)`` pairs to the
    method performing that conversion, and offers single-file and parallel
    batch conversion on top of that table.
    """

    # (source, target) -> name of the method implementing the conversion.
    # Names are resolved in __init__, so a converter that is not implemented
    # on the class shows up as an unsupported conversion instead of making
    # construction fail with AttributeError.
    _CONVERTER_METHODS = {
        ('md', 'docx'): '_md_to_docx',
        ('md', 'pdf'): '_md_to_pdf',
        ('md', 'html'): '_md_to_html',
        ('md', 'pptx'): '_md_to_pptx',
        ('docx', 'pdf'): '_docx_to_pdf',
        ('docx', 'md'): '_docx_to_md',
        ('pdf', 'docx'): '_pdf_to_docx',
        ('pdf', 'md'): '_pdf_to_md',
        ('xlsx', 'pdf'): '_xlsx_to_pdf',
        ('xlsx', 'md'): '_xlsx_to_md',
        ('pptx', 'pdf'): '_pptx_to_pdf',
        ('pptx', 'md'): '_pptx_to_md',
        ('html', 'md'): '_html_to_md',
        ('html', 'pdf'): '_html_to_pdf',
    }

    # File extensions batch_convert() accepts as conversion inputs.
    _SUPPORTED_EXT = frozenset(
        {'.md', '.docx', '.pdf', '.xlsx', '.pptx', '.html'}
    )

    def __init__(self, max_workers=4):
        """Build the converter table.

        Args:
            max_workers: Thread count used by :meth:`batch_convert`.
        """
        self.max_workers = max_workers
        self.converters = {}
        for key, method_name in self._CONVERTER_METHODS.items():
            method = getattr(self, method_name, None)
            if callable(method):
                self.converters[key] = method

    def convert(self, input_path, output_format, output_dir=None):
        """Convert a single file to the target format.

        Args:
            input_path: File to convert; its suffix selects the source format.
            output_format: Target format such as ``"pdf"``; a leading dot
                and upper case are accepted.
            output_dir: Directory for the result (created when missing).
                When falsy the result is written next to the input file.

        Returns:
            Whatever the selected converter returns (the output path for
            the converters registered in this table).

        Raises:
            ValueError: If no converter is registered for the format pair.
        """
        input_path = Path(input_path)
        input_format = input_path.suffix[1:].lower()
        output_format = str(output_format).lstrip('.').lower()

        # Validate first so an unsupported request leaves no directories behind.
        converter_key = (input_format, output_format)
        if converter_key not in self.converters:
            raise ValueError(f"Conversion not supported: {input_format} -> {output_format}")

        if output_dir:
            target_dir = Path(output_dir)
            target_dir.mkdir(parents=True, exist_ok=True)
            output_path = target_dir / f"{input_path.stem}.{output_format}"
        else:
            output_path = input_path.with_suffix(f".{output_format}")

        return self.converters[converter_key](input_path, output_path)

    def batch_convert(self, input_dir, output_format, output_dir=None,
                      file_pattern="*", recursive=False):
        """Convert every matching file under *input_dir* in parallel.

        Args:
            input_dir: Directory searched for input files.
            output_format: Target format passed to :meth:`convert`.
            output_dir: Destination directory; defaults to
                ``<input_dir>/converted``. Created with parents if missing.
            file_pattern: Glob pattern applied inside *input_dir*.
            recursive: Search sub-directories too when true.

        Returns:
            One dict per input file, in completion order: ``file`` and
            ``status`` plus ``output`` on success or ``error`` on failure.
        """
        input_path = Path(input_dir)
        output_path = Path(output_dir) if output_dir else input_path / "converted"
        output_path.mkdir(parents=True, exist_ok=True)

        # When the output directory lives inside the input tree, a recursive
        # search would otherwise re-submit files written by an earlier run.
        in_root = input_path.resolve()
        out_root = output_path.resolve()
        outputs_inside_input = in_root in out_root.parents

        finder = input_path.rglob if recursive else input_path.glob
        files = []
        for candidate in finder(file_pattern):
            if not candidate.is_file():
                continue  # e.g. a directory whose name ends in ".md"
            if candidate.suffix.lower() not in self._SUPPORTED_EXT:
                continue
            if outputs_inside_input and out_root in candidate.resolve().parents:
                continue
            files.append(candidate)

        # NOTE(review): all results land flat in output_path, so inputs that
        # share a stem (a/x.md and b/x.md) still write to the same target.
        results = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            future_to_file = {
                executor.submit(self.convert, f, output_format, output_path): f
                for f in files
            }
            for future in as_completed(future_to_file):
                file = future_to_file[future]
                try:
                    result = future.result()
                except Exception as e:
                    # A failing file is reported, not allowed to stop the batch.
                    results.append({'file': str(file), 'status': 'error', 'error': str(e)})
                else:
                    results.append({'file': str(file), 'status': 'success', 'output': str(result)})
        return results

# Converter Implementations
转换器实现
python
python
# Markdown conversions (using Pandoc)
Markdown conversions (using Pandoc)
def _md_to_docx(self, input_path, output_path):
    """Run ``pandoc`` on a Markdown file to write a DOCX at *output_path*.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit status.
    """
    command = ['pandoc', str(input_path), '-o', str(output_path)]
    subprocess.run(command, check=True)
    return output_path


def _md_to_pdf(self, input_path, output_path):
    """Run ``pandoc`` on a Markdown file to write a PDF at *output_path*."""
    command = ['pandoc', str(input_path), '-o', str(output_path)]
    subprocess.run(command, check=True)
    return output_path


def _md_to_html(self, input_path, output_path):
    """Run ``pandoc`` with the ``-s`` flag to write HTML at *output_path*."""
    command = ['pandoc', str(input_path), '-s', '-o', str(output_path)]
    subprocess.run(command, check=True)
    return output_path


def _md_to_pptx(self, input_path, output_path):
    """Run the ``marp`` CLI on a Markdown file to write *output_path*."""
    command = ['marp', str(input_path), '-o', str(output_path)]
    subprocess.run(command, check=True)
    return output_path
def _md_to_docx(self, input_path, output_path):
    """Run ``pandoc`` on a Markdown file to write a DOCX at *output_path*.

    Raises ``subprocess.CalledProcessError`` on a non-zero exit status.
    """
    command = ['pandoc', str(input_path), '-o', str(output_path)]
    subprocess.run(command, check=True)
    return output_path


def _md_to_pdf(self, input_path, output_path):
    """Run ``pandoc`` on a Markdown file to write a PDF at *output_path*."""
    command = ['pandoc', str(input_path), '-o', str(output_path)]
    subprocess.run(command, check=True)
    return output_path


def _md_to_html(self, input_path, output_path):
    """Run ``pandoc`` with the ``-s`` flag to write HTML at *output_path*."""
    command = ['pandoc', str(input_path), '-s', '-o', str(output_path)]
    subprocess.run(command, check=True)
    return output_path


def _md_to_pptx(self, input_path, output_path):
    """Run the ``marp`` CLI on a Markdown file to write *output_path*."""
    command = ['marp', str(input_path), '-o', str(output_path)]
    subprocess.run(command, check=True)
    return output_path
Office to Markdown (using markitdown)
Office to Markdown (using markitdown)
def _markitdown_to_md(self, input_path, output_path):
    """Extract *input_path* with markitdown and save the text as Markdown.

    Shared by the DOCX, XLSX and PPTX converters, which differ only in the
    kind of file they are handed.

    Returns:
        *output_path*, after the Markdown text has been written to it.
    """
    # Imported at call time, as before, so the module loads without markitdown.
    from markitdown import MarkItDown

    result = MarkItDown().convert(str(input_path))
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot represent every character that appears in real documents.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(result.text_content)
    return output_path


def _docx_to_md(self, input_path, output_path):
    """Convert a DOCX file to Markdown; returns *output_path*."""
    return self._markitdown_to_md(input_path, output_path)


def _xlsx_to_md(self, input_path, output_path):
    """Convert an XLSX file to Markdown; returns *output_path*."""
    return self._markitdown_to_md(input_path, output_path)


def _pptx_to_md(self, input_path, output_path):
    """Convert a PPTX file to Markdown; returns *output_path*."""
    return self._markitdown_to_md(input_path, output_path)
def _markitdown_to_md(self, input_path, output_path):
    """Extract *input_path* with markitdown and save the text as Markdown.

    Shared by the DOCX, XLSX and PPTX converters, which differ only in the
    kind of file they are handed.

    Returns:
        *output_path*, after the Markdown text has been written to it.
    """
    # Imported at call time, as before, so the module loads without markitdown.
    from markitdown import MarkItDown

    result = MarkItDown().convert(str(input_path))
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot represent every character that appears in real documents.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(result.text_content)
    return output_path


def _docx_to_md(self, input_path, output_path):
    """Convert a DOCX file to Markdown; returns *output_path*."""
    return self._markitdown_to_md(input_path, output_path)


def _xlsx_to_md(self, input_path, output_path):
    """Convert an XLSX file to Markdown; returns *output_path*."""
    return self._markitdown_to_md(input_path, output_path)


def _pptx_to_md(self, input_path, output_path):
    """Convert a PPTX file to Markdown; returns *output_path*."""
    return self._markitdown_to_md(input_path, output_path)
PDF conversions
PDF conversions
def _pdf_to_docx(self, input_path, output_path):
    """Convert a PDF to DOCX with pdf2docx; returns *output_path*."""
    from pdf2docx import Converter

    cv = Converter(str(input_path))
    try:
        cv.convert(str(output_path))
    finally:
        # Close the converter even when the conversion raises.
        cv.close()
    return output_path


def _pdf_to_md(self, input_path, output_path):
    """Extract a PDF to Markdown text with markitdown; returns *output_path*."""
    from markitdown import MarkItDown

    result = MarkItDown().convert(str(input_path))
    # Explicit UTF-8 so extracted text is not limited by the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(result.text_content)
    return output_path
def _pdf_to_docx(self, input_path, output_path):
    """Convert a PDF to DOCX with pdf2docx; returns *output_path*."""
    from pdf2docx import Converter

    cv = Converter(str(input_path))
    try:
        cv.convert(str(output_path))
    finally:
        # Close the converter even when the conversion raises.
        cv.close()
    return output_path


def _pdf_to_md(self, input_path, output_path):
    """Extract a PDF to Markdown text with markitdown; returns *output_path*."""
    from markitdown import MarkItDown

    result = MarkItDown().convert(str(input_path))
    # Explicit UTF-8 so extracted text is not limited by the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(result.text_content)
    return output_path
Office to PDF (using LibreOffice)
Office to PDF (using LibreOffice)
def _soffice_to_pdf(self, input_path, output_path):
    """Convert an office document to PDF with headless LibreOffice.

    ``soffice`` is only told the output directory, not the output file, so
    the file it writes is looked up afterwards and moved to *output_path*
    when the two differ.

    Returns:
        *output_path*, which exists when this method returns.

    Raises:
        subprocess.CalledProcessError: If ``soffice`` exits non-zero.
        RuntimeError: If ``soffice`` succeeded but no PDF was produced.
    """
    from pathlib import Path

    source = Path(input_path)
    target = Path(output_path)
    subprocess.run([
        'soffice', '--headless', '--convert-to', 'pdf',
        '--outdir', str(target.parent), str(source)
    ], check=True)

    # presumably soffice names the result <outdir>/<input stem>.pdf — the
    # existence check below catches the case where that assumption is wrong.
    produced = target.parent / f"{source.stem}.pdf"
    if not produced.exists():
        raise RuntimeError(f"soffice did not produce {produced}")
    if produced != target:
        produced.replace(target)
    # NOTE(review): batch_convert runs converters from several threads;
    # confirm concurrent soffice instances sharing one user profile are safe.
    return output_path


def _docx_to_pdf(self, input_path, output_path):
    """Convert a DOCX file to PDF; returns *output_path*."""
    return self._soffice_to_pdf(input_path, output_path)


def _xlsx_to_pdf(self, input_path, output_path):
    """Convert an XLSX file to PDF; returns *output_path*."""
    return self._soffice_to_pdf(input_path, output_path)


def _pptx_to_pdf(self, input_path, output_path):
    """Convert a PPTX file to PDF; returns *output_path*."""
    return self._soffice_to_pdf(input_path, output_path)
def _soffice_to_pdf(self, input_path, output_path):
    """Convert an office document to PDF with headless LibreOffice.

    ``soffice`` is only told the output directory, not the output file, so
    the file it writes is looked up afterwards and moved to *output_path*
    when the two differ.

    Returns:
        *output_path*, which exists when this method returns.

    Raises:
        subprocess.CalledProcessError: If ``soffice`` exits non-zero.
        RuntimeError: If ``soffice`` succeeded but no PDF was produced.
    """
    from pathlib import Path

    source = Path(input_path)
    target = Path(output_path)
    subprocess.run([
        'soffice', '--headless', '--convert-to', 'pdf',
        '--outdir', str(target.parent), str(source)
    ], check=True)

    # presumably soffice names the result <outdir>/<input stem>.pdf — the
    # existence check below catches the case where that assumption is wrong.
    produced = target.parent / f"{source.stem}.pdf"
    if not produced.exists():
        raise RuntimeError(f"soffice did not produce {produced}")
    if produced != target:
        produced.replace(target)
    # NOTE(review): batch_convert runs converters from several threads;
    # confirm concurrent soffice instances sharing one user profile are safe.
    return output_path


def _docx_to_pdf(self, input_path, output_path):
    """Convert a DOCX file to PDF; returns *output_path*."""
    return self._soffice_to_pdf(input_path, output_path)


def _xlsx_to_pdf(self, input_path, output_path):
    """Convert an XLSX file to PDF; returns *output_path*."""
    return self._soffice_to_pdf(input_path, output_path)


def _pptx_to_pdf(self, input_path, output_path):
    """Convert a PPTX file to PDF; returns *output_path*."""
    return self._soffice_to_pdf(input_path, output_path)
Progress Tracking
进度跟踪
python
from tqdm import tqdm
def batch_convert_with_progress(converter, input_dir, output_format, output_dir=None):
    """Convert the files directly inside *input_dir*, showing a progress bar.

    Args:
        converter: Object with a ``convert(path, output_format, output_dir)``
            method.
        input_dir: Directory whose immediate entries are converted.
        output_format: Target format handed to ``converter.convert``.
        output_dir: Destination directory handed to ``converter.convert``.

    Returns:
        One dict per file: ``file`` and ``status`` plus ``output`` on
        success or ``error`` on failure.
    """
    # Sub-directories are not documents; sorting keeps the order repeatable.
    files = sorted(p for p in Path(input_dir).glob('*') if p.is_file())

    results = []
    for file in tqdm(files, desc=f"Converting to {output_format}"):
        try:
            output = converter.convert(file, output_format, output_dir)
        except Exception as e:
            # Record the failure and keep going with the remaining files.
            results.append({'file': str(file), 'status': 'error', 'error': str(e)})
        else:
            results.append({'file': str(file), 'status': 'success', 'output': str(output)})
    return results
from tqdm import tqdm
def batch_convert_with_progress(converter, input_dir, output_format, output_dir=None):
    """Convert the files directly inside *input_dir*, showing a progress bar.

    Args:
        converter: Object with a ``convert(path, output_format, output_dir)``
            method.
        input_dir: Directory whose immediate entries are converted.
        output_format: Target format handed to ``converter.convert``.
        output_dir: Destination directory handed to ``converter.convert``.

    Returns:
        One dict per file: ``file`` and ``status`` plus ``output`` on
        success or ``error`` on failure.
    """
    # Sub-directories are not documents; sorting keeps the order repeatable.
    files = sorted(p for p in Path(input_dir).glob('*') if p.is_file())

    results = []
    for file in tqdm(files, desc=f"Converting to {output_format}"):
        try:
            output = converter.convert(file, output_format, output_dir)
        except Exception as e:
            # Record the failure and keep going with the remaining files.
            results.append({'file': str(file), 'status': 'error', 'error': str(e)})
        else:
            results.append({'file': str(file), 'status': 'success', 'output': str(output)})
    return results

# Best Practices
最佳实践
- Test Sample First: Convert a few files before batch processing
- Check Disk Space: Ensure sufficient space for output
- Use Parallel Processing: Speed up with multiple workers
- Handle Errors Gracefully: Log failures, continue processing
- Verify Output: Spot-check converted files
- 先测试样本:批量处理前先转换几个文件测试
- 检查磁盘空间:确保有足够的空间存储输出文件
- 使用并行处理:通过多线程提升转换速度
- 优雅处理错误:记录失败情况,继续处理其他文件
- 验证输出结果:抽查转换后的文件
Common Patterns
常见模式
Format Detection Pipeline
格式检测流程
python
def detect_and_convert(file_path, target_format):
    """Detect the source format of *file_path* and convert it.

    The format is taken from the MIME type that ``mimetypes`` guesses from
    the file name, falling back to the lower-cased suffix when the type is
    unknown. The detected format, not the raw suffix, selects the converter.

    Args:
        file_path: File to convert.
        target_format: Target format key, e.g. ``"pdf"``.

    Returns:
        The selected converter's return value; the result is written next
        to the input file with the target suffix.

    Raises:
        ValueError: If no converter exists for the detected format pair.
    """
    import mimetypes

    format_map = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'text/markdown': 'md',
        'text/html': 'html',
    }

    path = Path(file_path)
    # guess_type() inspects the name only; file contents are never read.
    mime_type, _ = mimetypes.guess_type(str(path))
    source_format = format_map.get(mime_type, path.suffix[1:].lower())

    converter = DocumentConverter()
    key = (source_format, target_format)
    if key not in converter.converters:
        raise ValueError(f"Conversion not supported: {source_format} -> {target_format}")
    return converter.converters[key](path, path.with_suffix(f".{target_format}"))
def detect_and_convert(file_path, target_format):
    """Detect the source format of *file_path* and convert it.

    The format is taken from the MIME type that ``mimetypes`` guesses from
    the file name, falling back to the lower-cased suffix when the type is
    unknown. The detected format, not the raw suffix, selects the converter.

    Args:
        file_path: File to convert.
        target_format: Target format key, e.g. ``"pdf"``.

    Returns:
        The selected converter's return value; the result is written next
        to the input file with the target suffix.

    Raises:
        ValueError: If no converter exists for the detected format pair.
    """
    import mimetypes

    format_map = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
        'text/markdown': 'md',
        'text/html': 'html',
    }

    path = Path(file_path)
    # guess_type() inspects the name only; file contents are never read.
    mime_type, _ = mimetypes.guess_type(str(path))
    source_format = format_map.get(mime_type, path.suffix[1:].lower())

    converter = DocumentConverter()
    key = (source_format, target_format)
    if key not in converter.converters:
        raise ValueError(f"Conversion not supported: {source_format} -> {target_format}")
    return converter.converters[key](path, path.with_suffix(f".{target_format}"))

# Multi-Format Output
多格式输出
python
def convert_to_multiple_formats(input_file, output_formats, output_dir):
    """Convert one file to several formats.

    Args:
        input_file: File to convert.
        output_formats: Iterable of target format keys.
        output_dir: Destination directory; created (with parents) when it
            does not exist yet.

    Returns:
        Dict keyed by format: ``{'status': 'success', 'path': ...}`` or
        ``{'status': 'error', 'error': ...}``.
    """
    converter = DocumentConverter()
    if output_dir:
        # Create the destination up front so conversions can write into it.
        Path(output_dir).mkdir(parents=True, exist_ok=True)

    results = {}
    for fmt in output_formats:
        try:
            output = converter.convert(input_file, fmt, output_dir)
        except Exception as e:
            # One failing format must not prevent the others.
            results[fmt] = {'status': 'error', 'error': str(e)}
        else:
            results[fmt] = {'status': 'success', 'path': str(output)}
    return results
def convert_to_multiple_formats(input_file, output_formats, output_dir):
    """Convert one file to several formats.

    Args:
        input_file: File to convert.
        output_formats: Iterable of target format keys.
        output_dir: Destination directory; created (with parents) when it
            does not exist yet.

    Returns:
        Dict keyed by format: ``{'status': 'success', 'path': ...}`` or
        ``{'status': 'error', 'error': ...}``.
    """
    converter = DocumentConverter()
    if output_dir:
        # Create the destination up front so conversions can write into it.
        Path(output_dir).mkdir(parents=True, exist_ok=True)

    results = {}
    for fmt in output_formats:
        try:
            output = converter.convert(input_file, fmt, output_dir)
        except Exception as e:
            # One failing format must not prevent the others.
            results[fmt] = {'status': 'error', 'error': str(e)}
        else:
            results[fmt] = {'status': 'success', 'path': str(output)}
    return results

# Convert README to multiple formats
Convert README to multiple formats
# Export README.md as DOCX, PDF and HTML into ./exports.
results = convert_to_multiple_formats('README.md', ['docx', 'pdf', 'html'], './exports')
# Export README.md as DOCX, PDF and HTML into ./exports.
results = convert_to_multiple_formats('README.md', ['docx', 'pdf', 'html'], './exports')
Examples
示例
Example 1: Documentation Export
示例1:文档导出
python
from pathlib import Path
import json
def export_documentation(docs_dir, export_dir):
    """Export every Markdown file under *docs_dir* to PDF, DOCX and HTML.

    Results are written to ``<export_dir>/<format>/<relative folder>/`` so
    that files sharing a name in different folders (``README.md``,
    ``index.md``) do not overwrite one another. A JSON report of all
    outcomes is saved as ``<export_dir>/export_report.json``.

    Args:
        docs_dir: Root directory searched recursively for ``*.md`` files.
        export_dir: Root directory for the exported files and the report.

    Returns:
        Dict mapping each source path to a ``{format: outcome}`` dict where
        the outcome is ``'success'`` or ``'error: <message>'``.
    """
    converter = DocumentConverter(max_workers=8)
    docs_path = Path(docs_dir)
    export_path = Path(export_dir)
    formats = ('pdf', 'docx', 'html')

    # Create format directories
    for fmt in formats:
        (export_path / fmt).mkdir(parents=True, exist_ok=True)

    all_results = {}
    for md_file in sorted(docs_path.rglob('*.md')):
        relative_folder = md_file.parent.relative_to(docs_path)
        file_results = {}
        for fmt in formats:
            output_dir = export_path / fmt / relative_folder
            try:
                output_dir.mkdir(parents=True, exist_ok=True)
                converter.convert(md_file, fmt, output_dir)
            except Exception as e:
                file_results[fmt] = f'error: {e}'
            else:
                file_results[fmt] = 'success'
        all_results[str(md_file)] = file_results
        print(f"Processed: {md_file.name}")

    # Save report
    with open(export_path / 'export_report.json', 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2)
    return all_results


results = export_documentation('./docs', './exports')
from pathlib import Path
import json
def export_documentation(docs_dir, export_dir):
    """Export every Markdown file under *docs_dir* to PDF, DOCX and HTML.

    Results are written to ``<export_dir>/<format>/<relative folder>/`` so
    that files sharing a name in different folders (``README.md``,
    ``index.md``) do not overwrite one another. A JSON report of all
    outcomes is saved as ``<export_dir>/export_report.json``.

    Args:
        docs_dir: Root directory searched recursively for ``*.md`` files.
        export_dir: Root directory for the exported files and the report.

    Returns:
        Dict mapping each source path to a ``{format: outcome}`` dict where
        the outcome is ``'success'`` or ``'error: <message>'``.
    """
    converter = DocumentConverter(max_workers=8)
    docs_path = Path(docs_dir)
    export_path = Path(export_dir)
    formats = ('pdf', 'docx', 'html')

    # Create format directories
    for fmt in formats:
        (export_path / fmt).mkdir(parents=True, exist_ok=True)

    all_results = {}
    for md_file in sorted(docs_path.rglob('*.md')):
        relative_folder = md_file.parent.relative_to(docs_path)
        file_results = {}
        for fmt in formats:
            output_dir = export_path / fmt / relative_folder
            try:
                output_dir.mkdir(parents=True, exist_ok=True)
                converter.convert(md_file, fmt, output_dir)
            except Exception as e:
                file_results[fmt] = f'error: {e}'
            else:
                file_results[fmt] = 'success'
        all_results[str(md_file)] = file_results
        print(f"Processed: {md_file.name}")

    # Save report
    with open(export_path / 'export_report.json', 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2)
    return all_results


results = export_documentation('./docs', './exports')

# Example 2: Legacy Document Migration
示例2:旧版文档迁移
python
def migrate_legacy_docs(source_dir, target_dir):
    """Migrate legacy office documents to their modern formats.

    Runs headless LibreOffice once per matching file directly inside
    *source_dir* and writes the results to *target_dir*, which is created
    (with parents) when missing.

    Args:
        source_dir: Directory holding ``.doc``, ``.xls``, ``.ppt`` and
            ``.rtf`` files.
        target_dir: Directory receiving the converted files.

    Returns:
        ``{'migrated': <count>, 'errors': [{'file': ..., 'error': ...}]}``.
    """
    # Migration rules: (glob pattern, LibreOffice target format)
    migrations = [
        ('*.doc', 'docx'),  # Old Word to new
        ('*.xls', 'xlsx'),  # Old Excel to new
        ('*.ppt', 'pptx'),  # Old PowerPoint to new
        ('*.rtf', 'docx'),  # RTF to Word
    ]
    source_path = Path(source_dir)
    target_path = Path(target_dir)
    target_path.mkdir(parents=True, exist_ok=True)

    total_migrated = 0
    errors = []
    for pattern, target_format in migrations:
        for file in source_path.glob(pattern):
            try:
                # Use LibreOffice for legacy formats
                subprocess.run([
                    'soffice', '--headless',
                    '--convert-to', target_format,
                    '--outdir', str(target_path),
                    str(file)
                ], check=True)
            except Exception as e:
                # Keep migrating the remaining files; report this one.
                errors.append({'file': str(file), 'error': str(e)})
            else:
                total_migrated += 1
                print(f"Migrated: {file.name}")

    print(f"\nMigration complete: {total_migrated} files")
    print(f"Errors: {len(errors)}")
    return {'migrated': total_migrated, 'errors': errors}
def migrate_legacy_docs(source_dir, target_dir):
    """Migrate legacy office documents to their modern formats.

    Runs headless LibreOffice once per matching file directly inside
    *source_dir* and writes the results to *target_dir*, which is created
    (with parents) when missing.

    Args:
        source_dir: Directory holding ``.doc``, ``.xls``, ``.ppt`` and
            ``.rtf`` files.
        target_dir: Directory receiving the converted files.

    Returns:
        ``{'migrated': <count>, 'errors': [{'file': ..., 'error': ...}]}``.
    """
    # Migration rules: (glob pattern, LibreOffice target format)
    migrations = [
        ('*.doc', 'docx'),  # Old Word to new
        ('*.xls', 'xlsx'),  # Old Excel to new
        ('*.ppt', 'pptx'),  # Old PowerPoint to new
        ('*.rtf', 'docx'),  # RTF to Word
    ]
    source_path = Path(source_dir)
    target_path = Path(target_dir)
    target_path.mkdir(parents=True, exist_ok=True)

    total_migrated = 0
    errors = []
    for pattern, target_format in migrations:
        for file in source_path.glob(pattern):
            try:
                # Use LibreOffice for legacy formats
                subprocess.run([
                    'soffice', '--headless',
                    '--convert-to', target_format,
                    '--outdir', str(target_path),
                    str(file)
                ], check=True)
            except Exception as e:
                # Keep migrating the remaining files; report this one.
                errors.append({'file': str(file), 'error': str(e)})
            else:
                total_migrated += 1
                print(f"Migrated: {file.name}")

    print(f"\nMigration complete: {total_migrated} files")
    print(f"Errors: {len(errors)}")
    return {'migrated': total_migrated, 'errors': errors}

# Example 3: Report Generation Pipeline
示例3:报告生成流程
python
def generate_reports_pipeline(data_files, template_dir, output_dir):
    """Generate a Markdown report per data file and convert it to PDF and DOCX.

    Args:
        data_files: Iterable of data file paths. Only ``.xlsx`` files
            contribute content; other files get the report header only.
        template_dir: Accepted for interface compatibility; not read here.
        output_dir: Directory for the Markdown, PDF and DOCX files; created
            (with parents) when missing.

    Returns:
        List of ``{'source', 'output', 'format'}`` dicts, one per
        successful conversion. Failed conversions are printed and skipped.
    """
    from datetime import datetime

    converter = DocumentConverter()
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Capture the time once so file names and report text agree.
    now = datetime.now()
    timestamp = now.strftime('%Y%m%d_%H%M%S')

    reports = []
    for data_file in data_files:
        # Load data
        data_path = Path(data_file)

        # Generate markdown report
        md_content = f"""---
title: Report - {data_path.stem}
date: {now.strftime('%Y-%m-%d')}
---

# {data_path.stem} Report

Generated: {now.strftime('%Y-%m-%d %H:%M:%S')}

## Data Summary

"""
        # Add data content (simplified)
        if data_path.suffix.lower() == '.xlsx':
            from markitdown import MarkItDown
            md = MarkItDown()
            result = md.convert(str(data_path))
            md_content += result.text_content

        # Save markdown
        md_file = output_path / f"{data_path.stem}_{timestamp}.md"
        with open(md_file, 'w', encoding='utf-8') as f:
            f.write(md_content)

        # Convert to PDF and DOCX
        for fmt in ['pdf', 'docx']:
            try:
                output = converter.convert(md_file, fmt, output_path)
            except Exception as e:
                print(f"Error converting {data_file} to {fmt}: {e}")
            else:
                reports.append({'source': str(data_file), 'output': str(output), 'format': fmt})
    return reports

# Limitations
局限性
- Some format combinations not supported
- Complex formatting may be lost in conversion
- Large files may require more time
- Some conversions need external tools (LibreOffice, Pandoc)
- Quality varies by source document complexity
- 部分格式组合不支持
- 复杂格式在转换过程中可能丢失
- 大型文件转换可能需要更多时间
- 部分转换需要依赖外部工具(LibreOffice、Pandoc)
- 转换质量取决于源文档的复杂程度
Installation
安装
bash
bash
# Core dependencies
核心依赖
pip install pdf2docx markitdown python-docx openpyxl
pip install pdf2docx markitdown python-docx openpyxl
Pandoc (for MD conversions)
Pandoc(用于MD格式转换)
brew install pandoc # macOS
apt install pandoc # Ubuntu
brew install pandoc # macOS
apt install pandoc # Ubuntu
Marp (for PPTX)
Marp(用于PPTX格式转换)
npm install -g @marp-team/marp-cli
npm install -g @marp-team/marp-cli
LibreOffice (for Office formats)
LibreOffice(用于Office格式转换)
brew install libreoffice # macOS
apt install libreoffice # Ubuntu
brew install libreoffice # macOS
apt install libreoffice # Ubuntu