pdf-to-docx
Compare original and translation side by side
🇺🇸
Original
English
🇨🇳
Translation
Chinese
PDF to Word Skill
PDF转Word技能
Overview
概述
This skill enables conversion from PDF to editable Word documents using pdf2docx - a Python library that preserves layout, tables, images, and text formatting. Unlike OCR-based solutions, pdf2docx extracts native PDF content for accurate conversion.
本技能可借助pdf2docx将PDF转换为可编辑的Word文档——pdf2docx是一款Python库,能够保留布局、表格、图片和文本格式。与基于OCR的解决方案不同,pdf2docx提取PDF原生内容以实现精准转换。
How to Use
使用方法
- Provide the PDF file you want to convert
- Optionally specify pages or conversion options
- I'll convert it to an editable Word document
Example prompts:
- "Convert this PDF report to an editable Word document"
- "Turn pages 1-5 of this PDF into Word format"
- "Extract this scanned document as editable text"
- "Convert this PDF contract to Word for editing"
- 提供你想要转换的PDF文件
- 可选:指定页码或转换选项
- 我会将其转换为可编辑的Word文档
示例提示:
- "将这份PDF报告转换为可编辑的Word文档"
- "把这份PDF的第1-5页转换为Word格式"
- "将这份扫描件提取为可编辑文本"
- "将这份PDF合同转换为Word以便编辑"
Domain Knowledge
领域知识
pdf2docx Fundamentals
pdf2docx基础
python
from pdf2docx import Converter

# Basic conversion: open the PDF, write the Word file, release the PDF.
cv = Converter('input.pdf')
cv.convert('output.docx')
cv.close()

# Or using context manager
# NOTE(review): this form relies on Converter implementing the
# context-manager protocol; confirm for the installed pdf2docx version.
with Converter('input.pdf') as cv:
    cv.convert('output.docx')
Conversion Options
转换选项
python
from pdf2docx import Converter

cv = Converter('input.pdf')

# The calls below are alternatives: each one targets the same 'output.docx',
# so keep only the variant you need.

# Full document
cv.convert('output.docx')

# Specific pages (0-indexed)
cv.convert('output.docx', start=0, end=5)

# Single page
cv.convert('output.docx', pages=[0])

# Multiple specific pages
cv.convert('output.docx', pages=[0, 2, 4])

cv.close()
Advanced Options
高级选项
python
from pdf2docx import Converter
# Open the PDF, then call convert() with every tuning option spelled out.
# NOTE(review): option names and values are reproduced as given in this
# document; confirm them against the installed pdf2docx release.
cv = Converter('input.pdf')
cv.convert(
'output.docx',
start=0, # Start page (0-indexed)
end=None, # End page (None = last page)
pages=None, # Specific pages list
password=None, # PDF password if encrypted; NOTE(review): verify convert() accepts this keyword
min_section_height=20.0, # Minimum height for section
connected_border_tolerance=0.5, # Border detection tolerance
line_overlap_threshold=0.9, # Line merging threshold
line_break_width_ratio=0.5, # Line break detection
line_break_free_space_ratio=0.1,
line_separate_threshold=5, # Vertical line separation
new_paragraph_free_space_ratio=0.85,
float_image_ignorable_gap=5,
page_margin_factor_top=0.5,
page_margin_factor_bottom=0.5,
)
cv.close()
python
from pdf2docx import Converter
# Open the PDF, then call convert() with every tuning option spelled out.
# NOTE(review): option names and values are reproduced as given in this
# document; confirm them against the installed pdf2docx release.
cv = Converter('input.pdf')
cv.convert(
'output.docx',
start=0, # Start page (0-indexed)
end=None, # End page (None = last page)
pages=None, # Specific pages list
password=None, # PDF password if encrypted; NOTE(review): verify convert() accepts this keyword
min_section_height=20.0, # Minimum height for section
connected_border_tolerance=0.5, # Border detection tolerance
line_overlap_threshold=0.9, # Line merging threshold
line_break_width_ratio=0.5, # Line break detection
line_break_free_space_ratio=0.1,
line_separate_threshold=5, # Vertical line separation
new_paragraph_free_space_ratio=0.85,
float_image_ignorable_gap=5,
page_margin_factor_top=0.5,
page_margin_factor_bottom=0.5,
)
cv.close()
Handling Different PDF Types
处理不同类型的PDF
Native PDFs (Text-based)
原生PDF(基于文本)
python
python
# Works best with native PDFs
cv = Converter('native_pdf.pdf')
cv.convert('output.docx')
cv.close()
Scanned PDFs (Image-based)
扫描版PDF(基于图像)
python
python
# For scanned PDFs, use OCR first
# pdf2docx works best with native text PDFs
# Consider using pytesseract or PaddleOCR first
import pytesseract
from pdf2image import convert_from_path

# Convert PDF pages to images
images = convert_from_path('scanned.pdf')

# OCR each page; join() builds the text once instead of growing a string
# page by page.
text = ''.join(pytesseract.image_to_string(img) for img in images)

# Then create Word document from text
Python Integration
Python集成
python
from pdf2docx import Converter
import os
def pdf_to_word(pdf_path, output_path=None, pages=None):
    """Convert a PDF file to a Word document.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path. When ``None`` it is derived
            from ``pdf_path`` by swapping the file extension for ``.docx``.
        pages: Optional list of page indexes forwarded to ``convert``; a
            falsy value (``None`` or ``[]``) converts the whole document.

    Returns:
        The path of the Word document that was written.
    """
    if output_path is None:
        # splitext only touches the final extension, so directory names
        # containing ".pdf", upper-case ".PDF" and extension-less inputs can
        # never produce an output path equal to the input path.
        output_path = os.path.splitext(pdf_path)[0] + '.docx'

    cv = Converter(pdf_path)
    try:
        if pages:
            cv.convert(output_path, pages=pages)
        else:
            cv.convert(output_path)
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()
    return output_path
from pdf2docx import Converter
import os
def pdf_to_word(pdf_path, output_path=None, pages=None):
    """Convert a PDF file to a Word document.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Destination ``.docx`` path. When ``None`` it is derived
            from ``pdf_path`` by swapping the file extension for ``.docx``.
        pages: Optional list of page indexes forwarded to ``convert``; a
            falsy value (``None`` or ``[]``) converts the whole document.

    Returns:
        The path of the Word document that was written.
    """
    if output_path is None:
        # splitext only touches the final extension, so directory names
        # containing ".pdf", upper-case ".PDF" and extension-less inputs can
        # never produce an output path equal to the input path.
        output_path = os.path.splitext(pdf_path)[0] + '.docx'

    cv = Converter(pdf_path)
    try:
        if pages:
            cv.convert(output_path, pages=pages)
        else:
            cv.convert(output_path)
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()
    return output_path
# Usage
result = pdf_to_word('document.pdf')
print(f"Created: {result}")
Batch Conversion
批量转换
python
from pdf2docx import Converter
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
def convert_single(pdf_path, output_dir):
    """Convert one PDF to a Word file inside ``output_dir``.

    Args:
        pdf_path: ``pathlib.Path`` of the source PDF.
        output_dir: ``pathlib.Path`` of the directory receiving the ``.docx``
            file, which keeps the PDF's base name.

    Returns:
        ``"Success: <name>"`` or ``"Error: <name> - <reason>"``. Failures are
        reported instead of raised so one bad file does not abort a batch.
    """
    output_path = output_dir / pdf_path.with_suffix('.docx').name
    try:
        cv = Converter(str(pdf_path))
        try:
            cv.convert(str(output_path))
        finally:
            # Close the converter even when convert() fails, so the PDF is
            # not left open for the rest of the batch.
            cv.close()
    except Exception as e:
        return f"Error: {pdf_path.name} - {e}"
    return f"Success: {pdf_path.name}"
def batch_convert(input_dir, output_dir, max_workers=4):
    """Convert every ``*.pdf`` directly inside ``input_dir`` to Word.

    Prints one status line per file, in sorted file-name order.

    Args:
        input_dir: Directory searched (non-recursively) for PDF files.
        output_dir: Directory for the ``.docx`` files; created when missing,
            including missing parent directories.
        max_workers: Size of the thread pool running the conversions.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True allows a nested output directory that does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)

    # glob() yields files in arbitrary order; sort for reproducible output.
    pdf_files = sorted(input_path.glob('*.pdf'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(convert_single, pdf, output_path)
            for pdf in pdf_files
        ]
        # Results are printed in submission order, not completion order.
        for future in futures:
            print(future.result())
batch_convert('./pdfs', './word_docs')
python
from pdf2docx import Converter
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
def convert_single(pdf_path, output_dir):
    """Convert one PDF to a Word file inside ``output_dir``.

    Args:
        pdf_path: ``pathlib.Path`` of the source PDF.
        output_dir: ``pathlib.Path`` of the directory receiving the ``.docx``
            file, which keeps the PDF's base name.

    Returns:
        ``"Success: <name>"`` or ``"Error: <name> - <reason>"``. Failures are
        reported instead of raised so one bad file does not abort a batch.
    """
    output_path = output_dir / pdf_path.with_suffix('.docx').name
    try:
        cv = Converter(str(pdf_path))
        try:
            cv.convert(str(output_path))
        finally:
            # Close the converter even when convert() fails, so the PDF is
            # not left open for the rest of the batch.
            cv.close()
    except Exception as e:
        return f"Error: {pdf_path.name} - {e}"
    return f"Success: {pdf_path.name}"
def batch_convert(input_dir, output_dir, max_workers=4):
    """Convert every ``*.pdf`` directly inside ``input_dir`` to Word.

    Prints one status line per file, in sorted file-name order.

    Args:
        input_dir: Directory searched (non-recursively) for PDF files.
        output_dir: Directory for the ``.docx`` files; created when missing,
            including missing parent directories.
        max_workers: Size of the thread pool running the conversions.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True allows a nested output directory that does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)

    # glob() yields files in arbitrary order; sort for reproducible output.
    pdf_files = sorted(input_path.glob('*.pdf'))

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(convert_single, pdf, output_path)
            for pdf in pdf_files
        ]
        # Results are printed in submission order, not completion order.
        for future in futures:
            print(future.result())
batch_convert('./pdfs', './word_docs')
Parsing PDF Structure
解析PDF结构
python
from pdf2docx import Converter
def analyze_pdf(pdf_path):
    """Print a per-page summary of a PDF: page size, block count, block kinds.

    Args:
        pdf_path: Path of the PDF to inspect.

    NOTE(review): ``cv.pages`` and ``page.blocks`` are read straight after
    opening the file; confirm that the installed pdf2docx version populates
    them without an explicit parse step.
    """
    cv = Converter(pdf_path)
    try:
        for i, page in enumerate(cv.pages):
            print(f"Page {i+1}:")
            print(f" Size: {page.width} x {page.height}")
            print(f" Blocks: {len(page.blocks)}")
            for block in page.blocks:
                if hasattr(block, 'text'):
                    # Preview only the first 50 characters of the block.
                    print(f" Text block: {block.text[:50]}...")
                elif hasattr(block, 'image'):
                    print(" Image block")
    finally:
        # Release the PDF even if inspecting a page raises.
        cv.close()
analyze_pdf('document.pdf')
python
from pdf2docx import Converter
def analyze_pdf(pdf_path):
    """Print a per-page summary of a PDF: page size, block count, block kinds.

    Args:
        pdf_path: Path of the PDF to inspect.

    NOTE(review): ``cv.pages`` and ``page.blocks`` are read straight after
    opening the file; confirm that the installed pdf2docx version populates
    them without an explicit parse step.
    """
    cv = Converter(pdf_path)
    try:
        for i, page in enumerate(cv.pages):
            print(f"Page {i+1}:")
            print(f" Size: {page.width} x {page.height}")
            print(f" Blocks: {len(page.blocks)}")
            for block in page.blocks:
                if hasattr(block, 'text'):
                    # Preview only the first 50 characters of the block.
                    print(f" Text block: {block.text[:50]}...")
                elif hasattr(block, 'image'):
                    print(" Image block")
    finally:
        # Release the PDF even if inspecting a page raises.
        cv.close()
analyze_pdf('document.pdf')
Best Practices
最佳实践
- Check PDF Type: Native PDFs convert better than scanned
- Preview First: Test with a few pages before full conversion
- Handle Tables: Complex tables may need manual adjustment
- Image Quality: Images are extracted at original resolution
- Font Handling: Some fonts may substitute to system defaults
- 检查PDF类型:原生PDF的转换效果优于扫描版
- 先预览:在全量转换前先测试几页
- 处理表格:复杂表格可能需要手动调整
- 图片质量:图片会以原始分辨率提取
- 字体处理:部分字体可能会替换为系统默认字体
Common Patterns
常见模式
Convert with Progress
带进度的转换
python
from pdf2docx import Converter
def convert_with_progress(pdf_path, output_path):
    """Convert a PDF to Word and print progress messages.

    The whole document is converted in one ``convert`` call. Calling
    ``convert`` once per page with the same ``output_path`` targets the same
    file every time, so only the last page would remain in the result.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Path of the Word document to write.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): assumes len(cv.pages) is the page count right after
        # opening; confirm for the installed pdf2docx version.
        total_pages = len(cv.pages)
        print(f"Converting {total_pages} pages...")
        cv.convert(output_path)
        print("Progress: 100.0%")
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()
    print("Conversion complete!")
from pdf2docx import Converter
def convert_with_progress(pdf_path, output_path):
    """Convert a PDF to Word and print progress messages.

    The whole document is converted in one ``convert`` call. Calling
    ``convert`` once per page with the same ``output_path`` targets the same
    file every time, so only the last page would remain in the result.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Path of the Word document to write.
    """
    cv = Converter(pdf_path)
    try:
        # NOTE(review): assumes len(cv.pages) is the page count right after
        # opening; confirm for the installed pdf2docx version.
        total_pages = len(cv.pages)
        print(f"Converting {total_pages} pages...")
        cv.convert(output_path)
        print("Progress: 100.0%")
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()
    print("Conversion complete!")


# Next section: Extract Tables Only
仅提取表格
python
from pdf2docx import Converter
from docx import Document
def extract_tables_to_word(pdf_path, output_path):
    """Write a Word document containing only the tables found in a PDF.

    The PDF is first converted in full to a temporary ``.docx``; the text of
    every table cell is then copied into a fresh document. Cell formatting is
    not carried over.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Path of the tables-only Word document to write.
    """
    # Imported here because this example's import list does not include them.
    import os
    import tempfile

    # A private temporary directory avoids clashing with an existing
    # 'temp_full.docx' and is removed even when a step below fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, 'temp_full.docx')

        # First do full conversion
        cv = Converter(pdf_path)
        try:
            cv.convert(temp_path)
        finally:
            cv.close()

        # Open and extract tables
        doc = Document(temp_path)
        new_doc = Document()
        for table in doc.tables:
            # Copy table to new document, one row at a time.
            new_table = new_doc.add_table(rows=0, cols=len(table.columns))
            for row in table.rows:
                new_row = new_table.add_row()
                for i, cell in enumerate(row.cells):
                    new_row.cells[i].text = cell.text
            new_doc.add_paragraph()  # Add spacing

        new_doc.save(output_path)
from pdf2docx import Converter
from docx import Document
def extract_tables_to_word(pdf_path, output_path):
    """Write a Word document containing only the tables found in a PDF.

    The PDF is first converted in full to a temporary ``.docx``; the text of
    every table cell is then copied into a fresh document. Cell formatting is
    not carried over.

    Args:
        pdf_path: Path of the source PDF.
        output_path: Path of the tables-only Word document to write.
    """
    # Imported here because this example's import list does not include them.
    import os
    import tempfile

    # A private temporary directory avoids clashing with an existing
    # 'temp_full.docx' and is removed even when a step below fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_path = os.path.join(tmp_dir, 'temp_full.docx')

        # First do full conversion
        cv = Converter(pdf_path)
        try:
            cv.convert(temp_path)
        finally:
            cv.close()

        # Open and extract tables
        doc = Document(temp_path)
        new_doc = Document()
        for table in doc.tables:
            # Copy table to new document, one row at a time.
            new_table = new_doc.add_table(rows=0, cols=len(table.columns))
            for row in table.rows:
                new_row = new_table.add_row()
                for i, cell in enumerate(row.cells):
                    new_row.cells[i].text = cell.text
            new_doc.add_paragraph()  # Add spacing

        new_doc.save(output_path)


# Next section: Examples
示例
Example 1: Contract Conversion
示例1:合同转换
python
from pdf2docx import Converter
import os
def convert_contract(pdf_path):
    """Convert a contract PDF to an editable Word file and report its size.

    The output is written next to the input as ``<base>_editable.docx``.
    The page count is printed before the conversion, and the output path and
    file size after it.

    Args:
        pdf_path: Path of the contract PDF.

    Returns:
        The path of the Word document that was written.
    """
    # Define output path
    base_name = os.path.splitext(pdf_path)[0]
    output_path = f"{base_name}_editable.docx"

    cv = Converter(pdf_path)
    try:
        # Check page count
        # NOTE(review): assumes len(cv.pages) is available before conversion;
        # confirm for the installed pdf2docx version.
        page_count = len(cv.pages)
        print(f"Processing {page_count} pages...")

        # Convert all pages
        cv.convert(output_path)
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()

    print(f"Created: {output_path}")
    print(f"File size: {os.path.getsize(output_path) / 1024:.1f} KB")
    return output_path
from pdf2docx import Converter
import os
def convert_contract(pdf_path):
    """Convert a contract PDF to an editable Word file and report its size.

    The output is written next to the input as ``<base>_editable.docx``.
    The page count is printed before the conversion, and the output path and
    file size after it.

    Args:
        pdf_path: Path of the contract PDF.

    Returns:
        The path of the Word document that was written.
    """
    # Define output path
    base_name = os.path.splitext(pdf_path)[0]
    output_path = f"{base_name}_editable.docx"

    cv = Converter(pdf_path)
    try:
        # Check page count
        # NOTE(review): assumes len(cv.pages) is available before conversion;
        # confirm for the installed pdf2docx version.
        page_count = len(cv.pages)
        print(f"Processing {page_count} pages...")

        # Convert all pages
        cv.convert(output_path)
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()

    print(f"Created: {output_path}")
    print(f"File size: {os.path.getsize(output_path) / 1024:.1f} KB")
    return output_path
# Usage
result = convert_contract('contract.pdf')
Example 2: Selective Page Conversion
示例2:选择性页面转换
python
from pdf2docx import Converter
def convert_selected_pages(pdf_path, page_ranges, output_path):
    """Convert specific 1-based, inclusive page ranges of a PDF to Word.

    Args:
        pdf_path: Path of the source PDF.
        page_ranges: List of ``(start, end)`` tuples, e.g.
            ``[(1, 3), (5, 7)]`` for pages 1-3 and 5-7.
        output_path: Path of the Word document to write.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If a range starts below page 1 or ends before it starts.
    """
    all_pages = []
    for start, end in page_ranges:
        # Reject bad ranges up front: start=0 would otherwise become page
        # index -1 after the 1-based to 0-based shift.
        if start < 1 or end < start:
            raise ValueError(f"Invalid page range: ({start}, {end})")
        all_pages.extend(range(start - 1, end))  # Convert to 0-indexed

    # Overlapping ranges would list a page twice; keep the first occurrence
    # of each index and preserve the requested order.
    all_pages = list(dict.fromkeys(all_pages))

    cv = Converter(pdf_path)
    try:
        cv.convert(output_path, pages=all_pages)
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()

    print(f"Converted pages: {page_ranges}")
    return output_path
from pdf2docx import Converter
def convert_selected_pages(pdf_path, page_ranges, output_path):
    """Convert specific 1-based, inclusive page ranges of a PDF to Word.

    Args:
        pdf_path: Path of the source PDF.
        page_ranges: List of ``(start, end)`` tuples, e.g.
            ``[(1, 3), (5, 7)]`` for pages 1-3 and 5-7.
        output_path: Path of the Word document to write.

    Returns:
        ``output_path``.

    Raises:
        ValueError: If a range starts below page 1 or ends before it starts.
    """
    all_pages = []
    for start, end in page_ranges:
        # Reject bad ranges up front: start=0 would otherwise become page
        # index -1 after the 1-based to 0-based shift.
        if start < 1 or end < start:
            raise ValueError(f"Invalid page range: ({start}, {end})")
        all_pages.extend(range(start - 1, end))  # Convert to 0-indexed

    # Overlapping ranges would list a page twice; keep the first occurrence
    # of each index and preserve the requested order.
    all_pages = list(dict.fromkeys(all_pages))

    cv = Converter(pdf_path)
    try:
        cv.convert(output_path, pages=all_pages)
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()

    print(f"Converted pages: {page_ranges}")
    return output_path
# Convert pages 1-5 and 10-15
convert_selected_pages(
    'long_document.pdf',
    [(1, 5), (10, 15)],
    'selected_pages.docx'
)
Example 3: PDF Report to Editable Template
示例3:PDF报告转可编辑模板
python
from pdf2docx import Converter
from docx import Document
def pdf_to_template(pdf_path, output_path):
    """Convert a PDF report to a Word template with placeholder fields.

    The PDF is converted to ``output_path``, then known labels in body
    paragraphs and table cells are rewritten with placeholders and the file
    is saved in place. Assigning ``.text`` replaces the paragraph or cell
    content as plain text.

    Args:
        pdf_path: Path of the source PDF report.
        output_path: Path of the Word template to write.
    """
    # Convert PDF to Word
    cv = Converter(pdf_path)
    try:
        cv.convert(output_path)
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()

    # Open and add placeholder fields
    doc = Document(output_path)

    # Replace common fields with placeholders
    replacements = {
        'Company Name': '[COMPANY_NAME]',
        'Date:': 'Date: [DATE]',
        'Prepared by:': 'Prepared by: [AUTHOR]',
    }

    def fill_placeholders(text):
        """Return ``text`` with every not-yet-applied replacement applied."""
        for old, new in replacements.items():
            # 'Date:' is still contained in 'Date: [DATE]', so text that is
            # processed twice would gain a second placeholder. Skipping text
            # that already holds the placeholder keeps the rewrite idempotent.
            # NOTE(review): python-docx is reported to yield a merged cell
            # once per grid position; verify against its documentation.
            if old in text and new not in text:
                text = text.replace(old, new)
        return text

    for para in doc.paragraphs:
        updated = fill_placeholders(para.text)
        if updated != para.text:
            para.text = updated

    # Also check tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                updated = fill_placeholders(cell.text)
                if updated != cell.text:
                    cell.text = updated

    doc.save(output_path)
    print(f"Template created: {output_path}")
pdf_to_template('annual_report.pdf', 'report_template.docx')
python
from pdf2docx import Converter
from docx import Document
def pdf_to_template(pdf_path, output_path):
    """Convert a PDF report to a Word template with placeholder fields.

    The PDF is converted to ``output_path``, then known labels in body
    paragraphs and table cells are rewritten with placeholders and the file
    is saved in place. Assigning ``.text`` replaces the paragraph or cell
    content as plain text.

    Args:
        pdf_path: Path of the source PDF report.
        output_path: Path of the Word template to write.
    """
    # Convert PDF to Word
    cv = Converter(pdf_path)
    try:
        cv.convert(output_path)
    finally:
        # Release the PDF even when the conversion raises.
        cv.close()

    # Open and add placeholder fields
    doc = Document(output_path)

    # Replace common fields with placeholders
    replacements = {
        'Company Name': '[COMPANY_NAME]',
        'Date:': 'Date: [DATE]',
        'Prepared by:': 'Prepared by: [AUTHOR]',
    }

    def fill_placeholders(text):
        """Return ``text`` with every not-yet-applied replacement applied."""
        for old, new in replacements.items():
            # 'Date:' is still contained in 'Date: [DATE]', so text that is
            # processed twice would gain a second placeholder. Skipping text
            # that already holds the placeholder keeps the rewrite idempotent.
            # NOTE(review): python-docx is reported to yield a merged cell
            # once per grid position; verify against its documentation.
            if old in text and new not in text:
                text = text.replace(old, new)
        return text

    for para in doc.paragraphs:
        updated = fill_placeholders(para.text)
        if updated != para.text:
            para.text = updated

    # Also check tables
    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                updated = fill_placeholders(cell.text)
                if updated != cell.text:
                    cell.text = updated

    doc.save(output_path)
    print(f"Template created: {output_path}")
pdf_to_template('annual_report.pdf', 'report_template.docx')
Example 4: Bulk Invoice Processing
示例4:批量发票处理
python
from pdf2docx import Converter
from pathlib import Path
import json
def process_invoices(input_folder, output_folder):
    """Convert every PDF invoice in a folder to an editable Word document.

    A ``conversion_log.json`` file listing the outcome for each invoice is
    written to ``output_folder`` and a one-line summary is printed.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.pdf``.
        output_folder: Directory for the ``.docx`` files and the log; created
            when missing, including missing parent directories.

    Returns:
        A list with one dict per invoice: ``file`` and ``status`` plus
        ``output`` on success or ``error`` on failure.
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    # parents=True allows a nested output folder that does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)

    results = []
    # Sorted so the log order does not depend on the filesystem.
    for pdf_file in sorted(input_path.glob('*.pdf')):
        output_file = output_path / pdf_file.with_suffix('.docx').name
        try:
            cv = Converter(str(pdf_file))
            try:
                cv.convert(str(output_file))
            finally:
                # Close the converter even when convert() fails so the PDF
                # is not left open while the remaining invoices run.
                cv.close()
        except Exception as e:
            # Record the failure and continue with the next invoice.
            results.append({
                'file': pdf_file.name,
                'status': 'error',
                'error': str(e)
            })
        else:
            results.append({
                'file': pdf_file.name,
                'status': 'success',
                'output': str(output_file)
            })

    # Save results log; explicit UTF-8 keeps non-ASCII file names readable
    # regardless of the platform's default encoding.
    log_file = output_path / 'conversion_log.json'
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Summary
    success = sum(1 for r in results if r['status'] == 'success')
    print(f"Converted {success}/{len(results)} files")
    return results
results = process_invoices('./invoices_pdf', './invoices_word')
python
from pdf2docx import Converter
from pathlib import Path
import json
def process_invoices(input_folder, output_folder):
    """Convert every PDF invoice in a folder to an editable Word document.

    A ``conversion_log.json`` file listing the outcome for each invoice is
    written to ``output_folder`` and a one-line summary is printed.

    Args:
        input_folder: Directory searched (non-recursively) for ``*.pdf``.
        output_folder: Directory for the ``.docx`` files and the log; created
            when missing, including missing parent directories.

    Returns:
        A list with one dict per invoice: ``file`` and ``status`` plus
        ``output`` on success or ``error`` on failure.
    """
    input_path = Path(input_folder)
    output_path = Path(output_folder)
    # parents=True allows a nested output folder that does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)

    results = []
    # Sorted so the log order does not depend on the filesystem.
    for pdf_file in sorted(input_path.glob('*.pdf')):
        output_file = output_path / pdf_file.with_suffix('.docx').name
        try:
            cv = Converter(str(pdf_file))
            try:
                cv.convert(str(output_file))
            finally:
                # Close the converter even when convert() fails so the PDF
                # is not left open while the remaining invoices run.
                cv.close()
        except Exception as e:
            # Record the failure and continue with the next invoice.
            results.append({
                'file': pdf_file.name,
                'status': 'error',
                'error': str(e)
            })
        else:
            results.append({
                'file': pdf_file.name,
                'status': 'success',
                'output': str(output_file)
            })

    # Save results log; explicit UTF-8 keeps non-ASCII file names readable
    # regardless of the platform's default encoding.
    log_file = output_path / 'conversion_log.json'
    with open(log_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)

    # Summary
    success = sum(1 for r in results if r['status'] == 'success')
    print(f"Converted {success}/{len(results)} files")
    return results
results = process_invoices('./invoices_pdf', './invoices_word')
Limitations
局限性
- Scanned PDFs require OCR preprocessing
- Complex layouts may not convert perfectly
- Some fonts may not be available
- Watermarks are included in conversion
- Protected/encrypted PDFs need password
- 扫描版PDF需要先进行OCR预处理
- 复杂布局可能无法完美转换
- 部分字体可能无法正常显示
- 水印会被包含在转换结果中
- 受保护/加密的PDF需要密码
Installation
安装
bash
pip install pdf2docx

# For image handling
pip install Pillow