Compare original and translation side by side
🇺🇸
Original
English
🇨🇳
Translation
Chinese
PDF 处理指南
PDF 处理指南
概述
概述
本指南涵盖使用 Python 库和命令行工具进行的基本 PDF 处理操作。有关高级功能、JavaScript 库和详细示例,请参阅 reference.md。如果需要填写 PDF 表单,请阅读 forms.md 并按照其说明操作。
本指南涵盖使用 Python 库和命令行工具进行的基本 PDF 处理操作。有关高级功能、JavaScript 库和详细示例,请参阅 reference.md。如果需要填写 PDF 表单,请阅读 forms.md 并按照其说明操作。
快速开始
快速开始
python
from pypdf import PdfReader, PdfWriter
python
from pypdf import PdfReader, PdfWriter
读取 PDF
读取 PDF
reader = PdfReader("document.pdf")
print(f"页数: {len(reader.pages)}")
reader = PdfReader("document.pdf")
print(f"页数: {len(reader.pages)}")
提取文本
提取文本
text = ""
for page in reader.pages:
text += page.extract_text()
text = ""
for page in reader.pages:
text += page.extract_text()
Python 库
Python 库
pypdf - 基本操作
pypdf - 基本操作
合并 PDF
合并 PDF
python
from pypdf import PdfWriter, PdfReader
writer = PdfWriter()
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
reader = PdfReader(pdf_file)
for page in reader.pages:
writer.add_page(page)
with open("merged.pdf", "wb") as output:
writer.write(output)
python
from pypdf import PdfWriter, PdfReader
writer = PdfWriter()
for pdf_file in ["doc1.pdf", "doc2.pdf", "doc3.pdf"]:
reader = PdfReader(pdf_file)
for page in reader.pages:
writer.add_page(page)
with open("merged.pdf", "wb") as output:
writer.write(output)
拆分 PDF
拆分 PDF
python
reader = PdfReader("input.pdf")
for i, page in enumerate(reader.pages):
writer = PdfWriter()
writer.add_page(page)
with open(f"page_{i+1}.pdf", "wb") as output:
writer.write(output)
python
reader = PdfReader("input.pdf")
for i, page in enumerate(reader.pages):
writer = PdfWriter()
writer.add_page(page)
with open(f"page_{i+1}.pdf", "wb") as output:
writer.write(output)
提取元数据
提取元数据
python
reader = PdfReader("document.pdf")
meta = reader.metadata
print(f"标题: {meta.title}")
print(f"作者: {meta.author}")
print(f"主题: {meta.subject}")
print(f"创建者: {meta.creator}")
python
reader = PdfReader("document.pdf")
meta = reader.metadata
print(f"标题: {meta.title}")
print(f"作者: {meta.author}")
print(f"主题: {meta.subject}")
print(f"创建者: {meta.creator}")
旋转页面
旋转页面
python
reader = PdfReader("input.pdf")
writer = PdfWriter()
page = reader.pages[0]
page.rotate(90) # 顺时针旋转90度
writer.add_page(page)
with open("rotated.pdf", "wb") as output:
writer.write(output)
python
reader = PdfReader("input.pdf")
writer = PdfWriter()
page = reader.pages[0]
page.rotate(90) # 顺时针旋转90度
writer.add_page(page)
with open("rotated.pdf", "wb") as output:
writer.write(output)
pdfplumber - 文本和表格提取
pdfplumber - 文本和表格提取
提取带布局的文本
提取带布局的文本
python
import pdfplumber
with pdfplumber.open("document.pdf") as pdf:
for page in pdf.pages:
text = page.extract_text()
print(text)
python
import pdfplumber
with pdfplumber.open("document.pdf") as pdf:
for page in pdf.pages:
text = page.extract_text()
print(text)
提取表格
提取表格
python
with pdfplumber.open("document.pdf") as pdf:
for i, page in enumerate(pdf.pages):
tables = page.extract_tables()
for j, table in enumerate(tables):
print(f"第 {i+1} 页的表格 {j+1}:")
for row in table:
print(row)
python
with pdfplumber.open("document.pdf") as pdf:
for i, page in enumerate(pdf.pages):
tables = page.extract_tables()
for j, table in enumerate(tables):
print(f"第 {i+1} 页的表格 {j+1}:")
for row in table:
print(row)
高级表格提取
高级表格提取
python
import pandas as pd
with pdfplumber.open("document.pdf") as pdf:
all_tables = []
for page in pdf.pages:
tables = page.extract_tables()
for table in tables:
if table: # 检查表格是否为空
df = pd.DataFrame(table[1:], columns=table[0])
all_tables.append(df)
python
import pandas as pd
with pdfplumber.open("document.pdf") as pdf:
all_tables = []
for page in pdf.pages:
tables = page.extract_tables()
for table in tables:
if table: # 检查表格是否为空
df = pd.DataFrame(table[1:], columns=table[0])
all_tables.append(df)
合并所有表格
合并所有表格
if all_tables:
combined_df = pd.concat(all_tables, ignore_index=True)
combined_df.to_excel("extracted_tables.xlsx", index=False)
if all_tables:
combined_df = pd.concat(all_tables, ignore_index=True)
combined_df.to_excel("extracted_tables.xlsx", index=False)
reportlab - 创建 PDF
reportlab - 创建 PDF
基本 PDF 创建
基本 PDF 创建
python
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
c = canvas.Canvas("hello.pdf", pagesize=letter)
width, height = letter
python
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
c = canvas.Canvas("hello.pdf", pagesize=letter)
width, height = letter
添加文本
添加文本
c.drawString(100, height - 100, "Hello World!")
c.drawString(100, height - 120, "这是用 reportlab 创建的 PDF")
c.drawString(100, height - 100, "Hello World!")
c.drawString(100, height - 120, "这是用 reportlab 创建的 PDF")
添加线条
添加线条
c.line(100, height - 140, 400, height - 140)
c.line(100, height - 140, 400, height - 140)
保存
保存
c.save()
c.save()
创建多页 PDF
创建多页 PDF
python
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
doc = SimpleDocTemplate("report.pdf", pagesize=letter)
styles = getSampleStyleSheet()
story = []
python
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, PageBreak
from reportlab.lib.styles import getSampleStyleSheet
doc = SimpleDocTemplate("report.pdf", pagesize=letter)
styles = getSampleStyleSheet()
story = []
添加内容
添加内容
title = Paragraph("报告标题", styles['Title'])
story.append(title)
story.append(Spacer(1, 12))
body = Paragraph("这是报告的正文内容。" * 20, styles['Normal'])
story.append(body)
story.append(PageBreak())
title = Paragraph("报告标题", styles['Title'])
story.append(title)
story.append(Spacer(1, 12))
body = Paragraph("这是报告的正文内容。" * 20, styles['Normal'])
story.append(body)
story.append(PageBreak())
第2页
第2页
story.append(Paragraph("第2页", styles['Heading1']))
story.append(Paragraph("第2页的内容", styles['Normal']))
story.append(Paragraph("第2页", styles['Heading1']))
story.append(Paragraph("第2页的内容", styles['Normal']))
构建 PDF
构建 PDF
doc.build(story)
doc.build(story)
命令行工具
命令行工具
pdftotext (poppler-utils)
pdftotext (poppler-utils)
bash
bash
提取文本
提取文本
pdftotext input.pdf output.txt
pdftotext input.pdf output.txt
提取文本并保留布局
提取文本并保留布局
pdftotext -layout input.pdf output.txt
pdftotext -layout input.pdf output.txt
提取指定页面
提取指定页面
pdftotext -f 1 -l 5 input.pdf output.txt # 第1-5页
pdftotext -f 1 -l 5 input.pdf output.txt # 第1-5页
qpdf
qpdf
bash
bash
合并 PDF
合并 PDF
qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
qpdf --empty --pages file1.pdf file2.pdf -- merged.pdf
拆分页面
拆分页面
qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
qpdf input.pdf --pages . 6-10 -- pages6-10.pdf
qpdf input.pdf --pages . 1-5 -- pages1-5.pdf
qpdf input.pdf --pages . 6-10 -- pages6-10.pdf
旋转页面
旋转页面
qpdf input.pdf output.pdf --rotate=+90:1 # 将第1页旋转90度
qpdf input.pdf output.pdf --rotate=+90:1 # 将第1页旋转90度
移除密码
移除密码
qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
qpdf --password=mypassword --decrypt encrypted.pdf decrypted.pdf
pdftk(如果可用)
pdftk(如果可用)
bash
bash
合并
合并
pdftk file1.pdf file2.pdf cat output merged.pdf
pdftk file1.pdf file2.pdf cat output merged.pdf
拆分
拆分
pdftk input.pdf burst
pdftk input.pdf burst
旋转
旋转
pdftk input.pdf rotate 1east output rotated.pdf
pdftk input.pdf rotate 1east output rotated.pdf
常见任务
常见任务
从扫描的 PDF 提取文本
从扫描的 PDF 提取文本
python
python
需要安装: pip install pytesseract pdf2image
需要安装: pip install pytesseract pdf2image
import pytesseract
from pdf2image import convert_from_path
import pytesseract
from pdf2image import convert_from_path
将 PDF 转换为图像
将 PDF 转换为图像
images = convert_from_path('scanned.pdf')
images = convert_from_path('scanned.pdf')
对每一页进行 OCR 识别
对每一页进行 OCR 识别
text = ""
for i, image in enumerate(images):
text += f"第 {i+1} 页:\n"
text += pytesseract.image_to_string(image)
text += "\n\n"
print(text)
text = ""
for i, image in enumerate(images):
text += f"第 {i+1} 页:\n"
text += pytesseract.image_to_string(image)
text += "\n\n"
print(text)
添加水印
添加水印
python
from pypdf import PdfReader, PdfWriter
python
from pypdf import PdfReader, PdfWriter
创建水印(或加载现有的)
创建水印(或加载现有的)
watermark = PdfReader("watermark.pdf").pages[0]
watermark = PdfReader("watermark.pdf").pages[0]
应用到所有页面
应用到所有页面
reader = PdfReader("document.pdf")
writer = PdfWriter()
for page in reader.pages:
page.merge_page(watermark)
writer.add_page(page)
with open("watermarked.pdf", "wb") as output:
writer.write(output)
reader = PdfReader("document.pdf")
writer = PdfWriter()
for page in reader.pages:
page.merge_page(watermark)
writer.add_page(page)
with open("watermarked.pdf", "wb") as output:
writer.write(output)
提取图像
提取图像
bash
bash
使用 pdfimages (poppler-utils)
使用 pdfimages (poppler-utils)
pdfimages -j input.pdf output_prefix
pdfimages -j input.pdf output_prefix
这会将所有图像提取为 output_prefix-000.jpg、output_prefix-001.jpg 等
这会将所有图像提取为 output_prefix-000.jpg、output_prefix-001.jpg 等
密码保护
密码保护
python
from pypdf import PdfReader, PdfWriter
reader = PdfReader("input.pdf")
writer = PdfWriter()
for page in reader.pages:
writer.add_page(page)
python
from pypdf import PdfReader, PdfWriter
reader = PdfReader("input.pdf")
writer = PdfWriter()
for page in reader.pages:
writer.add_page(page)
添加密码
添加密码
writer.encrypt("userpassword", "ownerpassword")
with open("encrypted.pdf", "wb") as output:
writer.write(output)
writer.encrypt("userpassword", "ownerpassword")
with open("encrypted.pdf", "wb") as output:
writer.write(output)
快速参考
快速参考
| 任务 | 最佳工具 | 命令/代码 |
|---|---|---|
| 合并 PDF | pypdf | `writer.add_page(page)` |
| 拆分 PDF | pypdf | 每页一个文件 |
| 提取文本 | pdfplumber | `page.extract_text()` |
| 提取表格 | pdfplumber | `page.extract_tables()` |
| 创建 PDF | reportlab | Canvas 或 Platypus |
| 命令行合并 | qpdf | `qpdf --empty --pages ...` |
| OCR 扫描 PDF | pytesseract | 先转换为图像 |
| 填写 PDF 表单 | pdf-lib 或 pypdf(参见 forms.md) | 参见 forms.md |
| 任务 | 最佳工具 | 命令/代码 |
|---|---|---|
| 合并 PDF | pypdf | `writer.add_page(page)` |
| 拆分 PDF | pypdf | 每页一个文件 |
| 提取文本 | pdfplumber | `page.extract_text()` |
| 提取表格 | pdfplumber | `page.extract_tables()` |
| 创建 PDF | reportlab | Canvas 或 Platypus |
| 命令行合并 | qpdf | `qpdf --empty --pages ...` |
| OCR 扫描 PDF | pytesseract | 先转换为图像 |
| 填写 PDF 表单 | pdf-lib 或 pypdf(参见 forms.md) | 参见 forms.md |
后续步骤
后续步骤
- 有关 pypdfium2 的高级用法,请参阅 reference.md
- 有关 JavaScript 库(pdf-lib),请参阅 reference.md
- 如果需要填写 PDF 表单,请按照 forms.md 中的说明操作
- 有关故障排除指南,请参阅 reference.md
- 有关 pypdfium2 的高级用法,请参阅 reference.md
- 有关 JavaScript 库(pdf-lib),请参阅 reference.md
- 如果需要填写 PDF 表单,请按照 forms.md 中的说明操作
- 有关故障排除指南,请参阅 reference.md