高级示例¶
高级用法模式和集成示例。
多阶段处理管道¶
from doctra import DocResEngine, StructuredPDFParser
# 阶段 1:恢复文档
engine = DocResEngine(device="cuda")
enhanced_pdf = engine.restore_pdf(
pdf_path="low_quality.pdf",
output_path="enhanced.pdf",
task="appearance"
)
# 阶段 2:解析增强的文档
from doctra.engines.vlm.service import VLMStructuredExtractor
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
api_key="your-key"
)
parser = StructuredPDFParser(vlm=vlm_engine)
parser.parse(enhanced_pdf)
使用不同 VLM 提供商的自定义处理¶
from doctra import ChartTablePDFParser
from doctra.engines.vlm.service import VLMStructuredExtractor
# 使用 OpenAI
vlm_openai = VLMStructuredExtractor(
vlm_provider="openai",
api_key="sk-xxx"
)
parser_openai = ChartTablePDFParser(
extract_charts=True,
extract_tables=True,
vlm=vlm_openai
)
# 使用 Gemini(成本效益高)
vlm_gemini = VLMStructuredExtractor(
vlm_provider="gemini",
api_key="gemini-key"
)
parser_gemini = ChartTablePDFParser(
extract_charts=True,
extract_tables=True,
vlm=vlm_gemini
)
# 使用 Qianfan ERNIE(百度智能云)
vlm_qianfan = VLMStructuredExtractor(
vlm_provider="qianfan",
vlm_model="ernie-4.5-turbo-vl-32k",
api_key="qianfan-key"
)
parser_qianfan = ChartTablePDFParser(
extract_charts=True,
extract_tables=True,
vlm=vlm_qianfan
)
# 使用不同的提供商解析
parser_openai.parse("doc.pdf")
parser_gemini.parse("doc.pdf")
parser_qianfan.parse("doc.pdf")
并行批量处理¶
from concurrent.futures import ThreadPoolExecutor
from doctra import StructuredPDFParser
def process_pdf(pdf_path):
parser = StructuredPDFParser()
try:
parser.parse(pdf_path)
return f"成功:{pdf_path}"
except Exception as e:
return f"错误 {pdf_path}:{e}"
# 并行处理多个 PDF
pdf_files = ["doc1.pdf", "doc2.pdf", "doc3.pdf"]
with ThreadPoolExecutor(max_workers=3) as executor:
results = executor.map(process_pdf, pdf_files)
for result in results:
print(result)
动态 DPI 选择¶
import os
from doctra import StructuredPDFParser
def smart_parse(pdf_path):
# 根据文件大小选择 DPI
file_size = os.path.getsize(pdf_path) / (1024 * 1024) # MB
if file_size < 5:
dpi = 200 # 标准质量
elif file_size < 20:
dpi = 150 # 大文件使用较低 DPI
else:
dpi = 100 # 超大文件使用非常低的 DPI
parser = StructuredPDFParser(dpi=dpi)
print(f"以 {dpi} DPI 处理 {pdf_path}")
parser.parse(pdf_path)
smart_parse("document.pdf")
与数据分析集成¶
from doctra import ChartTablePDFParser
import pandas as pd
# 提取表格
from doctra.engines.vlm.service import VLMStructuredExtractor
vlm_engine = VLMStructuredExtractor(
vlm_provider="openai",
api_key="your-key"
)
parser = ChartTablePDFParser(
extract_tables=True,
vlm=vlm_engine
)
parser.parse("financial_report.pdf")
# 加载并分析提取的数据
excel_path = "outputs/financial_report/structured_parsing/parsed_tables_charts.xlsx"
xls = pd.ExcelFile(excel_path)
for sheet_name in xls.sheet_names:
df = pd.read_excel(xls, sheet_name=sheet_name)
print(f"\n表格:{sheet_name}")
print(df.describe())