337 lines
11 KiB
Python
337 lines
11 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
PDF目录摘取测试脚本 (与Java代码逻辑一致)
|
||
|
||
模拟 FileServiceImpl.extractDirectoryFromPDF 的逻辑,用于测试验证。
|
||
|
||
使用方法:
|
||
python test_catalog_extract.py <pdf文件路径>
|
||
|
||
依赖安装:
|
||
pip install pypdf
|
||
"""
|
||
|
||
import sys
|
||
import json
|
||
import os
|
||
from datetime import datetime
|
||
|
||
# 模拟Java的日志输出
|
||
def log_info(msg):
    """Print *msg* to stdout as an INFO line prefixed with the current HH:MM:SS time."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] INFO [目录摘取] {msg}")
|
||
|
||
def log_debug(msg):
    """Print *msg* to stdout as a DEBUG line prefixed with the current HH:MM:SS time."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] DEBUG [目录摘取] {msg}")
|
||
|
||
def log_warn(msg):
    """Print *msg* to stdout as a WARN line prefixed with the current HH:MM:SS time."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] WARN [目录摘取] {msg}")
|
||
|
||
def log_error(msg):
    """Print *msg* to stdout as an ERROR line prefixed with the current HH:MM:SS time."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] ERROR [目录摘取] {msg}")
|
||
|
||
|
||
def get_page_number_from_outline_item(reader, item):
    """Return the 1-based page number an outline item points to, or -1.

    Python counterpart of the Java helper
    ``getPageNumberFromOutlineItem(PDOutlineItem item)``.

    Args:
        reader: Object exposing ``get_destination_page_number(item)``, which
            is treated here as returning a 0-based page index.
        item: Outline entry. Only entries with a truthy ``page`` attribute
            are resolved.

    Returns:
        The 1-based page number, or -1 when the item has no usable ``page``
        attribute, the lookup fails, or the reader reports no valid index
        (``None`` or a negative value).
    """
    try:
        if not getattr(item, "page", None):
            return -1
        page_index = reader.get_destination_page_number(item)
        # Treat "not found" results the same way extract_outline_items does:
        # only a non-negative index counts as a resolved page.
        if page_index is None or page_index < 0:
            return -1
        return page_index + 1  # convert to 1-based
    except Exception:
        # Best-effort lookup: any library error means "page unknown".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        return -1
|
||
|
||
|
||
def extract_outline_items(reader, outline_items, chapters, level=1, indent=""):
    """Flatten a nested PDF outline into *chapters*, recording each entry's level.

    Python counterpart of the Java method
    ``extractOutlineItems(PDOutlineItem item, String indent,
    List<JSONObject> chapters, int level)``.

    Args:
        reader: Object exposing ``get_destination_page_number(item)``.
        outline_items: Sequence of outline entries; a nested ``list`` element
            holds child entries and is processed one level deeper.
        chapters: Output list. One dict per titled entry is appended in
            traversal order.
        level: Level assigned to the non-list entries of *outline_items*.
        indent: Prefix used only for the debug trace of each entry.

    Entries without a non-blank title are skipped. An entry whose page cannot
    be resolved keeps page 1. A failure while handling one entry is logged as
    a warning and does not stop the traversal.
    """
    if not outline_items:
        return

    for entry in outline_items:
        try:
            # A nested list carries the children of the preceding entry.
            if isinstance(entry, list):
                extract_outline_items(reader, entry, chapters, level + 1, indent + " ")
                continue

            raw_title = getattr(entry, "title", None)
            if not raw_title or not raw_title.strip():
                continue
            title = raw_title.strip()

            # Default to page 1 unless the reader yields a valid 0-based index.
            first_page = 1
            try:
                zero_based = reader.get_destination_page_number(entry)
                if zero_based is not None and zero_based >= 0:
                    first_page = zero_based + 1
            except Exception as e:
                log_debug(f"无法获取章节页码信息: {title} - {e}")

            chapters.append({
                "chapterId": f"chap-{len(chapters)}",
                "title": title,
                "level": level,
                "page": str(first_page),
                "startPage": first_page,
                # The end page is not computed here; it mirrors the start page.
                "endPage": first_page,
                "children": [],
            })
            log_debug(f"{indent}• {title} (第{first_page}页)")

        except Exception as e:
            log_warn(f"处理目录项失败: {e}")
|
||
|
||
|
||
def build_hierarchical_structure(chapters, max_depth=2):
    """Turn a flat, ordered chapter list into a nested tree.

    Python counterpart of the Java method
    ``buildHierarchicalStructure(List<JSONObject> chapters)``.

    Each chapter is attached to the closest preceding chapter whose ``level``
    is strictly smaller. A chapter with no such predecessor becomes a root.
    As a result, sibling top-level chapters all end up in the returned list,
    whether levels start at 0 or 1, and skipped levels (for example 1 -> 3)
    attach to the nearest shallower ancestor.

    Args:
        chapters: Flat list of chapter dicts in document order. Each dict may
            carry ``level`` (defaults to 1; negative values are treated as 0)
            and ``children``. The dicts are mutated in place: a missing or
            ``None`` ``children`` is replaced by a list, and child chapters
            are appended to it.
        max_depth: Maximum number of tree levels kept in the result. Children
            below that depth are dropped by ``limit_directory_depth``.

    Returns:
        The list of root chapter dicts (empty when *chapters* is empty or
        ``None``).
    """
    root_array = []
    if not chapters:
        return root_array

    # Stack of (level, chapter) pairs: the path from a root to the most
    # recently placed chapter. A stack (rather than one slot per level) keeps
    # siblings of the first chapter at the root and tolerates skipped levels.
    ancestors = []

    for chapter in chapters:
        level = max(0, chapter.get("level", 1))
        if chapter.get("children") is None:
            chapter["children"] = []

        # Drop every entry that cannot be a parent of this chapter.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()

        if ancestors:
            ancestors[-1][1]["children"].append(chapter)
        else:
            root_array.append(chapter)

        ancestors.append((level, chapter))

    # Cap the directory depth.
    limit_directory_depth(root_array, max_depth)

    return root_array


def limit_directory_depth(chapters, max_depth, current_depth=1):
    """Truncate a chapter tree in place to at most *max_depth* levels.

    Args:
        chapters: List of chapter dicts located at depth *current_depth*.
        max_depth: Deepest level to keep. Nodes at that depth get their
            ``children`` replaced by an empty list.
        current_depth: Depth of *chapters* within the tree (roots are 1).
    """
    if current_depth >= max_depth:
        for chapter in chapters:
            chapter["children"] = []
    else:
        for chapter in chapters:
            if chapter.get("children"):
                limit_directory_depth(chapter["children"], max_depth, current_depth + 1)
|
||
|
||
|
||
def extract_directory_from_pdf(file_path, file_id="test-file", file_name=None):
    """Extract the chapter directory of a PDF file.

    Python counterpart of the Java method
    ``extractDirectoryFromPDF(String filePath, String fileId, String fileName)``.

    Args:
        file_path: Path of the PDF to read.
        file_id: Accepted for parity with the Java signature; not used here.
        file_name: Display name of the file. Defaults to the base name of
            *file_path*.

    Returns:
        A dict with the keys ``source``, ``total_pages``, ``flat_chapters``
        and ``hierarchical_chapters``. ``source`` is ``"pdf_outline"`` when
        chapters were read from the PDF outline, and ``"filename_fallback"``
        when a single chapter named after the file (spanning all pages) is
        used instead. Returns ``None`` when neither pypdf nor PyPDF2 can be
        imported or when reading the PDF fails.
    """
    # Prefer pypdf; fall back to the PyPDF2 package name.
    try:
        from pypdf import PdfReader
    except ImportError:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            log_error("请先安装 pypdf: pip install pypdf")
            return None

    if file_name is None:
        file_name = os.path.basename(file_path)

    log_info(f"开始处理PDF文件: {file_name}")

    try:
        # Load the document.
        pdf = PdfReader(file_path)
        page_count = len(pdf.pages)
        log_info(f"PDF总页数: {page_count}")

        # First choice: the outline embedded in the PDF.
        flat = []
        bookmarks = pdf.outline
        if bookmarks:
            log_info("发现PDF大纲,开始提取...")
            extract_outline_items(pdf, bookmarks, flat, level=1, indent="")
            log_info(f"从大纲提取到 {len(flat)} 个章节")

            if flat:
                tree = build_hierarchical_structure(flat)
                log_info(f"成功构建层级目录结构,根节点数: {len(tree)}")
                return {
                    "source": "pdf_outline",
                    "total_pages": page_count,
                    "flat_chapters": flat,
                    "hierarchical_chapters": tree,
                }

        # Fallback: one chapter named after the file, without a ".pdf" suffix.
        if file_name and file_name.lower().endswith(".pdf"):
            fallback_title = file_name[:-4]
        else:
            fallback_title = file_name
        if not (fallback_title and fallback_title.strip()):
            fallback_title = "文档内容"

        single_chapter = [{
            "chapterId": "chap-0",
            "title": fallback_title,
            "level": 1,
            "page": "1",
            "startPage": 1,
            "endPage": page_count,
            "children": [],
        }]

        log_info(f"使用文件名作为默认章节: '{fallback_title}', 页数范围: 1-{page_count}")

        return {
            "source": "filename_fallback",
            "total_pages": page_count,
            "flat_chapters": single_chapter,
            "hierarchical_chapters": single_chapter,
        }

    except Exception as e:
        # Top-level boundary: report the failure with a traceback, return None.
        log_error(f"处理PDF失败: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||
|
||
|
||
def print_catalog_tree(chapters, indent=0):
    """Print a chapter tree to stdout, one line per chapter.

    Args:
        chapters: List of chapter dicts. ``title``, ``startPage``, ``level``
            and ``children`` are read, with placeholders for missing values.
        indent: Nesting depth of *chapters*. Nested lines are prefixed with
            one space per depth step followed by a branch marker.
    """
    lead = " " * indent
    if indent > 0:
        lead += "├─ "

    for node in chapters:
        name = node.get("title", "未命名")
        first_page = node.get("startPage", "?")
        depth = node.get("level", 1)
        print(f"{lead}[L{depth}] {name} (第{first_page}页)")

        nested = node.get("children", [])
        if nested:
            print_catalog_tree(nested, indent + 1)
|
||
|
||
|
||
def main():
    """Command-line entry point: extract a PDF's catalog, print it, save it as JSON.

    Reads the PDF path from ``sys.argv[1]``. Prints the extraction summary,
    the flat chapter list and the chapter tree, then writes the full result
    to ``<pdf path without extension>_catalog_test.json``.

    Exits with status 1 when no path is given, when the file does not exist,
    or when extraction returns ``None``.
    """
    banner = "=" * 70
    rule = "-" * 70
    short_rule = "-" * 40
    clock_format = '%Y-%m-%d %H:%M:%S'

    print(banner)
    print("PDF目录摘取测试脚本 (与Java extractDirectoryFromPDF 逻辑一致)")
    print(banner)

    if len(sys.argv) < 2:
        usage_lines = (
            "\n用法:",
            " python test_catalog_extract.py <pdf文件路径>",
            "\n示例:",
            " python test_catalog_extract.py ./document.pdf",
            " python test_catalog_extract.py /path/to/技术方案.pdf",
            "\n依赖:",
            " pip install pypdf",
            banner,
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    pdf_path = sys.argv[1]

    # Bail out early when the input file is missing.
    if not os.path.exists(pdf_path):
        log_error(f"文件不存在: {pdf_path}")
        sys.exit(1)

    print(f"\n📄 PDF文件: {pdf_path}")
    print(f"⏰ 开始时间: {datetime.now().strftime(clock_format)}")
    print(rule)

    # Time only the extraction call itself.
    started_at = datetime.now()
    result = extract_directory_from_pdf(pdf_path)
    elapsed = (datetime.now() - started_at).total_seconds()

    if result is None:
        log_error("目录提取失败")
        sys.exit(1)

    flat_chapters = result.get('flat_chapters', [])
    tree = result.get('hierarchical_chapters', [])

    print(rule)
    print("\n📊 提取结果:")
    print(f" 来源: {result.get('source', 'unknown')}")
    print(f" 总页数: {result.get('total_pages', '?')}")
    print(f" 扁平目录项数量: {len(flat_chapters)}")
    print(f" 层级根节点数量: {len(tree)}")
    print(f" 处理耗时: {elapsed:.3f}秒")

    # Flat listing, indented by chapter level.
    print("\n📋 扁平目录 (flat_chapters):")
    print(short_rule)
    for entry in flat_chapters:
        depth = entry.get("level", 1)
        pad = " " * (depth - 1)
        print(f"{pad}[{entry.get('chapterId')}] {entry.get('title')} (L{depth}, 第{entry.get('startPage')}页)")

    # Nested listing.
    print("\n🌳 层级目录 (hierarchical_chapters):")
    print(short_rule)
    print_catalog_tree(tree)

    # Persist the full result next to the input PDF.
    json_path = os.path.splitext(pdf_path)[0] + "_catalog_test.json"
    with open(json_path, 'w', encoding='utf-8') as out:
        json.dump(result, out, ensure_ascii=False, indent=2)
    print(f"\n✅ JSON结果已保存到: {json_path}")

    print("\n" + banner)
    print(f"⏰ 完成时间: {datetime.now().strftime(clock_format)}")
    print(banner)
|
||
|
||
|
||
# Run the command-line driver only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|