Files
smart-admin/test_catalog_extract.py
2025-12-13 23:00:09 +08:00

337 lines
11 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF目录摘取测试脚本 (与Java代码逻辑一致)
模拟 FileServiceImpl.extractDirectoryFromPDF 的逻辑,用于测试验证。
使用方法:
python test_catalog_extract.py <pdf文件路径>
依赖安装:
pip install pypdf
"""
import sys
import json
import os
from datetime import datetime
# 模拟Java的日志输出
def log_info(msg):
    """Print *msg* to stdout as an INFO line prefixed with the current HH:MM:SS time."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] INFO [目录摘取] {msg}")
def log_debug(msg):
    """Print *msg* to stdout as a DEBUG line prefixed with the current HH:MM:SS time."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] DEBUG [目录摘取] {msg}")
def log_warn(msg):
    """Print *msg* to stdout as a WARN line prefixed with the current HH:MM:SS time."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] WARN [目录摘取] {msg}")
def log_error(msg):
    """Print *msg* to stdout as an ERROR line prefixed with the current HH:MM:SS time."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] ERROR [目录摘取] {msg}")
def get_page_number_from_outline_item(reader, item):
    """
    Counterpart of Java: getPageNumberFromOutlineItem(PDOutlineItem item)

    Resolve the 1-based page number an outline item points to.

    Args:
        reader: Object exposing ``get_destination_page_number(item)``.
        item: Outline entry; it is only looked up when it has a truthy
            ``page`` attribute.

    Returns:
        The 1-based page number, or -1 when the item has no usable ``page``
        attribute, the lookup raises, or the lookup reports "not found"
        (``None`` or a negative index).
    """
    try:
        if not getattr(item, 'page', None):
            return -1
        page_index = reader.get_destination_page_number(item)
    except Exception:
        # Best-effort lookup: a malformed destination must not abort the
        # extraction. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still propagate.
        return -1

    # A missing page is reported as None (or a negative index); never turn
    # that into a bogus page 0.
    if page_index is None or page_index < 0:
        return -1
    return page_index + 1  # convert 0-based index to 1-based page number
def extract_outline_items(reader, outline_items, chapters, level=1, indent=""):
    """
    Counterpart of Java: extractOutlineItems(PDOutlineItem item, String indent, List<JSONObject> chapters, int level)

    Walk a PDF outline recursively and append one chapter dict per titled
    entry to ``chapters`` (which is mutated in place).

    Args:
        reader: Object exposing ``get_destination_page_number(item)``.
        outline_items: Sequence of outline entries; a nested list is treated
            as a group of entries one level deeper.
        chapters: Output list that receives the flat chapter dicts.
        level: Level recorded on entries of this sequence.
        indent: Prefix used only for the debug log line of each entry.
    """
    if not outline_items:
        return

    for entry in outline_items:
        try:
            # A nested list holds sub-entries: recurse one level deeper.
            if isinstance(entry, list):
                extract_outline_items(reader, entry, chapters, level + 1, indent + " ")
                continue

            # Entries without a non-blank title are skipped.
            title = getattr(entry, 'title', None)
            if not (title and title.strip()):
                continue
            title = title.strip()

            # Page defaults to 1 whenever the destination cannot be resolved.
            first_page = 1
            try:
                page_index = reader.get_destination_page_number(entry)
                if page_index is not None and page_index >= 0:
                    first_page = page_index + 1  # 0-based index -> 1-based page
            except Exception as exc:
                log_debug(f"无法获取章节页码信息: {title} - {exc}")

            record = {
                "chapterId": f"chap-{len(chapters)}",
                "title": title,
                "level": level,
                "page": str(first_page),
                "startPage": first_page,
                "endPage": first_page,  # provisionally the same as the start page
                "children": []
            }
            chapters.append(record)
            log_debug(f"{indent}{title} (第{record['startPage']}页)")
        except Exception as exc:
            # One broken entry must not stop the rest of the outline.
            log_warn(f"处理目录项失败: {exc}")
def build_hierarchical_structure(chapters, max_depth=2):
    """
    Counterpart of Java: buildHierarchicalStructure(List<JSONObject> chapters)

    Nest a flat, document-ordered chapter list into a tree using each
    chapter's ``level`` value, then trim the tree to ``max_depth``.

    Levels are rebased on the shallowest level present in ``chapters``, so
    both 1-based levels (as emitted by extract_outline_items) and 0-based
    levels yield the same shape: chapters at the shallowest level become
    roots and each deeper chapter is attached to the most recent chapter one
    level above it.

    Args:
        chapters: List of chapter dicts. The dicts are mutated in place
            (their ``children`` lists are filled and later trimmed) and the
            same dict objects appear in the returned tree.
        max_depth: Maximum number of tree levels kept (1 = roots only).

    Returns:
        List of root chapter dicts; empty when ``chapters`` is empty or None.
    """
    root_array = []
    if not chapters:
        return root_array

    # The nesting logic below is 0-based (relative level 0 == root). Without
    # rebasing, 1-based input would hang every later top-level chapter under
    # the first one.
    levels = [max(0, chapter.get("level", 1)) for chapter in chapters]
    base_level = min(levels)

    # level_parents[d] is the most recent chapter at relative depth d, or
    # None when that slot has been invalidated by a shallower chapter.
    level_parents = []
    for chapter, absolute_level in zip(chapters, levels):
        level = absolute_level - base_level
        if chapter.get("children") is None:
            chapter["children"] = []

        parent = None
        if level > 0 and level_parents and level_parents[0] is not None:
            # Clamp so a chapter that skips levels attaches to the deepest
            # tracked slot instead of indexing past the end.
            parent = level_parents[min(level - 1, len(level_parents) - 1)]

        if parent is None:
            # Root-level chapter, or no usable parent: start a new root and
            # reset the ancestor tracking.
            root_array.append(chapter)
            level_parents.clear()
            level_parents.append(chapter)
            continue

        parent.setdefault("children", []).append(chapter)

        # Record this chapter as the current node of its depth.
        while len(level_parents) < level:
            level_parents.append(None)
        if level >= len(level_parents):
            level_parents.append(chapter)
        else:
            level_parents[level] = chapter
        # Deeper slots belonged to the previous branch and are now stale.
        for idx in range(level + 1, len(level_parents)):
            level_parents[idx] = None

    limit_directory_depth(root_array, max_depth)
    return root_array


def limit_directory_depth(chapters, max_depth, current_depth=1):
    """
    Trim a chapter tree in place so that nothing sits deeper than ``max_depth``.

    Args:
        chapters: Chapter dicts located at depth ``current_depth``.
        max_depth: Deepest level that is kept; chapters at this depth (or
            deeper) get their ``children`` replaced by an empty list.
        current_depth: Depth of ``chapters``; the roots are depth 1.
    """
    for chapter in chapters:
        if current_depth >= max_depth:
            chapter["children"] = []
        elif chapter.get("children"):
            limit_directory_depth(chapter["children"], max_depth, current_depth + 1)
def extract_directory_from_pdf(file_path, file_id="test-file", file_name=None):
    """
    Counterpart of Java: extractDirectoryFromPDF(String filePath, String fileId, String fileName)

    Extract the chapter directory of a PDF.

    The PDF outline is used when it yields at least one chapter; otherwise a
    single chapter named after the file (without a trailing ".pdf") and
    spanning all pages is returned.

    Args:
        file_path: Path of the PDF to read.
        file_id: Accepted for parity with the Java signature; not used here.
        file_name: Display name; defaults to the base name of ``file_path``.

    Returns:
        A dict with keys ``source``, ``total_pages``, ``flat_chapters`` and
        ``hierarchical_chapters``, or None when neither pypdf nor PyPDF2 can
        be imported or when processing the PDF raises.
    """
    # Prefer pypdf and fall back to the legacy PyPDF2 package.
    try:
        from pypdf import PdfReader
    except ImportError:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            log_error("请先安装 pypdf: pip install pypdf")
            return None

    if file_name is None:
        file_name = os.path.basename(file_path)
    log_info(f"开始处理PDF文件: {file_name}")

    try:
        pdf = PdfReader(file_path)
        total_pages = len(pdf.pages)
        log_info(f"PDF总页数: {total_pages}")

        # First choice: the outline embedded in the PDF.
        chapters = []
        outline = pdf.outline
        if outline:
            log_info("发现PDF大纲开始提取...")
            extract_outline_items(pdf, outline, chapters, level=1, indent="")
            log_info(f"从大纲提取到 {len(chapters)} 个章节")

        if chapters:
            hierarchical_array = build_hierarchical_structure(chapters)
            log_info(f"成功构建层级目录结构,根节点数: {len(hierarchical_array)}")
            return {
                "source": "pdf_outline",
                "total_pages": total_pages,
                "flat_chapters": chapters,
                "hierarchical_chapters": hierarchical_array
            }

        # Fallback: one chapter named after the file, covering every page.
        fallback_title = file_name
        if fallback_title and fallback_title.lower().endswith(".pdf"):
            fallback_title = fallback_title[:-4]
        if not fallback_title or not fallback_title.strip():
            fallback_title = "文档内容"

        only_chapter = {
            "chapterId": "chap-0",
            "title": fallback_title,
            "level": 1,
            "page": "1",
            "startPage": 1,
            "endPage": total_pages,
            "children": []
        }
        directory_array = [only_chapter]
        log_info(f"使用文件名作为默认章节: '{fallback_title}', 页数范围: 1-{total_pages}")
        return {
            "source": "filename_fallback",
            "total_pages": total_pages,
            "flat_chapters": directory_array,
            "hierarchical_chapters": directory_array
        }
    except Exception as exc:
        # Top-level boundary for this script: report the failure and signal
        # it to the caller with None.
        log_error(f"处理PDF失败: {exc}")
        import traceback
        traceback.print_exc()
        return None
def print_catalog_tree(chapters, indent=0):
    """
    Print a chapter tree to stdout, one line per chapter.

    Args:
        chapters: Chapter dicts; ``title``, ``startPage``, ``level`` and
            ``children`` are read, each with a fallback when missing.
        indent: Nesting depth of ``chapters``; nested lines get ``indent``
            leading spaces followed by a branch marker.
    """
    branch_marker = "├─ " if indent > 0 else ""
    lead = " " * indent + branch_marker

    for node in chapters:
        heading = node.get("title", "未命名")
        first_page = node.get("startPage", "?")
        depth = node.get("level", 1)
        print(f"{lead}[L{depth}] {heading} (第{first_page}页)")

        sub_nodes = node.get("children", [])
        if sub_nodes:
            print_catalog_tree(sub_nodes, indent + 1)
def main():
    """
    Command-line entry point.

    Reads the PDF path from ``sys.argv[1]``, extracts its directory, prints a
    summary plus the flat and hierarchical listings, and writes the result as
    JSON next to the PDF (``<pdf base name>_catalog_test.json``). Exits with
    status 1 when the argument is missing, the file does not exist, or the
    extraction fails.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70
    short_rule = "-" * 40
    stamp_format = '%Y-%m-%d %H:%M:%S'

    print(heavy_rule)
    print("PDF目录摘取测试脚本 (与Java extractDirectoryFromPDF 逻辑一致)")
    print(heavy_rule)

    # No path given: show usage and stop.
    if len(sys.argv) < 2:
        usage_lines = (
            "\n用法:",
            " python test_catalog_extract.py <pdf文件路径>",
            "\n示例:",
            " python test_catalog_extract.py ./document.pdf",
            " python test_catalog_extract.py /path/to/技术方案.pdf",
            "\n依赖:",
            " pip install pypdf",
            heavy_rule,
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    pdf_path = sys.argv[1]
    if not os.path.exists(pdf_path):
        log_error(f"文件不存在: {pdf_path}")
        sys.exit(1)

    print(f"\n📄 PDF文件: {pdf_path}")
    print(f"⏰ 开始时间: {datetime.now().strftime(stamp_format)}")
    print(light_rule)

    # Time only the extraction itself.
    started_at = datetime.now()
    result = extract_directory_from_pdf(pdf_path)
    elapsed = datetime.now() - started_at
    duration = elapsed.total_seconds()

    if result is None:
        log_error("目录提取失败")
        sys.exit(1)

    print(light_rule)
    print("\n📊 提取结果:")
    print(f" 来源: {result.get('source', 'unknown')}")
    print(f" 总页数: {result.get('total_pages', '?')}")
    print(f" 扁平目录项数量: {len(result.get('flat_chapters', []))}")
    print(f" 层级根节点数量: {len(result.get('hierarchical_chapters', []))}")
    print(f" 处理耗时: {duration:.3f}")

    # Flat listing, indented by chapter level.
    print("\n📋 扁平目录 (flat_chapters):")
    print(short_rule)
    for entry in result.get("flat_chapters", []):
        depth = entry.get("level", 1)
        pad = " " * (depth - 1)
        print(f"{pad}[{entry.get('chapterId')}] {entry.get('title')} (L{depth}, 第{entry.get('startPage')}页)")

    # Hierarchical listing.
    print("\n🌳 层级目录 (hierarchical_chapters):")
    print(short_rule)
    print_catalog_tree(result.get("hierarchical_chapters", []))

    # Persist the full result as JSON beside the input PDF.
    base_path, _extension = os.path.splitext(pdf_path)
    output_file = base_path + "_catalog_test.json"
    with open(output_file, 'w', encoding='utf-8') as handle:
        json.dump(result, handle, ensure_ascii=False, indent=2)
    print(f"\n✅ JSON结果已保存到: {output_file}")

    print("\n" + heavy_rule)
    print(f"⏰ 完成时间: {datetime.now().strftime(stamp_format)}")
    print(heavy_rule)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()