I第一次提交
This commit is contained in:
336
test_catalog_extract.py
Normal file
336
test_catalog_extract.py
Normal file
@@ -0,0 +1,336 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
PDF目录摘取测试脚本 (与Java代码逻辑一致)
|
||||
|
||||
模拟 FileServiceImpl.extractDirectoryFromPDF 的逻辑,用于测试验证。
|
||||
|
||||
使用方法:
|
||||
python test_catalog_extract.py <pdf文件路径>
|
||||
|
||||
依赖安装:
|
||||
pip install pypdf
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime
|
||||
|
||||
# 模拟Java的日志输出
|
||||
def log_info(msg):
    """Print *msg* to stdout as an INFO line prefixed with the current HH:MM:SS time."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] INFO [目录摘取] {msg}")
|
||||
|
||||
def log_debug(msg):
    """Print *msg* to stdout as a DEBUG line prefixed with the current HH:MM:SS time."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] DEBUG [目录摘取] {msg}")
|
||||
|
||||
def log_warn(msg):
    """Print *msg* to stdout as a WARN line prefixed with the current HH:MM:SS time."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] WARN [目录摘取] {msg}")
|
||||
|
||||
def log_error(msg):
    """Print *msg* to stdout as an ERROR line prefixed with the current HH:MM:SS time."""
    stamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{stamp}] ERROR [目录摘取] {msg}")
|
||||
|
||||
|
||||
def get_page_number_from_outline_item(reader, item):
    """Return the 1-based page number an outline item points to, or -1.

    Counterpart of the Java method getPageNumberFromOutlineItem(PDOutlineItem item).

    Args:
        reader: Object exposing ``get_destination_page_number(item)`` that
            yields a 0-based page index (presumably a pypdf ``PdfReader`` —
            confirm against the caller).
        item: Outline entry; it is only resolved when it has a truthy
            ``page`` attribute.

    Returns:
        The 1-based page number, or -1 when the item has no usable ``page``
        attribute, the reader reports no page (``None`` or a negative
        index), or the lookup raises.
    """
    try:
        if not getattr(item, "page", None):
            return -1
        page_index = reader.get_destination_page_number(item)
        # "Not found" may be reported as None or as a negative index; neither
        # must be turned into a bogus page number such as 0.
        if page_index is None or page_index < 0:
            return -1
        return page_index + 1  # convert to 1-based
    except Exception:
        # Best-effort lookup: any failure while resolving the destination
        # means "page unknown". Unlike a bare ``except``, this lets
        # KeyboardInterrupt / SystemExit propagate.
        return -1
|
||||
|
||||
|
||||
def extract_outline_items(reader, outline_items, chapters, level=1, indent=""):
    """Recursively flatten PDF outline entries into ``chapters``.

    Counterpart of the Java method
    extractOutlineItems(PDOutlineItem item, String indent, List<JSONObject> chapters, int level).

    Args:
        reader: Object whose ``get_destination_page_number(entry)`` returns a
            0-based page index for an outline entry.
        outline_items: Sequence of outline entries; a nested list is treated
            as the children of the current level and walked with ``level + 1``.
        chapters: Output list, appended to in place with one dict per entry
            that has a non-blank title.
        level: Level number recorded on the chapters found at this depth.
        indent: Prefix used only for the debug log line of each chapter.

    Returns:
        None. Results are delivered through ``chapters``.
    """
    if not outline_items:
        return

    for entry in outline_items:
        try:
            # A nested list holds the sub-entries one level deeper.
            if isinstance(entry, list):
                extract_outline_items(reader, entry, chapters, level + 1, indent + " ")
                continue

            raw_title = getattr(entry, "title", None)
            if not (raw_title and raw_title.strip()):
                # Entries without a usable title are skipped.
                continue
            title = raw_title.strip()

            # Page 1 is the default whenever the destination cannot be resolved.
            first_page = 1
            try:
                zero_based = reader.get_destination_page_number(entry)
                if zero_based is not None and zero_based >= 0:
                    first_page = zero_based + 1  # reader index is 0-based
            except Exception as e:
                log_debug(f"无法获取章节页码信息: {title} - {e}")

            chapter = {
                "chapterId": f"chap-{len(chapters)}",
                "title": title,
                "level": level,
                "page": str(first_page),
                "startPage": first_page,
                # The end page is provisionally the same as the start page.
                "endPage": first_page,
                "children": [],
            }
            chapters.append(chapter)
            log_debug(f"{indent}• {title} (第{chapter['startPage']}页)")

        except Exception as e:
            log_warn(f"处理目录项失败: {e}")
|
||||
|
||||
|
||||
def build_hierarchical_structure(chapters, max_depth=2):
    """Nest a flat, document-ordered chapter list into a tree.

    Counterpart of the Java method buildHierarchicalStructure(List<JSONObject> chapters).

    Each chapter is attached to the closest preceding chapter that has a
    strictly smaller ``level``; a chapter with no such predecessor becomes a
    root. This works whether levels start at 0 or at 1 and tolerates skipped
    levels. The previous index-based lookup assumed 0-based levels, so with
    the 1-based levels produced in this file it attached every later
    top-level chapter underneath the first one.

    Args:
        chapters: List of chapter dicts in document order. Each dict may carry
            ``level`` (defaults to 1, negatives are clamped to 0) and
            ``children``. The dicts are mutated in place: their ``children``
            lists are filled and then pruned.
        max_depth: Maximum number of tree levels to keep; roots count as
            depth 1.

    Returns:
        The list of root chapter dicts (empty when ``chapters`` is empty).
    """
    roots = []
    if not chapters:
        return roots

    # Stack of (level, chapter) describing the path from a root down to the
    # most recently placed chapter.
    ancestors = []

    for chapter in chapters:
        level = max(0, chapter.get("level", 1))
        if chapter.get("children") is None:
            chapter["children"] = []

        # Drop everything that is not a strict ancestor of this chapter.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()

        if ancestors:
            ancestors[-1][1]["children"].append(chapter)
        else:
            roots.append(chapter)
        ancestors.append((level, chapter))

    # Limit the directory depth.
    limit_directory_depth(roots, max_depth)

    return roots


def limit_directory_depth(chapters, max_depth, current_depth=1):
    """Prune ``children`` so the tree is at most ``max_depth`` levels deep.

    Args:
        chapters: Chapter dicts located at ``current_depth``; mutated in place.
        max_depth: Deepest level that is kept. Chapters at this depth (or
            deeper) get their ``children`` replaced by an empty list.
        current_depth: Depth of ``chapters``; the top-level call uses 1.

    Returns:
        None.
    """
    if current_depth >= max_depth:
        for chapter in chapters:
            chapter["children"] = []
        return

    for chapter in chapters:
        if chapter.get("children"):
            limit_directory_depth(chapter["children"], max_depth, current_depth + 1)
|
||||
|
||||
|
||||
def extract_directory_from_pdf(file_path, file_id="test-file", file_name=None):
    """Extract the chapter directory of a PDF file.

    Counterpart of the Java method
    extractDirectoryFromPDF(String filePath, String fileId, String fileName).

    The PDF outline is used when it yields at least one chapter; otherwise a
    single chapter named after the file and spanning all pages is returned.

    Args:
        file_path: Path of the PDF file to read.
        file_id: Accepted for parity with the Java signature; not used here.
        file_name: Display name of the file; defaults to the base name of
            ``file_path``.

    Returns:
        A dict with the keys ``source``, ``total_pages``, ``flat_chapters``
        and ``hierarchical_chapters``, or None when neither pypdf nor PyPDF2
        can be imported or when processing the PDF raises.
    """
    # Prefer pypdf and fall back to the older PyPDF2 package name.
    try:
        from pypdf import PdfReader
    except ImportError:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            log_error("请先安装 pypdf: pip install pypdf")
            return None

    if file_name is None:
        file_name = os.path.basename(file_path)

    log_info(f"开始处理PDF文件: {file_name}")

    try:
        # Load the PDF document.
        pdf = PdfReader(file_path)
        page_count = len(pdf.pages)
        log_info(f"PDF总页数: {page_count}")

        # Try to extract the directory from the PDF outline.
        flat = []
        bookmarks = pdf.outline
        if bookmarks:
            log_info("发现PDF大纲,开始提取...")
            extract_outline_items(pdf, bookmarks, flat, level=1, indent="")
            log_info(f"从大纲提取到 {len(flat)} 个章节")

        if flat:
            # Build the hierarchical structure.
            tree = build_hierarchical_structure(flat)
            log_info(f"成功构建层级目录结构,根节点数: {len(tree)}")
            return {
                "source": "pdf_outline",
                "total_pages": page_count,
                "flat_chapters": flat,
                "hierarchical_chapters": tree,
            }

        # Fallback: use the file name (without a ".pdf" suffix) as the only chapter.
        fallback_title = file_name
        if fallback_title and fallback_title.lower().endswith(".pdf"):
            fallback_title = fallback_title[:-4]
        if not (fallback_title and fallback_title.strip()):
            fallback_title = "文档内容"

        single_chapter = {
            "chapterId": "chap-0",
            "title": fallback_title,
            "level": 1,
            "page": "1",
            "startPage": 1,
            "endPage": page_count,
            "children": [],
        }
        fallback_list = [single_chapter]

        log_info(f"使用文件名作为默认章节: '{fallback_title}', 页数范围: 1-{page_count}")

        return {
            "source": "filename_fallback",
            "total_pages": page_count,
            "flat_chapters": fallback_list,
            "hierarchical_chapters": fallback_list,
        }

    except Exception as e:
        log_error(f"处理PDF失败: {e}")
        import traceback
        traceback.print_exc()
        return None
|
||||
|
||||
|
||||
def print_catalog_tree(chapters, indent=0):
    """Print the chapter tree to stdout, one line per chapter.

    Args:
        chapters: Iterable of chapter dicts; ``title``, ``startPage``,
            ``level`` and ``children`` are read with fallbacks when missing.
        indent: Nesting depth of ``chapters``; nested levels get one leading
            space per depth plus a branch marker.
    """
    for node in chapters:
        branch = "├─ " if indent > 0 else ""
        lead = " " * indent + branch
        node_title = node.get("title", "未命名")
        first_page = node.get("startPage", "?")
        node_level = node.get("level", 1)
        print(f"{lead}[L{node_level}] {node_title} (第{first_page}页)")

        sub_nodes = node.get("children", [])
        if not sub_nodes:
            continue
        print_catalog_tree(sub_nodes, indent + 1)
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: extract a PDF's directory, print it, save it as JSON.

    Reads the PDF path from ``sys.argv[1]``. Exits with status 1 when no path
    is given, when the file does not exist, or when extraction returns None.
    On success the result is written next to the PDF as
    ``<pdf name without extension>_catalog_test.json``.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70
    short_rule = "-" * 40
    time_format = '%Y-%m-%d %H:%M:%S'

    print(heavy_rule)
    print("PDF目录摘取测试脚本 (与Java extractDirectoryFromPDF 逻辑一致)")
    print(heavy_rule)

    if len(sys.argv) < 2:
        usage_lines = (
            "\n用法:",
            " python test_catalog_extract.py <pdf文件路径>",
            "\n示例:",
            " python test_catalog_extract.py ./document.pdf",
            " python test_catalog_extract.py /path/to/技术方案.pdf",
            "\n依赖:",
            " pip install pypdf",
            heavy_rule,
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    pdf_path = sys.argv[1]

    # Make sure the file exists before doing anything else.
    if not os.path.exists(pdf_path):
        log_error(f"文件不存在: {pdf_path}")
        sys.exit(1)

    print(f"\n📄 PDF文件: {pdf_path}")
    print(f"⏰ 开始时间: {datetime.now().strftime(time_format)}")
    print(light_rule)

    # Time only the extraction itself.
    started_at = datetime.now()
    result = extract_directory_from_pdf(pdf_path)
    duration = (datetime.now() - started_at).total_seconds()

    if result is None:
        log_error("目录提取失败")
        sys.exit(1)

    flat_chapters = result.get('flat_chapters', [])
    root_chapters = result.get('hierarchical_chapters', [])

    print(light_rule)
    print("\n📊 提取结果:")
    print(f" 来源: {result.get('source', 'unknown')}")
    print(f" 总页数: {result.get('total_pages', '?')}")
    print(f" 扁平目录项数量: {len(flat_chapters)}")
    print(f" 层级根节点数量: {len(root_chapters)}")
    print(f" 处理耗时: {duration:.3f}秒")

    # Print the flat directory, indented by level.
    print("\n📋 扁平目录 (flat_chapters):")
    print(short_rule)
    for entry in flat_chapters:
        entry_level = entry.get("level", 1)
        pad = " " * (entry_level - 1)
        print(f"{pad}[{entry.get('chapterId')}] {entry.get('title')} (L{entry_level}, 第{entry.get('startPage')}页)")

    # Print the hierarchical directory.
    print("\n🌳 层级目录 (hierarchical_chapters):")
    print(short_rule)
    print_catalog_tree(root_chapters)

    # Save the result as JSON next to the PDF.
    output_file = os.path.splitext(pdf_path)[0] + "_catalog_test.json"
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(result, out, ensure_ascii=False, indent=2)
    print(f"\n✅ JSON结果已保存到: {output_file}")

    print("\n" + heavy_rule)
    print(f"⏰ 完成时间: {datetime.now().strftime(time_format)}")
    print(heavy_rule)
|
||||
|
||||
|
||||
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user