I第一次提交

This commit is contained in:
zhangzf1119
2025-12-13 23:00:09 +08:00
commit ac08a0b6ff
180 changed files with 28023 additions and 0 deletions

336
test_catalog_extract.py Normal file
View File

@@ -0,0 +1,336 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF目录摘取测试脚本 (与Java代码逻辑一致)
模拟 FileServiceImpl.extractDirectoryFromPDF 的逻辑,用于测试验证。
使用方法:
python test_catalog_extract.py <pdf文件路径>
依赖安装:
pip install pypdf
"""
import sys
import json
import os
from datetime import datetime
# 模拟Java的日志输出
def log_info(msg):
    """Print ``msg`` to stdout as an INFO line prefixed with the current HH:MM:SS time.

    The layout imitates the log output of the Java service this script mirrors.
    """
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] INFO [目录摘取] {msg}")
def log_debug(msg):
    """Print ``msg`` to stdout as a DEBUG line prefixed with the current HH:MM:SS time."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] DEBUG [目录摘取] {msg}")
def log_warn(msg):
    """Print ``msg`` to stdout as a WARN line prefixed with the current HH:MM:SS time."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] WARN [目录摘取] {msg}")
def log_error(msg):
    """Print ``msg`` to stdout as an ERROR line prefixed with the current HH:MM:SS time."""
    timestamp = datetime.now().strftime('%H:%M:%S')
    print(f"[{timestamp}] ERROR [目录摘取] {msg}")
def get_page_number_from_outline_item(reader, item):
    """Return the 1-based page number an outline item points to, or -1.

    Counterpart of the Java ``getPageNumberFromOutlineItem(PDOutlineItem item)``.

    Args:
        reader: PDF reader object exposing ``get_destination_page_number(item)``.
        item: Outline entry. Only entries with a truthy ``page`` attribute are
            resolved.

    Returns:
        The 1-based page number, or -1 when the item has no page, the reader
        reports no valid page (``None`` or a negative index), or the lookup
        raises.
    """
    try:
        if getattr(item, 'page', None):
            page_index = reader.get_destination_page_number(item)
            # A missing destination may come back as None or a negative index;
            # neither may be turned into a page number (e.g. -1 + 1 == 0).
            if page_index is not None and page_index >= 0:
                return page_index + 1  # reader indices are 0-based
    except Exception:
        # Best-effort lookup: a malformed destination must not abort the
        # extraction. KeyboardInterrupt/SystemExit still propagate.
        return -1
    return -1
def extract_outline_items(reader, outline_items, chapters, level=1, indent=""):
    """Flatten a PDF outline into ``chapters``, recording each entry's depth.

    Counterpart of the Java ``extractOutlineItems(PDOutlineItem item,
    String indent, List<JSONObject> chapters, int level)``.

    Args:
        reader: PDF reader used to resolve an entry's destination page via
            ``get_destination_page_number``.
        outline_items: Sequence of outline entries. A nested list is walked
            recursively as entries one level deeper.
        chapters: Output list; one chapter dict is appended per titled entry.
        level: Depth assigned to the entries of ``outline_items``.
        indent: Prefix used only for the debug log line of each entry.

    Entries without a non-blank title are skipped. When the page cannot be
    resolved the chapter falls back to page 1. A failure on one entry is
    logged as a warning and does not stop the remaining entries.
    """
    if not outline_items:
        return

    for entry in outline_items:
        try:
            # A list holds sub-entries: descend one level.
            if isinstance(entry, list):
                extract_outline_items(reader, entry, chapters, level + 1, indent + " ")
                continue

            raw_title = getattr(entry, 'title', None)
            if not raw_title or not raw_title.strip():
                continue
            title = raw_title.strip()

            # Default to page 1 unless the reader yields a usable page index.
            first_page = 1
            try:
                page_index = reader.get_destination_page_number(entry)
                if page_index is not None and page_index >= 0:
                    first_page = page_index + 1  # 0-based index -> 1-based page
            except Exception as e:
                log_debug(f"无法获取章节页码信息: {title} - {e}")

            chapters.append({
                "chapterId": f"chap-{len(chapters)}",
                "title": title,
                "level": level,
                "page": str(first_page),
                "startPage": first_page,
                # The end page is provisionally the same as the start page.
                "endPage": first_page,
                "children": [],
            })
            log_debug(f"{indent}{title} (第{first_page}页)")
        except Exception as e:
            log_warn(f"处理目录项失败: {e}")
def build_hierarchical_structure(chapters, max_depth=2):
    """Build a nested chapter tree from a flat, document-ordered chapter list.

    Counterpart of the Java ``buildHierarchicalStructure(List<JSONObject> chapters)``.

    Each chapter is attached to the nearest preceding chapter whose ``level``
    is strictly smaller; a chapter with no such predecessor becomes a root.
    This works whether levels start at 0 or at 1 and tolerates skipped levels,
    so sibling top-level chapters stay siblings instead of being nested under
    the first chapter.

    The tree is built from shallow copies of the input dicts, so the flat
    input list is left untouched (it stays flat, and depth limiting does not
    alter it).

    Args:
        chapters: Flat list of chapter dicts in document order. ``level``
            defaults to 1 when missing; negative levels are clamped to 0.
        max_depth: Maximum number of tree levels to keep (roots count as 1).

    Returns:
        List of root chapter dicts, each with a ``children`` list.
        An empty list when ``chapters`` is empty or ``None``.
    """
    root_array = []
    if not chapters:
        return root_array

    # Stack of (level, node) for the chain of currently open ancestors.
    ancestors = []
    for chapter in chapters:
        level = max(0, chapter.get("level", 1))

        node = dict(chapter)
        node["children"] = list(chapter.get("children") or [])

        # Close every open chapter that is not shallower than this one.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()

        if ancestors:
            ancestors[-1][1]["children"].append(node)
        else:
            root_array.append(node)
        ancestors.append((level, node))

    limit_directory_depth(root_array, max_depth)
    return root_array


def limit_directory_depth(chapters, max_depth, current_depth=1):
    """Truncate a chapter tree in place to at most ``max_depth`` levels.

    Args:
        chapters: Chapter dicts located at depth ``current_depth``.
        max_depth: Deepest level whose chapters are kept; their ``children``
            are replaced with an empty list.
        current_depth: Depth of ``chapters``; 1 for the roots.
    """
    if current_depth >= max_depth:
        for chapter in chapters:
            chapter["children"] = []
        return

    for chapter in chapters:
        if chapter.get("children"):
            limit_directory_depth(chapter["children"], max_depth, current_depth + 1)
def extract_directory_from_pdf(file_path, file_id="test-file", file_name=None):
    """Extract the chapter catalog of a PDF from its outline.

    Counterpart of the Java ``extractDirectoryFromPDF(String filePath,
    String fileId, String fileName)``.

    Args:
        file_path: Path of the PDF to read.
        file_id: Accepted for parity with the Java signature; not used here.
        file_name: Display name; defaults to the base name of ``file_path``.

    Returns:
        A dict with the keys ``source``, ``total_pages``, ``flat_chapters`` and
        ``hierarchical_chapters``. ``source`` is ``"pdf_outline"`` when the
        outline produced chapters, otherwise ``"filename_fallback"`` with one
        chapter named after the file and spanning every page.
        ``None`` when no PDF library can be imported or processing raises.
    """
    # Prefer pypdf; fall back to the PyPDF2 package name.
    try:
        from pypdf import PdfReader
    except ImportError:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            log_error("请先安装 pypdf: pip install pypdf")
            return None

    if file_name is None:
        file_name = os.path.basename(file_path)
    log_info(f"开始处理PDF文件: {file_name}")

    try:
        reader = PdfReader(file_path)
        page_count = len(reader.pages)
        log_info(f"PDF总页数: {page_count}")

        # First choice: the outline embedded in the PDF.
        outline_chapters = []
        pdf_outline = reader.outline
        if pdf_outline:
            log_info("发现PDF大纲开始提取...")
            extract_outline_items(reader, pdf_outline, outline_chapters, level=1, indent="")
            log_info(f"从大纲提取到 {len(outline_chapters)} 个章节")
            if outline_chapters:
                tree = build_hierarchical_structure(outline_chapters)
                log_info(f"成功构建层级目录结构,根节点数: {len(tree)}")
                return {
                    "source": "pdf_outline",
                    "total_pages": page_count,
                    "flat_chapters": outline_chapters,
                    "hierarchical_chapters": tree,
                }

        # Fallback: a single chapter named after the file (without ".pdf").
        fallback_title = file_name
        if fallback_title and fallback_title.lower().endswith(".pdf"):
            fallback_title = fallback_title[:-4]
        if not (fallback_title and fallback_title.strip()):
            fallback_title = "文档内容"

        single_chapter = {
            "chapterId": "chap-0",
            "title": fallback_title,
            "level": 1,
            "page": "1",
            "startPage": 1,
            "endPage": page_count,
            "children": [],
        }
        fallback_chapters = [single_chapter]
        log_info(f"使用文件名作为默认章节: '{fallback_title}', 页数范围: 1-{page_count}")
        return {
            "source": "filename_fallback",
            "total_pages": page_count,
            "flat_chapters": fallback_chapters,
            "hierarchical_chapters": fallback_chapters,
        }
    except Exception as e:
        log_error(f"处理PDF失败: {e}")
        import traceback
        traceback.print_exc()
        return None
def print_catalog_tree(chapters, indent=0):
    """Print chapters as an indented tree, each parent before its children.

    Args:
        chapters: Iterable of chapter dicts. ``title``, ``startPage``,
            ``level`` and ``children`` are read, each with a fallback when
            the key is missing.
        indent: Nesting depth of ``chapters``; entries deeper than 0 get a
            branch marker in front of their line.
    """
    # Explicit stack, pushed in reverse so entries pop in document order.
    pending = [(node, indent) for node in reversed(list(chapters))]
    while pending:
        node, depth = pending.pop()
        branch = "├─ " if depth > 0 else ""
        label = node.get("title", "未命名")
        first_page = node.get("startPage", "?")
        tier = node.get("level", 1)
        print(f"{' ' * depth}{branch}[L{tier}] {label} (第{first_page}页)")

        kids = node.get("children", [])
        if kids:
            pending.extend((kid, depth + 1) for kid in reversed(list(kids)))
def main():
    """Command-line entry point: extract one PDF's catalog and report it.

    Reads the PDF path from ``sys.argv[1]``. Prints usage and exits with
    status 1 when the argument is missing; also exits with status 1 when the
    file does not exist or extraction fails. On success prints a summary, the
    flat chapter list and the chapter tree, then writes the full result as
    JSON to ``<pdf path without extension>_catalog_test.json``.
    """
    heavy_rule = "=" * 70
    light_rule = "-" * 70
    section_rule = "-" * 40
    timestamp_format = '%Y-%m-%d %H:%M:%S'

    print(heavy_rule)
    print("PDF目录摘取测试脚本 (与Java extractDirectoryFromPDF 逻辑一致)")
    print(heavy_rule)

    if len(sys.argv) < 2:
        usage_lines = (
            "\n用法:",
            " python test_catalog_extract.py <pdf文件路径>",
            "\n示例:",
            " python test_catalog_extract.py ./document.pdf",
            " python test_catalog_extract.py /path/to/技术方案.pdf",
            "\n依赖:",
            " pip install pypdf",
            heavy_rule,
        )
        for usage_line in usage_lines:
            print(usage_line)
        sys.exit(1)

    pdf_path = sys.argv[1]
    if not os.path.exists(pdf_path):
        log_error(f"文件不存在: {pdf_path}")
        sys.exit(1)

    print(f"\n📄 PDF文件: {pdf_path}")
    print(f"⏰ 开始时间: {datetime.now().strftime(timestamp_format)}")
    print(light_rule)

    # Time only the extraction call itself.
    started_at = datetime.now()
    result = extract_directory_from_pdf(pdf_path)
    elapsed_seconds = (datetime.now() - started_at).total_seconds()

    if result is None:
        log_error("目录提取失败")
        sys.exit(1)

    flat_chapters = result.get('flat_chapters', [])
    tree_chapters = result.get('hierarchical_chapters', [])

    print(light_rule)
    print("\n📊 提取结果:")
    print(f" 来源: {result.get('source', 'unknown')}")
    print(f" 总页数: {result.get('total_pages', '?')}")
    print(f" 扁平目录项数量: {len(flat_chapters)}")
    print(f" 层级根节点数量: {len(tree_chapters)}")
    print(f" 处理耗时: {elapsed_seconds:.3f}")

    # Flat listing, indented by chapter level.
    print("\n📋 扁平目录 (flat_chapters):")
    print(section_rule)
    for entry in flat_chapters:
        depth = entry.get("level", 1)
        padding = " " * (depth - 1)
        print(f"{padding}[{entry.get('chapterId')}] {entry.get('title')} (L{depth}, 第{entry.get('startPage')}页)")

    # Nested listing.
    print("\n🌳 层级目录 (hierarchical_chapters):")
    print(section_rule)
    print_catalog_tree(tree_chapters)

    # Persist the full result next to the PDF.
    output_file = os.path.splitext(pdf_path)[0] + "_catalog_test.json"
    with open(output_file, 'w', encoding='utf-8') as out:
        json.dump(result, out, ensure_ascii=False, indent=2)
    print(f"\n✅ JSON结果已保存到: {output_file}")

    print("\n" + heavy_rule)
    print(f"⏰ 完成时间: {datetime.now().strftime(timestamp_format)}")
    print(heavy_rule)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()