#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF catalog (outline) extraction test script.

Mirrors the logic of the Java method FileServiceImpl.extractDirectoryFromPDF
so that the extraction can be tested and verified from the command line.

Usage:
    python test_catalog_extract.py PDF_PATH

Dependencies:
    pip install pypdf
"""

import json
import os
import sys
from datetime import datetime


# --- Logging helpers imitating the Java log output ---------------------------

def _log(level, msg):
    """Print *msg* prefixed with an HH:MM:SS timestamp, *level* and the module tag."""
    print(f"[{datetime.now().strftime('%H:%M:%S')}] {level} [目录摘取] {msg}")


def log_info(msg):
    """Print an INFO-level message."""
    _log("INFO", msg)


def log_debug(msg):
    """Print a DEBUG-level message."""
    _log("DEBUG", msg)


def log_warn(msg):
    """Print a WARN-level message."""
    _log("WARN", msg)


def log_error(msg):
    """Print an ERROR-level message."""
    _log("ERROR", msg)


def get_page_number_from_outline_item(reader, item):
    """
    Corresponds to Java: getPageNumberFromOutlineItem(PDOutlineItem item)

    Try to resolve the 1-based page number an outline item points to.

    Args:
        reader: object exposing ``get_destination_page_number(item)``, which
            is expected to return a 0-based page index.
        item: outline entry; it is only resolved when it has a truthy
            ``page`` attribute.

    Returns:
        The 1-based page number, or -1 when it cannot be determined.
    """
    try:
        if getattr(item, 'page', None):
            page_num = reader.get_destination_page_number(item)
            # A missing (None) or negative index means "not found"; report -1
            # instead of turning it into page 0 or raising on ``None + 1``.
            if page_num is not None and page_num >= 0:
                return page_num + 1  # convert to 1-based
    except Exception as e:
        # Best effort: an unresolved destination is not fatal.
        log_debug(f"无法获取目录项页码: {e}")
    return -1


def extract_outline_items(reader, outline_items, chapters, level=1, indent=""):
    """
    Corresponds to Java: extractOutlineItems(PDOutlineItem item, String indent, List chapters, int level)

    Recursively flatten PDF outline entries into *chapters*.

    A nested list inside *outline_items* is treated as the children of the
    surrounding level and is processed with ``level + 1``.

    Args:
        reader: object exposing ``get_destination_page_number(item)``.
        outline_items: list of outline entries and/or nested lists of entries.
        chapters: output list; one dict per entry with a non-blank title is
            appended in traversal order.
        level: level number recorded for the entries of *outline_items*.
        indent: prefix used only for the debug log lines.
    """
    if not outline_items:
        return

    for item in outline_items:
        try:
            # A list holds sub-entries: recurse one level deeper.
            if isinstance(item, list):
                extract_outline_items(reader, item, chapters, level + 1, indent + " ")
                continue

            title = item.title if hasattr(item, 'title') else None
            if not title or not title.strip():
                # Entries without a usable title are skipped.
                continue
            title = title.strip()

            # Page fields default to page 1 and are overwritten below when
            # the destination can be resolved.
            chapter = {
                "chapterId": f"chap-{len(chapters)}",
                "title": title,
                "level": level,
                "page": "1",
                "startPage": 1,
                "endPage": 1,
                "children": []
            }

            start_page = -1
            try:
                page_num = reader.get_destination_page_number(item)
                if page_num is not None and page_num >= 0:
                    start_page = page_num + 1  # reader index is 0-based
            except Exception as e:
                log_debug(f"无法获取章节页码信息: {title} - {e}")

            if start_page != -1:
                chapter["page"] = str(start_page)
                chapter["startPage"] = start_page
                # The end page is provisionally set to the start page.
                chapter["endPage"] = start_page

            chapters.append(chapter)
            log_debug(f"{indent}• {title} (第{chapter['startPage']}页)")

        except Exception as e:
            log_warn(f"处理目录项失败: {e}")


def build_hierarchical_structure(chapters, max_depth=2):
    """
    Corresponds to Java: buildHierarchicalStructure(List chapters)

    Build a nested chapter tree from a flat, document-ordered chapter list.

    Each chapter is attached to the nearest preceding chapter whose level is
    strictly smaller; a chapter with no such predecessor becomes a root.
    This works for 0-based and 1-based level numbering alike and tolerates
    skipped levels.

    Args:
        chapters: list of chapter dicts carrying a ``level`` key (a missing
            key counts as 1, negative values as 0). The dicts are modified in
            place: their ``children`` lists are filled and then depth-limited.
        max_depth: maximum number of tree levels to keep, counting roots as
            depth 1.

    Returns:
        The list of root chapter dicts (empty when *chapters* is empty/None).
    """
    root_array = []
    if not chapters:
        return root_array

    # Stack of (level, chapter) for the chain of currently open ancestors.
    ancestors = []

    for chapter in chapters:
        level = max(0, chapter.get("level", 1))
        if chapter.get("children") is None:
            chapter["children"] = []

        # Close every open chapter that is not shallower than this one.
        while ancestors and ancestors[-1][0] >= level:
            ancestors.pop()

        if ancestors:
            ancestors[-1][1]["children"].append(chapter)
        else:
            root_array.append(chapter)

        ancestors.append((level, chapter))

    # Limit the depth of the resulting tree.
    limit_directory_depth(root_array, max_depth)

    return root_array


def limit_directory_depth(chapters, max_depth, current_depth=1):
    """
    Truncate the chapter tree in place so it has at most *max_depth* levels.

    Args:
        chapters: chapter dicts located at depth *current_depth*.
        max_depth: deepest level allowed to remain in the tree.
        current_depth: depth of *chapters*; roots are depth 1.
    """
    if current_depth >= max_depth:
        # Deepest permitted level reached: drop everything below it.
        for chapter in chapters:
            chapter["children"] = []
    else:
        for chapter in chapters:
            if chapter.get("children"):
                limit_directory_depth(chapter["children"], max_depth, current_depth + 1)


def extract_directory_from_pdf(file_path, file_id="test-file", file_name=None):
    """
    Corresponds to Java: extractDirectoryFromPDF(String filePath, String fileId, String fileName)

    Extract the catalog structure from a PDF file.

    The PDF outline is used when it yields at least one chapter; otherwise a
    single chapter named after the file and covering all pages is returned.

    Args:
        file_path: path of the PDF file to read.
        file_id: not used by this function.
        file_name: display name; defaults to the base name of *file_path*.

    Returns:
        A dict with the keys ``source``, ``total_pages``, ``flat_chapters``
        and ``hierarchical_chapters``, or None when neither pypdf nor PyPDF2
        can be imported or when processing the PDF raises.
    """
    try:
        from pypdf import PdfReader
    except ImportError:
        try:
            from PyPDF2 import PdfReader
        except ImportError:
            log_error("请先安装 pypdf: pip install pypdf")
            return None

    if file_name is None:
        file_name = os.path.basename(file_path)

    log_info(f"开始处理PDF文件: {file_name}")

    try:
        chapters = []

        # Load the PDF document.
        reader = PdfReader(file_path)
        total_pages = len(reader.pages)
        log_info(f"PDF总页数: {total_pages}")

        # Try to extract the catalog from the PDF outline.
        outline = reader.outline
        if outline:
            log_info("发现PDF大纲,开始提取...")
            extract_outline_items(reader, outline, chapters, level=1, indent="")
            log_info(f"从大纲提取到 {len(chapters)} 个章节")

            if chapters:
                # Build the nested structure. The tree reuses the dicts held
                # in ``chapters``, so the flat entries also carry ``children``.
                hierarchical_array = build_hierarchical_structure(chapters)
                log_info(f"成功构建层级目录结构,根节点数: {len(hierarchical_array)}")

                return {
                    "source": "pdf_outline",
                    "total_pages": total_pages,
                    "flat_chapters": chapters,
                    "hierarchical_chapters": hierarchical_array
                }

        # Fallback: use the file name as a single chapter.
        title = file_name
        if title and title.lower().endswith(".pdf"):
            title = title[:-4]
        if not title or not title.strip():
            title = "文档内容"

        directory_array = [{
            "chapterId": "chap-0",
            "title": title,
            "level": 1,
            "page": "1",
            "startPage": 1,
            "endPage": total_pages,
            "children": []
        }]

        log_info(f"使用文件名作为默认章节: '{title}', 页数范围: 1-{total_pages}")

        return {
            "source": "filename_fallback",
            "total_pages": total_pages,
            "flat_chapters": directory_array,
            "hierarchical_chapters": directory_array
        }

    except Exception as e:
        # Top-level boundary of the script: report the failure and signal it
        # to the caller with None.
        log_error(f"处理PDF失败: {e}")
        import traceback
        traceback.print_exc()
        return None


def print_catalog_tree(chapters, indent=0):
    """
    Print the chapter tree, one line per chapter, children indented.

    Args:
        chapters: chapter dicts at the current tree level.
        indent: nesting depth of *chapters*; 0 for the roots.
    """
    for chapter in chapters:
        prefix = " " * indent + ("├─ " if indent > 0 else "")
        title = chapter.get("title", "未命名")
        page = chapter.get("startPage", "?")
        level = chapter.get("level", 1)
        print(f"{prefix}[L{level}] {title} (第{page}页)")

        children = chapter.get("children", [])
        if children:
            print_catalog_tree(children, indent + 1)


def main():
    """
    Command-line entry point.

    Reads the PDF path from ``sys.argv[1]``, extracts the catalog, prints the
    flat and nested views, and writes the result as JSON next to the PDF
    (same path with the extension replaced by ``_catalog_test.json``).
    Exits with status 1 on a missing argument, a missing file, or a failed
    extraction.
    """
    print("=" * 70)
    print("PDF目录摘取测试脚本 (与Java extractDirectoryFromPDF 逻辑一致)")
    print("=" * 70)

    if len(sys.argv) < 2:
        print("\n用法:")
        print(" python test_catalog_extract.py PDF_PATH")
        print("\n示例:")
        print(" python test_catalog_extract.py ./document.pdf")
        print(" python test_catalog_extract.py /path/to/技术方案.pdf")
        print("\n依赖:")
        print(" pip install pypdf")
        print("=" * 70)
        sys.exit(1)

    pdf_path = sys.argv[1]

    # Make sure the file exists before doing any work.
    if not os.path.exists(pdf_path):
        log_error(f"文件不存在: {pdf_path}")
        sys.exit(1)

    print(f"\n📄 PDF文件: {pdf_path}")
    print(f"⏰ 开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("-" * 70)

    start_time = datetime.now()

    # Extract the catalog.
    result = extract_directory_from_pdf(pdf_path)

    duration = (datetime.now() - start_time).total_seconds()

    if result is None:
        log_error("目录提取失败")
        sys.exit(1)

    print("-" * 70)
    print("\n📊 提取结果:")
    print(f" 来源: {result.get('source', 'unknown')}")
    print(f" 总页数: {result.get('total_pages', '?')}")
    print(f" 扁平目录项数量: {len(result.get('flat_chapters', []))}")
    print(f" 层级根节点数量: {len(result.get('hierarchical_chapters', []))}")
    print(f" 处理耗时: {duration:.3f}秒")

    # Print the flat catalog.
    print("\n📋 扁平目录 (flat_chapters):")
    print("-" * 40)
    for chapter in result.get("flat_chapters", []):
        level = chapter.get("level", 1)
        indent = " " * (level - 1)
        print(f"{indent}[{chapter.get('chapterId')}] {chapter.get('title')} (L{level}, 第{chapter.get('startPage')}页)")

    # Print the nested catalog.
    print("\n🌳 层级目录 (hierarchical_chapters):")
    print("-" * 40)
    print_catalog_tree(result.get("hierarchical_chapters", []))

    # Save the result as JSON.
    output_file = os.path.splitext(pdf_path)[0] + "_catalog_test.json"
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    print(f"\n✅ JSON结果已保存到: {output_file}")

    print("\n" + "=" * 70)
    print(f"⏰ 完成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("=" * 70)


if __name__ == "__main__":
    main()