I第一次提交

2025-12-13 23:00:09 +08:00
commit ac08a0b6ff
180 changed files with 28023 additions and 0 deletions
--- a/test_catalog_extract.py
+++ b/test_catalog_extract.py
@@ -0,0 +1,336 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+PDF目录摘取测试脚本 (与Java代码逻辑一致)
+
+模拟 FileServiceImpl.extractDirectoryFromPDF 的逻辑，用于测试验证。
+
+使用方法:
+    python test_catalog_extract.py <pdf文件路径>
+
+依赖安装:
+    pip install pypdf
+"""
+
+import sys
+import json
+import os
+from datetime import datetime
+
+# Mimic the log line layout of the Java service.
+def _log(level, msg):
+    """Print ``msg`` to stdout with an HH:MM:SS timestamp and a level tag."""
+    timestamp = datetime.now().strftime('%H:%M:%S')
+    # The tag is left-justified to 5 characters so INFO/WARN line up with DEBUG/ERROR.
+    print(f"[{timestamp}] {level:<5} [目录摘取] {msg}")
+
+
+def log_info(msg):
+    """Print ``msg`` as an INFO line."""
+    _log("INFO", msg)
+
+
+def log_debug(msg):
+    """Print ``msg`` as a DEBUG line."""
+    _log("DEBUG", msg)
+
+
+def log_warn(msg):
+    """Print ``msg`` as a WARN line."""
+    _log("WARN", msg)
+
+
+def log_error(msg):
+    """Print ``msg`` as an ERROR line."""
+    _log("ERROR", msg)
+
+
+def get_page_number_from_outline_item(reader, item):
+    """
+    Return the 1-based page number an outline item points to, or -1.
+
+    Counterpart of the Java method getPageNumberFromOutlineItem(PDOutlineItem item).
+
+    Args:
+        reader: Object exposing ``get_destination_page_number(item)``.
+        item: Outline entry; it is only resolved when it has a truthy
+            ``page`` attribute.
+
+    Returns:
+        The resolved page index converted to 1-based, or -1 when the item
+        has no page, the lookup raises, or the lookup yields ``None`` or a
+        negative index.
+    """
+    try:
+        if getattr(item, 'page', None):
+            page_num = reader.get_destination_page_number(item)
+            # "Not found" may be reported as None or as a negative index;
+            # neither may leak out as a page number.
+            if page_num is not None and page_num >= 0:
+                return page_num + 1  # convert to 1-based
+    except Exception:
+        # Best-effort lookup: any library failure means "page unknown".
+        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
+        pass
+    return -1
+
+
+def extract_outline_items(reader, outline_items, chapters, level=1, indent=""):
+    """
+    Recursively flatten a PDF outline into ``chapters``.
+
+    Counterpart of the Java method
+    extractOutlineItems(PDOutlineItem item, String indent, List<JSONObject> chapters, int level).
+
+    A nested list inside ``outline_items`` is walked one level deeper (with a
+    wider ``indent``). Every other entry that has a non-blank ``title`` is
+    appended to ``chapters`` as a chapter dict; entries without one are
+    skipped. Any error while handling an entry is logged as a warning and
+    the walk continues. Nothing is returned.
+    """
+    if not outline_items:
+        return
+
+    for entry in outline_items:
+        try:
+            # A list holds the children of the preceding entry.
+            if isinstance(entry, list):
+                extract_outline_items(reader, entry, chapters, level + 1, indent + "  ")
+                continue
+
+            raw_title = getattr(entry, 'title', None)
+            if not raw_title or not raw_title.strip():
+                continue
+            heading = raw_title.strip()
+
+            # The id is the position the chapter will take in the flat list.
+            chapter_id = f"chap-{len(chapters)}"
+
+            # Page 1 is the default whenever the destination cannot be resolved.
+            first_page = 1
+            try:
+                zero_based = reader.get_destination_page_number(entry)
+                if zero_based is not None and zero_based >= 0:
+                    first_page = zero_based + 1  # reader pages are 0-based
+            except Exception as e:
+                log_debug(f"无法获取章节页码信息: {heading} - {e}")
+
+            chapter = {
+                "chapterId": chapter_id,
+                "title": heading,
+                "level": level,
+                "page": str(first_page),
+                "startPage": first_page,
+                # The end page is provisionally the same as the start page.
+                "endPage": first_page,
+                "children": [],
+            }
+            chapters.append(chapter)
+            log_debug(f"{indent}• {heading} (第{chapter['startPage']}页)")
+
+        except Exception as e:
+            log_warn(f"处理目录项失败: {e}")
+
+
+def build_hierarchical_structure(chapters, max_depth=2):
+    """
+    Nest a flat, document-ordered chapter list into a tree.
+
+    Counterpart of the Java method buildHierarchicalStructure(List<JSONObject> chapters).
+
+    Each chapter becomes a child of the closest preceding chapter whose
+    ``level`` is strictly smaller; a chapter with no such predecessor is a
+    root. Only the relative order of levels matters, so 0-based and 1-based
+    levels both work and skipped levels attach to the nearest ancestor.
+
+    Args:
+        chapters: List of chapter dicts. ``level`` defaults to 1 and negative
+            values are treated as 0. The dicts are mutated in place: a missing
+            or ``None`` ``children`` entry is replaced by a list, and children
+            are appended to it.
+        max_depth: Maximum number of tree levels kept, roots counting as 1.
+
+    Returns:
+        The list of root chapters (empty when ``chapters`` is empty or None).
+    """
+    root_array = []
+    if not chapters:
+        return root_array
+
+    # Stack of (level, chapter) for the currently open ancestors, outermost first.
+    open_ancestors = []
+
+    for chapter in chapters:
+        level = max(0, chapter.get("level", 1))
+        if chapter.get("children") is None:
+            chapter["children"] = []
+
+        # Siblings and deeper chapters cannot be the parent of this one:
+        # drop them until a strictly shallower ancestor is on top.
+        while open_ancestors and open_ancestors[-1][0] >= level:
+            open_ancestors.pop()
+
+        if open_ancestors:
+            open_ancestors[-1][1]["children"].append(chapter)
+        else:
+            root_array.append(chapter)
+        open_ancestors.append((level, chapter))
+
+    # Cap the directory depth.
+    limit_directory_depth(root_array, max_depth)
+
+    return root_array
+
+
+def limit_directory_depth(chapters, max_depth, current_depth=1):
+    """
+    Truncate a chapter tree in place to at most ``max_depth`` levels.
+
+    ``chapters`` is the list of nodes sitting at ``current_depth``. Once that
+    depth reaches ``max_depth`` the ``children`` of every node in the list is
+    replaced by an empty list; otherwise the function recurses into each
+    non-empty ``children`` list.
+    """
+    if current_depth >= max_depth:
+        for chapter in chapters:
+            chapter["children"] = []
+        return
+
+    for chapter in chapters:
+        if chapter.get("children"):
+            limit_directory_depth(chapter["children"], max_depth, current_depth + 1)
+
+
+def extract_directory_from_pdf(file_path, file_id="test-file", file_name=None):
+    """
+    Extract the chapter directory of a PDF file.
+
+    Counterpart of the Java method
+    extractDirectoryFromPDF(String filePath, String fileId, String fileName).
+
+    The PDF outline is used when it yields at least one chapter; otherwise a
+    single chapter named after the file and spanning all pages is produced.
+
+    Args:
+        file_path: Path of the PDF to read.
+        file_id: Not used by this function.
+        file_name: Display name; defaults to the base name of ``file_path``.
+
+    Returns:
+        A dict with the keys ``source``, ``total_pages``, ``flat_chapters``
+        and ``hierarchical_chapters``, or None when neither pypdf nor PyPDF2
+        can be imported or when processing raises.
+    """
+    # Prefer pypdf and fall back to PyPDF2.
+    try:
+        from pypdf import PdfReader
+    except ImportError:
+        try:
+            from PyPDF2 import PdfReader
+        except ImportError:
+            log_error("请先安装 pypdf: pip install pypdf")
+            return None
+
+    if file_name is None:
+        file_name = os.path.basename(file_path)
+
+    log_info(f"开始处理PDF文件: {file_name}")
+
+    try:
+        # Load the document.
+        reader = PdfReader(file_path)
+        total_pages = len(reader.pages)
+        log_info(f"PDF总页数: {total_pages}")
+
+        # First choice: the outline embedded in the PDF.
+        chapters = []
+        outline = reader.outline
+        if outline:
+            log_info("发现PDF大纲，开始提取...")
+            extract_outline_items(reader, outline, chapters, level=1, indent="")
+            log_info(f"从大纲提取到 {len(chapters)} 个章节")
+
+        if chapters:
+            hierarchical_array = build_hierarchical_structure(chapters)
+            log_info(f"成功构建层级目录结构，根节点数: {len(hierarchical_array)}")
+            result = {"source": "pdf_outline", "total_pages": total_pages}
+            result["flat_chapters"] = chapters
+            result["hierarchical_chapters"] = hierarchical_array
+            return result
+
+        # Fallback: one chapter named after the file, minus a ".pdf" suffix.
+        title = file_name
+        if title and title.lower().endswith(".pdf"):
+            title = title[:-4]
+        if not (title and title.strip()):
+            title = "文档内容"
+
+        only_chapter = {
+            "chapterId": "chap-0",
+            "title": title,
+            "level": 1,
+            "page": "1",
+            "startPage": 1,
+            "endPage": total_pages,
+            "children": [],
+        }
+        directory_array = [only_chapter]
+
+        log_info(f"使用文件名作为默认章节: '{title}', 页数范围: 1-{total_pages}")
+
+        result = {"source": "filename_fallback", "total_pages": total_pages}
+        result["flat_chapters"] = directory_array
+        result["hierarchical_chapters"] = directory_array
+        return result
+
+    except Exception as e:
+        log_error(f"处理PDF失败: {e}")
+        import traceback
+        traceback.print_exc()
+        return None
+
+
+def print_catalog_tree(chapters, indent=0):
+    """
+    Print a chapter tree to stdout, one line per chapter.
+
+    Each nesting step adds two spaces of indentation, and every non-root
+    line gets a branch marker. Missing ``title``, ``startPage`` and ``level``
+    entries are shown as placeholders.
+    """
+    # The lead-in only depends on the depth, so compute it once per call.
+    branch = "├─ " if indent > 0 else ""
+    lead = "  " * indent + branch
+
+    for node in chapters:
+        heading = node.get("title", "未命名")
+        first_page = node.get("startPage", "?")
+        depth_tag = node.get("level", 1)
+        print(f"{lead}[L{depth_tag}] {heading} (第{first_page}页)")
+
+        kids = node.get("children", [])
+        if kids:
+            print_catalog_tree(kids, indent + 1)
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Reads the PDF path from ``sys.argv[1]``, extracts its directory, prints a
+    summary plus the flat and hierarchical views, and saves the result as
+    ``<pdf path without extension>_catalog_test.json``. Exits with status 1
+    when the argument is missing, the path does not exist, or extraction
+    returns None.
+    """
+    heavy_rule = "=" * 70
+    light_rule = "-" * 70
+    short_rule = "-" * 40
+    stamp_format = '%Y-%m-%d %H:%M:%S'
+
+    print(heavy_rule)
+    print("PDF目录摘取测试脚本 (与Java extractDirectoryFromPDF 逻辑一致)")
+    print(heavy_rule)
+
+    if len(sys.argv) < 2:
+        usage_lines = (
+            "\n用法:",
+            "  python test_catalog_extract.py <pdf文件路径>",
+            "\n示例:",
+            "  python test_catalog_extract.py ./document.pdf",
+            "  python test_catalog_extract.py /path/to/技术方案.pdf",
+            "\n依赖:",
+            "  pip install pypdf",
+            heavy_rule,
+        )
+        for usage_line in usage_lines:
+            print(usage_line)
+        sys.exit(1)
+
+    pdf_path = sys.argv[1]
+
+    # Bail out early when the path does not exist.
+    if not os.path.exists(pdf_path):
+        log_error(f"文件不存在: {pdf_path}")
+        sys.exit(1)
+
+    print(f"\n📄 PDF文件: {pdf_path}")
+    print(f"⏰ 开始时间: {datetime.now().strftime(stamp_format)}")
+    print(light_rule)
+
+    # Time only the extraction itself.
+    start_time = datetime.now()
+    result = extract_directory_from_pdf(pdf_path)
+    duration = (datetime.now() - start_time).total_seconds()
+
+    if result is None:
+        log_error("目录提取失败")
+        sys.exit(1)
+
+    print(light_rule)
+    print("\n📊 提取结果:")
+    print(f"  来源: {result.get('source', 'unknown')}")
+    print(f"  总页数: {result.get('total_pages', '?')}")
+    print(f"  扁平目录项数量: {len(result.get('flat_chapters', []))}")
+    print(f"  层级根节点数量: {len(result.get('hierarchical_chapters', []))}")
+    print(f"  处理耗时: {duration:.3f}秒")
+
+    # Flat view: indentation reflects each chapter's level.
+    print("\n📋 扁平目录 (flat_chapters):")
+    print(short_rule)
+    for chapter in result.get("flat_chapters", []):
+        level = chapter.get("level", 1)
+        pad = "  " * (level - 1)
+        print(f"{pad}[{chapter.get('chapterId')}] {chapter.get('title')} (L{level}, 第{chapter.get('startPage')}页)")
+
+    # Tree view.
+    print("\n🌳 层级目录 (hierarchical_chapters):")
+    print(short_rule)
+    print_catalog_tree(result.get("hierarchical_chapters", []))
+
+    # Save the result as JSON next to the PDF.
+    output_file = os.path.splitext(pdf_path)[0] + "_catalog_test.json"
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+    print(f"\n✅ JSON结果已保存到: {output_file}")
+
+    print("\n" + heavy_rule)
+    print(f"⏰ 完成时间: {datetime.now().strftime(stamp_format)}")
+    print(heavy_rule)
+
+
+if __name__ == "__main__":
+    main()