"""
Data parsing module.

Provides parsing of paper documents. Designed to support multiple formats
(currently only the Markdown format is implemented).
"""

import re
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List, Tuple, Union


class BaseParser(ABC):
    """Abstract base class for paper-content parsers."""

    @abstractmethod
    def parse(self, content: str) -> Tuple[List[str], List[dict]]:
        """
        Parse raw text content.

        Args:
            content: The text to parse.

        Returns:
            (texts, metadatas): a list of texts and the list of metadata
            dicts that corresponds to them.
        """


class MarkdownPaperParser(BaseParser):
    """
    Parser for papers stored in one Markdown document.

    Expected layout:
    - papers are separated by a '---' line
    - each paper carries a '## Title' heading
    - the content holds the title, the abstract and any other sections

    Both the separator and the heading format are configurable through
    regular expressions passed to the constructor.
    """

    def __init__(self, separator: str = r'\n---\s*\n', title_pattern: str = r'^##\s+(.+)$'):
        """
        Initialise the parser.

        Args:
            separator: Regular expression that splits the document into
                individual papers.
            title_pattern: Regular expression that locates a paper's title;
                its first capture group is taken as the title text.
        """
        self.separator = separator
        self.title_pattern = title_pattern

    def parse(self, content: str) -> Tuple[List[str], List[dict]]:
        """
        Parse Markdown-formatted paper content.

        The content is split on ``separator``; each chunk is stripped and
        empty chunks are ignored. A chunk is kept only if some line in it
        matches ``title_pattern`` (the first match wins); chunks without a
        match are reported and skipped.

        Args:
            content: Content of the Markdown file.

        Returns:
            (texts, metadatas): the stripped paper texts and, for each one,
            a dict with its ``title`` and ``content_length``.

        Raises:
            ValueError: If no valid paper was found.
        """
        # Compile once up front: the same pattern is applied to every chunk.
        # MULTILINE lets '^' / '$' anchor on any line inside a chunk.
        title_re = re.compile(self.title_pattern, re.MULTILINE)

        texts: List[str] = []
        metadatas: List[dict] = []

        for raw_chunk in re.split(self.separator, content):
            paper = raw_chunk.strip()
            if not paper:
                continue

            title_match = title_re.search(paper)
            if title_match is None:
                self._handle_missing_title(paper)
                continue

            texts.append(paper)
            metadatas.append({
                "title": title_match.group(1).strip(),
                "content_length": len(paper),
            })

        if not texts:
            raise ValueError("No valid papers were found in the content.")

        return texts, metadatas

    def _handle_missing_title(self, chunk: str) -> None:
        """Report a chunk that has no title, showing a one-line preview of it."""
        # Only the first 50 characters are shown, with newlines flattened.
        preview = chunk[:50].replace('\n', ' ')
        print(f"[WARN] Skipping paper without ## title: {preview}...")


class PaperFileReader:
    """Reads a paper file from disk and hands its content to a parser."""

    def __init__(self, parser: BaseParser):
        """
        Initialise the file reader.

        Args:
            parser: Parser instance used to interpret the file content.
        """
        self.parser = parser

    def read(self, file_path: Union[str, Path]) -> Tuple[List[str], List[dict]]:
        """
        Read a file and parse the papers it contains.

        Args:
            file_path: Path of the paper file, as a ``Path`` or a plain
                string.

        Returns:
            (texts, metadatas): the parser's output, with a ``source_file``
            entry (the file's base name) added to every metadata dict.

        Raises:
            FileNotFoundError: If the file does not exist.
        """
        # Normalise first so plain string paths work as well as Path objects.
        path = Path(file_path)
        if not path.exists():
            raise FileNotFoundError(f"Paper file not found: {path}")

        print(f"[INFO] Reading papers from: {path}")
        content = path.read_text(encoding="utf-8")

        texts, metadatas = self.parser.parse(content)

        # Record which file each paper came from.
        for meta in metadatas:
            meta["source_file"] = path.name

        return texts, metadatas