Files
paper-embedding/vector_stores.py
2026-01-11 16:09:16 +08:00

65 lines
1.7 KiB
Python

"""
向量存储模块
提供向量数据库的统一接口,当前支持 Chroma
"""
from abc import ABC, abstractmethod
from pathlib import Path
from typing import List
from langchain_community.vectorstores import Chroma
from embeddings import BaseEmbeddings
class BaseVectorStore(ABC):
    """Abstract base class for vector-store backends.

    Subclasses must implement :meth:`persist`; the class itself cannot be
    instantiated because ``persist`` is declared abstract.
    """

    @abstractmethod
    def persist(self, texts: List[str], metadatas: List[dict]) -> None:
        """Persist texts into the vector database.

        Args:
            texts: List of texts to store.
            metadatas: List of metadata dicts.
        """
class ChromaVectorStore(BaseVectorStore):
    """Chroma-backed implementation of the vector-store interface."""

    def __init__(self, persist_directory: Path, embeddings_provider: BaseEmbeddings):
        """Initialize the Chroma vector store.

        Args:
            persist_directory: Directory used for persistence; coerced to a
                ``Path``.
            embeddings_provider: Embedding-model provider; its
                ``get_embeddings()`` result is passed to Chroma as the
                embedding.
        """
        self.persist_directory = Path(persist_directory)
        self.embeddings_provider = embeddings_provider

    def persist(self, texts: List[str], metadatas: List[dict]) -> None:
        """Vectorize the texts and persist them to Chroma.

        When ``texts`` is empty nothing is written: the method prints a
        warning and returns without creating the directory or requesting the
        embeddings.

        Args:
            texts: List of texts to store.
            metadatas: List of metadata dicts, passed to Chroma alongside
                ``texts``.
        """
        # Nothing to embed: skip before touching the filesystem or the
        # embedding provider instead of handing the backend an empty batch.
        if not texts:
            print("[WARN] No texts to persist; skipping Chroma write.")
            return

        self.persist_directory.mkdir(parents=True, exist_ok=True)
        embeddings = self.embeddings_provider.get_embeddings()

        print(f"[INFO] Writing {len(texts)} papers to Chroma: {self.persist_directory}")
        # The returned store object is not needed here, so it is not bound.
        Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=embeddings,
            persist_directory=str(self.persist_directory),
        )
        print("[OK] Chroma persistence complete.")