65 lines
1.7 KiB
Python
65 lines
1.7 KiB
Python
"""
|
|
Vector store module.

Provides a unified interface to vector databases; Chroma is currently supported.
|
|
"""
|
|
|
|
from abc import ABC, abstractmethod
|
|
from pathlib import Path
|
|
from typing import List
|
|
|
|
from langchain_community.vectorstores import Chroma
|
|
|
|
from embeddings import BaseEmbeddings
|
|
|
|
|
|
class BaseVectorStore(ABC):
    """Abstract interface that every vector store backend implements."""

    @abstractmethod
    def persist(self, texts: List[str], metadatas: List[dict]) -> None:
        """
        Persist texts to the vector database.

        Concrete backends must override this method; the base
        implementation does nothing and returns ``None``.

        Args:
            texts: List of texts.
            metadatas: List of metadata dicts.
        """
|
|
|
|
|
|
class ChromaVectorStore(BaseVectorStore):
    """Vector store backend that writes texts to a Chroma database."""

    def __init__(self, persist_directory: Path, embeddings_provider: BaseEmbeddings):
        """
        Initialise the Chroma vector store.

        Args:
            persist_directory: Directory handed to Chroma as its persistence
                location. The value is coerced with ``Path(...)``.
            embeddings_provider: Provider whose ``get_embeddings()`` is called
                at persist time to obtain the embedding model.
        """
        self.persist_directory = Path(persist_directory)
        self.embeddings_provider = embeddings_provider

    def persist(self, texts: List[str], metadatas: List[dict]) -> None:
        """
        Embed the texts and persist them to Chroma.

        An empty ``texts`` list is skipped up front: no directory is created,
        no embedding model is requested and Chroma is not called.

        Args:
            texts: List of texts.
            metadatas: List of metadata dicts passed through to Chroma
                alongside ``texts``.
        """
        # Nothing to embed or store. NOTE(review): recent chromadb releases
        # appear to reject empty batches outright -- confirm; either way
        # there is no work to do for an empty input.
        if not texts:
            print("[WARN] No texts to persist; skipping Chroma write.")
            return

        self.persist_directory.mkdir(parents=True, exist_ok=True)

        embeddings = self.embeddings_provider.get_embeddings()

        print(f"[INFO] Writing {len(texts)} papers to Chroma: {self.persist_directory}")
        # The returned store handle is not used afterwards, so it is not
        # bound to a name; the call is made for its write side effect.
        Chroma.from_texts(
            texts=texts,
            metadatas=metadatas,
            embedding=embeddings,
            persist_directory=str(self.persist_directory),
        )
        print("[OK] Chroma persistence complete.")
|