161 lines
4.4 KiB
Python
161 lines
4.4 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
from qdrant_client.http import models as qm
|
|
from sqlalchemy import select
|
|
from sqlalchemy.orm import Session
|
|
|
|
from app.config import get_settings
|
|
from app.db.models import ChatSession, Document, DocumentChunk
|
|
from app.rag import embeddings
|
|
from app.rag.chunker import chunk_text
|
|
from app.rag.store import (
|
|
COLLECTION_DOC_CHUNKS,
|
|
COLLECTION_FACTS,
|
|
COLLECTION_SUMMARIES,
|
|
delete_by_filter,
|
|
upsert_points,
|
|
)
|
|
|
|
|
|
async def index_memory_fact(
    *,
    fact_id: int,
    user_id: int,
    content: str,
    category: str,
    importance: int,
    active: bool = True,
) -> None:
    """Embed one memory fact and upsert it into ``COLLECTION_FACTS``.

    The call is a no-op when RAG is disabled in settings, when ``active`` is
    false, or when the embedding call yields no vectors.

    Args:
        fact_id: Identifier of the fact; used (as ``int``) for the point id
            and stored in the payload.
        user_id: Stored in the payload under ``"user_id"``.
        content: Text that is embedded and stored in the payload.
        category: Stored in the payload under ``"category"``.
        importance: Stored in the payload under ``"importance"``.
        active: When false, nothing is indexed.
    """
    if not (get_settings().rag_enabled and active):
        return

    embedded = await embeddings.embed_texts([content])
    if not embedded:
        return

    fact_payload = {
        "user_id": user_id,
        "fact_id": fact_id,
        "category": category,
        "content": content,
        "importance": importance,
    }
    # A single text was submitted, so only the first vector is used.
    fact_point = qm.PointStruct(
        id=int(fact_id),
        vector=embedded[0],
        payload=fact_payload,
    )
    upsert_points(COLLECTION_FACTS, [fact_point])
|
|
|
|
|
|
async def deactivate_memory_fact(fact_id: int) -> None:
    """Call ``delete_by_filter`` on ``COLLECTION_FACTS`` for one fact.

    The filter is a single condition matching the payload key ``"fact_id"``
    against ``fact_id``. Nothing happens when RAG is disabled in settings.
    """
    if get_settings().rag_enabled:
        matches_fact = qm.FieldCondition(
            key="fact_id",
            match=qm.MatchValue(value=fact_id),
        )
        delete_by_filter(COLLECTION_FACTS, [matches_fact])
|
|
|
|
|
|
async def index_session_summary(session_id: int, summary: str) -> None:
    """Embed a chat-session summary and upsert it into ``COLLECTION_SUMMARIES``.

    The call is a no-op when RAG is disabled, when ``summary`` is blank after
    stripping, or when the embedding call yields no vectors.

    Args:
        session_id: Identifier of the chat session; used (as ``int``) for the
            point id and stored in the payload.
        summary: Summary text. The full text is embedded; the payload keeps
            only the first 4000 characters.
    """
    if not get_settings().rag_enabled:
        return
    if not summary.strip():
        return

    # Imported here rather than at module level, as in the original code.
    from app.db.base import SessionLocal

    db = SessionLocal()
    try:
        chat_session = db.get(ChatSession, session_id)
        # NOTE(review): an unknown session falls back to user id 1 —
        # confirm this default is intended.
        owner_id = chat_session.user_id if chat_session else 1
    finally:
        db.close()

    embedded = await embeddings.embed_texts([summary])
    if not embedded:
        return

    summary_payload = {
        "user_id": owner_id,
        "session_id": session_id,
        "summary": summary[:4000],
    }
    summary_point = qm.PointStruct(
        id=int(session_id),
        vector=embedded[0],
        payload=summary_payload,
    )
    upsert_points(COLLECTION_SUMMARIES, [summary_point])
|
|
|
|
|
|
async def ingest_document_file(
    db: Session,
    *,
    user_id: int,
    title: str,
    filename: str,
    raw_bytes: bytes,
) -> dict[str, Any]:
    """Store an uploaded document, split it into chunks and index the chunks.

    The bytes are decoded as UTF-8 (undecodable bytes are replaced) and
    stripped. A ``Document`` row and one ``DocumentChunk`` row per piece
    returned by ``chunk_text`` are committed. When RAG is enabled and there
    is at least one chunk, the chunks are embedded and upserted into
    ``COLLECTION_DOC_CHUNKS``.

    Args:
        db: SQLAlchemy session used for all inserts; committed by this call.
        user_id: Owner stored on the document and in every point payload.
        title: Document title; ``filename`` is used when it is empty.
        filename: Original file name stored on the document.
        raw_bytes: File content; also the input of the SHA-256 content hash
            and the source of ``size_bytes``.

    Returns:
        A dict with ``id``, ``title``, ``filename``, ``chunk_count``,
        ``size_bytes`` and ``created_at`` (ISO string or ``None``).

    Raises:
        ValueError: If the decoded text is empty after stripping.
    """
    settings = get_settings()
    text = raw_bytes.decode("utf-8", errors="replace").strip()
    if not text:
        raise ValueError("Пустой документ")

    digest = hashlib.sha256(raw_bytes).hexdigest()
    doc = Document(
        user_id=user_id,
        title=title or filename,
        filename=filename,
        content_hash=digest,
        size_bytes=len(raw_bytes),
    )
    db.add(doc)
    # Flush so that doc.id is available for the chunk rows below.
    db.flush()

    chunks = chunk_text(text)
    chunk_rows: list[DocumentChunk] = []
    for idx, piece in enumerate(chunks):
        row = DocumentChunk(document_id=doc.id, chunk_index=idx, content=piece)
        db.add(row)
        chunk_rows.append(row)
    db.commit()
    db.refresh(doc)

    if settings.rag_enabled and chunks:
        vectors = await embeddings.embed_texts(chunks)
        points: list[qm.PointStruct] = []
        # strict=False: rows without a matching vector are left unindexed.
        for row, vector in zip(chunk_rows, vectors, strict=False):
            db.refresh(row)
            points.append(
                qm.PointStruct(
                    id=int(row.id),
                    vector=vector,
                    payload={
                        "user_id": user_id,
                        "document_id": doc.id,
                        "chunk_id": row.id,
                        "chunk_index": row.chunk_index,
                        "title": doc.title,
                        "content": row.content,
                    },
                )
            )
        # Skip the upsert when no points were built (no vectors returned),
        # matching the empty-vector guards in the sibling indexing functions.
        if points:
            upsert_points(COLLECTION_DOC_CHUNKS, points)

    return {
        "id": doc.id,
        "title": doc.title,
        "filename": doc.filename,
        "chunk_count": len(chunks),
        "size_bytes": doc.size_bytes,
        "created_at": doc.created_at.isoformat() if doc.created_at else None,
    }
|