# Home_assistant/backend/app/rag/ingest.py

from __future__ import annotations

import hashlib
from datetime import datetime, timezone
from pathlib import Path
from typing import Any

from qdrant_client.http import models as qm
from sqlalchemy import select
from sqlalchemy.orm import Session

from app.config import get_settings
from app.db.models import ChatSession, Document, DocumentChunk
from app.rag import embeddings
from app.rag.chunker import chunk_text
from app.rag.store import (
    COLLECTION_DOC_CHUNKS,
    COLLECTION_FACTS,
    COLLECTION_SUMMARIES,
    delete_by_filter,
    upsert_points,
)


async def index_memory_fact(
    *,
    fact_id: int,
    user_id: int,
    content: str,
    category: str,
    importance: int,
    active: bool = True,
) -> None:
    """Embed a single memory fact and upsert it into the facts collection.

    Nothing is written when RAG is disabled in settings, when ``active`` is
    false, or when the embedder returns no vectors.

    Args:
        fact_id: Identifier of the fact; used (as ``int``) for the point id
            and stored in the payload.
        user_id: Stored in the payload as ``user_id``.
        content: Text that is embedded and stored in the payload.
        category: Stored in the payload as ``category``.
        importance: Stored in the payload as ``importance``.
        active: When false the fact is not indexed.
    """
    if not (get_settings().rag_enabled and active):
        return

    embedded = await embeddings.embed_texts([content])
    if not embedded:
        return

    fact_payload = {
        "user_id": user_id,
        "fact_id": fact_id,
        "category": category,
        "content": content,
        "importance": importance,
    }
    fact_point = qm.PointStruct(
        id=int(fact_id),
        vector=embedded[0],
        payload=fact_payload,
    )
    upsert_points(COLLECTION_FACTS, [fact_point])


async def deactivate_memory_fact(fact_id: int) -> None:
    """Request deletion of facts-collection points matching ``fact_id``.

    Calls ``delete_by_filter`` with a single condition on the ``fact_id``
    payload key. Does nothing when RAG is disabled in settings.
    """
    if get_settings().rag_enabled:
        fact_condition = qm.FieldCondition(
            key="fact_id",
            match=qm.MatchValue(value=fact_id),
        )
        delete_by_filter(COLLECTION_FACTS, [fact_condition])


async def index_session_summary(session_id: int, summary: str) -> None:
    """Embed a chat-session summary and upsert it into the summaries collection.

    Nothing is written when RAG is disabled in settings, when ``summary`` is
    blank, or when the embedder returns no vectors.

    Args:
        session_id: Identifier of the chat session; used (as ``int``) for the
            point id and stored in the payload.
        summary: Summary text. The full text is embedded; the payload keeps
            only its first 4000 characters.
    """
    if not (get_settings().rag_enabled and summary.strip()):
        return

    # Imported inside the function, as in the original code.
    from app.db.base import SessionLocal

    # Look up the owner of the session; user id 1 is used when the session
    # row is not found.
    db = SessionLocal()
    try:
        chat = db.get(ChatSession, session_id)
        owner_id = chat.user_id if chat else 1
    finally:
        db.close()

    embedded = await embeddings.embed_texts([summary])
    if not embedded:
        return

    summary_payload = {
        "user_id": owner_id,
        "session_id": session_id,
        "summary": summary[:4000],
    }
    summary_point = qm.PointStruct(
        id=int(session_id),
        vector=embedded[0],
        payload=summary_payload,
    )
    upsert_points(COLLECTION_SUMMARIES, [summary_point])


async def ingest_document_file(
    db: Session,
    *,
    user_id: int,
    title: str,
    filename: str,
    raw_bytes: bytes,
) -> dict[str, Any]:
    """Persist an uploaded document with its chunks and index them for RAG.

    The bytes are decoded as UTF-8 (undecodable bytes are replaced), a
    ``Document`` row plus one ``DocumentChunk`` row per chunk are committed,
    and, when RAG is enabled, the chunks are embedded and upserted into the
    document-chunks collection with the chunk's database id as point id.

    Args:
        db: SQLAlchemy session; this function flushes and commits on it.
        user_id: Stored on the document and in every point payload.
        title: Document title; ``filename`` is used when it is empty.
        filename: File name stored on the document.
        raw_bytes: Raw file content; also hashed (SHA-256) and measured.

    Returns:
        A dict with ``id``, ``title``, ``filename``, ``chunk_count``,
        ``size_bytes`` and ``created_at`` (ISO-8601 string or ``None``).

    Raises:
        ValueError: If the decoded text is empty after stripping.
    """
    settings = get_settings()
    text = raw_bytes.decode("utf-8", errors="replace").strip()
    if not text:
        raise ValueError("Пустой документ")

    doc = Document(
        user_id=user_id,
        title=title or filename,
        filename=filename,
        content_hash=hashlib.sha256(raw_bytes).hexdigest(),
        size_bytes=len(raw_bytes),
    )
    db.add(doc)
    # Flush before building the chunk rows, which reference doc.id.
    db.flush()

    chunks = chunk_text(text)
    chunk_rows: list[DocumentChunk] = [
        DocumentChunk(document_id=doc.id, chunk_index=idx, content=piece)
        for idx, piece in enumerate(chunks)
    ]
    db.add_all(chunk_rows)
    db.commit()
    db.refresh(doc)

    if settings.rag_enabled and chunks:
        vectors = await embeddings.embed_texts(chunks)
        points: list[qm.PointStruct] = []
        # zip() stops at the shorter sequence: chunks for which the embedder
        # returned no vector stay in the database but are not indexed.
        for row, vector in zip(chunk_rows, vectors, strict=False):
            # Reload the committed row before reading its id and content.
            db.refresh(row)
            points.append(
                qm.PointStruct(
                    id=int(row.id),
                    vector=vector,
                    payload={
                        "user_id": user_id,
                        "document_id": doc.id,
                        "chunk_id": row.id,
                        "chunk_index": row.chunk_index,
                        "title": doc.title,
                        "content": row.content,
                    },
                )
            )
        # Skip the upsert when the embedder produced nothing, mirroring the
        # "no vectors -> no write" guard used by the other indexers here.
        if points:
            upsert_points(COLLECTION_DOC_CHUNKS, points)

    return {
        "id": doc.id,
        "title": doc.title,
        "filename": doc.filename,
        "chunk_count": len(chunks),
        "size_bytes": doc.size_bytes,
        "created_at": doc.created_at.isoformat() if doc.created_at else None,
    }