"""
Ingest documents into the FAISS vector store.

Usage:
    python ingest.py /path/to/docs_folder

The vector store is saved to ./vectorstore/.
"""

import os
import sys
from pathlib import Path
from typing import Iterable

from dotenv import load_dotenv
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    TextLoader,
    UnstructuredFileLoader,
    UnstructuredHTMLLoader,
    UnstructuredMarkdownLoader,
    UnstructuredPDFLoader,
    PyPDFLoader,
)
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

load_dotenv()

VECTORSTORE_DIR = os.path.join(os.path.dirname(__file__), "vectorstore")
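
# PDFs at or above PDF_FAST_SIZE_MB megabytes are parsed with PyPDFLoader
# (fast, text-layer extraction); smaller PDFs go through
# UnstructuredPDFLoader, which is slower but typically better at layout.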
PDF_FAST_SIZE_MB = float(os.getenv("PDF_FAST_SIZE_MB", "5"))
PDF_FAST_SIZE_BYTES = int(PDF_FAST_SIZE_MB * 1024 * 1024)

MARKDOWN_EXTS = {".md", ".markdown", ".mdx"}
TEXT_EXTS = {
    ".txt",
    ".log",
    ".csv",
    ".tsv",
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    ".cfg",
    ".rst",
    ".py",
    ".js",
    ".ts",
    ".tsx",
    ".jsx",
    ".java",
    ".go",
    ".rs",
    ".c",
    ".h",
    ".cpp",
    ".hpp",
    ".cs",
    ".rb",
    ".php",
    ".sh",
}
HTML_EXTS = {".html", ".htm", ".xml"}
UNSTRUCTURED_EXTS = {
    ".doc",
    ".docx",
    ".ppt",
    ".pptx",
    ".xls",
    ".xlsx",
    ".odt",
    ".ods",
    ".odp",
    ".rtf",
    ".epub",
}


def _iter_files(root: Path) -> Iterable[Path]:
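    """Yield `root` itself if it is a file, otherwise every file under it."""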
    if root.is_file():
        yield root
        return
    for path in root.rglob("*"):
        if path.is_file():
            yield path


def _pdf_loader_for(path: Path):
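    """Choose a PDF loader by file size (threshold: PDF_FAST_SIZE_MB)."""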
    size_bytes = path.stat().st_size
    if size_bytes >= PDF_FAST_SIZE_BYTES:
        loader = PyPDFLoader(str(path))
        loader_name = "PyPDFLoader"
    else:
        loader = UnstructuredPDFLoader(str(path))
        loader_name = "UnstructuredPDFLoader"
    size_mb = size_bytes / (1024 * 1024)
    print(f"Loading {path} ({size_mb:.1f} MB) with {loader_name}...")
    return loader


def _select_loader(path: Path):
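    """Return a loader for `path` based on extension, or None if unsupported."""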
    suffix = path.suffix.lower()

    if suffix == ".pdf":
        return _pdf_loader_for(path)
    if suffix in MARKDOWN_EXTS:
        return UnstructuredMarkdownLoader(str(path))
    if suffix in HTML_EXTS:
        return UnstructuredHTMLLoader(str(path))
    if suffix in TEXT_EXTS:
        return TextLoader(str(path), autodetect_encoding=True)
    if suffix in UNSTRUCTURED_EXTS:
        return UnstructuredFileLoader(str(path))
    return None


def load_documents(path: str):
    """Load supported files from a directory (recursively)."""
    root = Path(path)
    if not root.exists():
        print(f"Path not found: {root}")
        return []

    docs = []
    for file_path in _iter_files(root):
        loader = _select_loader(file_path)
        if loader is None:
            continue
        try:
            docs.extend(loader.load())
        except Exception as exc:
            print(f"Error loading {file_path}: {exc}")

    return docs


def ingest(docs_path: str):
    print(f"Loading documents from: {docs_path}")
    documents = load_documents(docs_path)

    if not documents:
        print("No documents found. Exiting.")
        sys.exit(1)

    print(f"Loaded {len(documents)} document(s). Splitting…")

    # Smaller chunks give finer-grained retrieval: chunk_size=400 (characters)
    # keeps each chunk focused enough to match specific questions, while
    # chunk_overlap=100 preserves context across chunk boundaries.
    splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)
    chunks = splitter.split_documents(documents)
    print(f"Split into {len(chunks)} chunks.")

    print("Creating embeddings & building FAISS index…")
    embeddings = OpenAIEmbeddings()
    vectorstore = FAISS.from_documents(chunks, embeddings)

    vectorstore.save_local(VECTORSTORE_DIR)
    print(f"Vector store saved to {VECTORSTORE_DIR}")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python ingest.py /path/to/docs")
        sys.exit(1)
    ingest(sys.argv[1])
