"""document_loading_demo.py — TextLoader, PyPDFLoader, CSVLoader, WebBaseLoader"""

from pathlib import Path

from langchain_community.document_loaders import (
    CSVLoader,
    PyPDFLoader,
    TextLoader,
    WebBaseLoader,
)

# Sample inputs live in a folder that sits in the same directory as this script.
SAMPLES = Path(__file__).with_name("document_loading_samples")


def preview(label: str, docs: list, *, max_chars: int = 180) -> None:
    """Print a short, single-line preview of every loaded document.

    Prints a banner with ``label`` and the number of documents, then for each
    document its index and metadata followed by a snippet of its text.

    Args:
        label: Heading shown in the banner line (e.g. the loader's name).
        docs: Loaded documents; each item must expose ``metadata`` and
            ``page_content`` attributes.
        max_chars: Maximum number of characters of text shown per document
            before the snippet is cut and marked with an ellipsis.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars < 0:
        raise ValueError(f"max_chars must be non-negative, got {max_chars}")

    print(f"\n=== {label} ({len(docs)} chunk(s)) ===")
    for i, doc in enumerate(docs):
        print(f"--- [{i}] metadata: {doc.metadata}")
        # Collapse every whitespace run (newlines, "\r", tabs, repeated
        # spaces) into one space so the snippet really stays on one line;
        # replacing only "\n" lets carriage returns and tabs leak through.
        text = " ".join(doc.page_content.split())
        snippet = text[:max_chars] + ("…" if len(text) > max_chars else "")
        print(snippet)


def main() -> None:
    """Run each document loader on the sample data and preview its output.

    Loads the sample text file, CSV file and (if present) PDF from the
    ``SAMPLES`` folder, then fetches one web page, printing a preview of the
    documents each loader returns.

    Raises:
        SystemExit: If the ``SAMPLES`` folder does not exist.
    """
    if not SAMPLES.exists():
        raise SystemExit(
            f"Missing folder: {SAMPLES}\n"
            "Download document_loading_demo.zip from the lesson page "
            "and unzip so this folder sits next to the script."
        )

    text_docs = TextLoader(
        str(SAMPLES / "html_basics.txt"),
        encoding="utf-8",
    ).load()
    preview("TextLoader", text_docs)

    # Pin the encoding just like the text file above; otherwise the CSV is
    # decoded with the platform default, which is not UTF-8 everywhere.
    csv_docs = CSVLoader(
        str(SAMPLES / "html_tags.csv"),
        encoding="utf-8",
    ).load()
    preview("CSVLoader", csv_docs)

    pdf_path = SAMPLES / "html_basics.pdf"
    if pdf_path.exists():
        pdf_docs = PyPDFLoader(str(pdf_path)).load()
        preview("PyPDFLoader", pdf_docs)
    else:
        print("\n=== PyPDFLoader (skipped) ===")
        print(f"Place a small PDF at {pdf_path}")

    print("\n=== WebBaseLoader (fetching google.com) ===")
    try:
        web_docs = WebBaseLoader("https://www.google.com").load()
    except OSError as err:
        # Connection-level failures (offline, DNS, timeout) surface as
        # OSError subclasses; skip this step, as with a missing PDF, instead
        # of ending an otherwise successful demo with a traceback.
        print(f"Skipped: could not fetch the page ({err})")
    else:
        preview("WebBaseLoader", web_docs)


# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()