"""Extract text from uploaded documents using markitdown.

Supports PDF, DOCX, TXT, and images (PNG, JPG — with LLM-powered descriptions
when an OpenAI client is available).
"""

from __future__ import annotations

import asyncio
import io
from dataclasses import dataclass

from fastapi import HTTPException, UploadFile, status
from markitdown import MarkItDown, StreamInfo

# Largest upload accepted, in bytes; anything bigger is rejected.
MAX_UPLOAD_BYTES = 10 * 1024**2  # 10 MB

# Lowercase filename extension (leading dot included) -> MIME type.
# Consulted when the content type reported with an upload is not usable.
_EXT_TO_MIME: dict[str, str] = {
    ".pdf": "application/pdf",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".txt": "text/plain",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
}

# Every MIME type accepted for upload: exactly the types the extension map
# can produce (PDF, DOCX, plain text, PNG, JPEG).
_SUPPORTED_MIMES = set(_EXT_TO_MIME.values())


@dataclass(frozen=True, slots=True)
class UploadPayload:
    """Validated upload: raw bytes + resolved metadata.

    Instances are immutable (``frozen=True``). The router consumes this once
    and can both extract text *and* persist the original to object storage
    without re-reading the stream.
    """

    # File content as read from the upload.
    data: bytes
    # MIME type resolved for the upload; ``read_upload`` only builds a payload
    # when this is one of the supported types.
    mime: str
    # Filename reported with the upload, or ``None`` if there was none.
    filename: str | None
    # Filename suffix from the last dot onward; ``read_upload`` stores it
    # lowercased, or ``None`` when the filename is missing or has no dot.
    extension: str | None


def _build_converter() -> MarkItDown:
    """Create the MarkItDown converter used for text extraction.

    When the ``openai`` package can be imported and a client constructed, the
    converter is wired to that client with the ``gpt-4o-mini`` model so it
    can describe images. Any failure along the way falls back to a plain
    converter.
    """
    try:
        from openai import OpenAI

        converter = MarkItDown(llm_client=OpenAI(), llm_model="gpt-4o-mini")
    except Exception:
        # Deliberate best-effort: without a usable OpenAI client we still
        # return a working converter — images will produce minimal output.
        converter = MarkItDown()
    return converter


async def read_upload(file: UploadFile) -> UploadPayload:
    """Read and validate *file*. Returns bytes + resolved metadata.

    At most ``MAX_UPLOAD_BYTES + 1`` bytes are read from the upload, so an
    oversized file is detected without loading all of it into memory.

    Returns:
        An ``UploadPayload`` holding the file content, the resolved MIME
        type, the reported filename, and the lowercased filename extension.

    Raises:
        HTTPException: 422 when the file is larger than ``MAX_UPLOAD_BYTES``
            or its type is not one of the supported MIME types.
    """
    # One byte past the limit is enough to tell "too big" from "exactly at
    # the limit"; there is no need to buffer the rest of an oversized upload.
    data = await file.read(MAX_UPLOAD_BYTES + 1)
    if len(data) > MAX_UPLOAD_BYTES:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=f"File exceeds maximum size of {MAX_UPLOAD_BYTES // (1024 * 1024)} MB.",
        )

    mime = _resolve_mime(file)
    if mime not in _SUPPORTED_MIMES:
        raise HTTPException(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=(
                f"Unsupported file type: {mime or 'unknown'}. Accepted: PDF, DOCX, TXT, PNG, JPG."
            ),
        )

    return UploadPayload(
        data=data,
        mime=mime,
        filename=file.filename,
        extension=_ext_from_filename(file.filename),
    )


def extract_text_from_payload(payload: UploadPayload) -> str:
    """Convert an already-validated payload to Markdown text.

    The payload's MIME type, extension, and filename are passed to the
    converter as stream hints; the bytes are converted from an in-memory
    stream.

    Returns:
        The converted Markdown with surrounding whitespace stripped.

    Raises:
        HTTPException: 422 when the conversion yields no text at all.
    """
    hints = StreamInfo(
        extension=payload.extension,
        mimetype=payload.mime,
        filename=payload.filename,
    )
    converter = _build_converter()
    converted = converter.convert_stream(io.BytesIO(payload.data), stream_info=hints)

    markdown = (converted.markdown or "").strip()
    if markdown:
        return markdown

    # Nothing usable came out of the conversion (missing or whitespace-only).
    raise HTTPException(
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        detail="Could not extract any text from the uploaded file.",
    )


async def extract_text(file: UploadFile) -> str:
    """Read *file* and return extracted text as Markdown.

    Compatibility wrapper for callers that don't need the raw bytes.
    Raises ``HTTPException(422)`` for unsupported types, oversized, or empty docs.
    """
    payload = await read_upload(file)
    # The conversion is synchronous and can be slow (document parsing, and
    # possibly an LLM call for images), so run it in a worker thread instead
    # of blocking the event loop. Exceptions raised there, including
    # HTTPException, propagate to the awaiting caller unchanged.
    return await asyncio.to_thread(extract_text_from_payload, payload)


def _resolve_mime(file: UploadFile) -> str | None:
    """Return the best-guess MIME type for *file*.

    The content type reported with the upload is used when, after
    normalization, it is one of the supported types. Normalization drops any
    parameters (``text/plain; charset=utf-8`` -> ``text/plain``) and
    lowercases the value, so such uploads are not rejected merely because of
    how the client spelled the header. Otherwise the type is derived from the
    filename's extension, compared case-insensitively.

    Returns:
        A supported MIME type when one can be determined; otherwise the
        reported content type exactly as received (possibly ``None``), which
        the caller can show in its error message.
    """
    reported = file.content_type
    if reported:
        # Keep only the "type/subtype" part, ignoring parameters and case.
        essence = reported.partition(";")[0].strip().lower()
        if essence in _SUPPORTED_MIMES:
            return essence
    # Fallback: derive from extension.
    name = (file.filename or "").lower()
    for ext, ext_mime in _EXT_TO_MIME.items():
        if name.endswith(ext):
            return ext_mime
    return reported


def _ext_from_filename(filename: str | None) -> str | None:
    """Extract extension (with dot) from filename."""
    if not filename:
        return None
    dot = filename.rfind(".")
    if dot == -1:
        return None
    return filename[dot:].lower()