# pdf_tool/pdf_daemon.py

#!/usr/bin/env python3
"""
PDF Text Extraction Daemon - Fast API service for PDF text extraction.

Run with: uvicorn pdf_daemon:app --host 0.0.0.0 --port 8000 --reload
"""

import os
import time
import aiohttp
import fitz  # PyMuPDF
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
from typing import Optional


# ASGI application object; the route decorators below register handlers on it.
app = FastAPI(
    version="1.0.0",
    title="PDF Text Extraction API",
    description="Fast PDF text extraction service using PyMuPDF",
)


class ExtractResponse(BaseModel):
    """Response body of the ``/extract`` endpoint: extracted text plus metadata."""
    # True when extraction completed; the /extract handler only builds this
    # model on the success path (failures are raised as HTTPException).
    success: bool
    # Text of all pages, joined with newlines.
    text: str
    # Size of the downloaded PDF in kilobytes (bytes / 1024), rounded to 2 places.
    file_size_kb: float
    # Number of pages in the PDF.
    pages: int
    # Time spent in the text-extraction step only, in milliseconds
    # (download time is not included).
    extraction_time_ms: float
    # Human-readable summary of the outcome.
    message: str


async def download_pdf(session: aiohttp.ClientSession, url: str) -> bytes:
    """Download a PDF and return its raw bytes.

    Args:
        session: Open aiohttp client session used to issue the GET request.
        url: Address of the PDF to fetch.

    Returns:
        The complete response body.

    Raises:
        HTTPException: If the upstream server answers with anything other
            than 200. Upstream 4xx/5xx codes are passed through unchanged;
            any other status (e.g. 204 or 206) is reported as 502 so that a
            failed download is never returned with a non-error status code.
    """
    # The whole request (connect + read) is capped at 60 seconds.
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
        upstream_status = response.status
        if upstream_status != 200:
            # Only genuine error codes are safe to forward as our own error
            # status; anything else becomes "Bad Gateway".
            status_code = upstream_status if upstream_status >= 400 else 502
            raise HTTPException(
                status_code=status_code,
                detail=f"Failed to download PDF: {upstream_status}"
            )
        return await response.read()


def extract_text_from_path(pdf_path: str) -> tuple[str, int]:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A ``(text, page_count)`` tuple where ``text`` is the per-page text
        joined with newlines.

    Raises:
        HTTPException: With status 500 if the file cannot be opened or any
            page fails to extract; the original error is chained as the cause.
    """
    try:
        # The context manager closes the document even when a page raises
        # mid-extraction, so the handle is never leaked.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            text = "\n".join(page.get_text() for page in doc)
        return text, page_count
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e


@app.get("/extract", response_model=ExtractResponse)
async def extract_pdf_from_url(
    url: str = Query(..., description="Direct link to PDF file (must start with http:// or https://)"),
    output_file: Optional[str] = Query(None, description="Optional custom output filename")
):
    """
    Extract text from a PDF hosted at URL.

    - **url**: Direct link to PDF file (required query parameter)
    - **output_file**: Optional custom output filename; must be a plain
      filename (no directory components). When given, the extracted text is
      also written to ``/tmp/<output_file>``.

    Returns an ``ExtractResponse`` with the text, page count, PDF size and
    extraction time. Raises ``HTTPException`` 400 for an invalid ``url`` or
    ``output_file``, propagates ``HTTPException`` from the download and
    extraction helpers, and reports any other failure as 500.
    """
    # Local import keeps this handler self-contained without touching the
    # module-level import block.
    import tempfile

    # Validate URL format
    # NOTE(review): the scheme check alone does not stop requests to internal
    # hosts (SSRF); restrict reachable hosts if this service is exposed.
    if not url.startswith(("http://", "https://")):
        raise HTTPException(status_code=400, detail="URL must start with http:// or https://")

    # Resolve the optional output path. The name comes straight from the
    # query string, so reject anything that is not a bare filename to prevent
    # writing outside /tmp (e.g. "../../etc/passwd").
    output_path = None
    if output_file:
        safe_name = os.path.basename(output_file)
        if (
            safe_name != output_file
            or safe_name in (".", "..")
            or "\x00" in output_file
        ):
            raise HTTPException(
                status_code=400,
                detail="output_file must be a plain filename without path components"
            )
        output_path = os.path.join("/tmp", safe_name)

    pdf_path = None
    try:
        # Download PDF
        async with aiohttp.ClientSession() as session:
            pdf_content = await download_pdf(session, url)

        # Save to a unique temp file so concurrent requests cannot overwrite
        # each other's download.
        fd, pdf_path = tempfile.mkstemp(suffix=".pdf")
        with os.fdopen(fd, "wb") as f:
            f.write(pdf_content)

        # Size of the downloaded PDF (identical to the on-disk size).
        file_size_kb = len(pdf_content) / 1024

        # Extract text; perf_counter is monotonic, so the measured interval
        # is immune to system clock adjustments.
        extract_start = time.perf_counter()
        text, page_count = extract_text_from_path(pdf_path)
        extraction_time = (time.perf_counter() - extract_start) * 1000

        # Save to output file if specified
        if output_path is not None:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)

        return ExtractResponse(
            success=True,
            text=text,
            file_size_kb=round(file_size_kb, 2),
            pages=page_count,
            extraction_time_ms=round(extraction_time, 2),
            message=f"Successfully extracted {page_count} page(s)"
        )

    except HTTPException:
        # Already carries the right status/detail; pass it through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        # Best-effort removal of the temporary PDF; a cleanup failure must
        # not mask the real result or error of the request.
        if pdf_path is not None:
            try:
                os.remove(pdf_path)
            except OSError:
                pass


@app.get("/health")
async def health_check():
    """Liveness probe: returns a fixed status payload."""
    payload = {"status": "healthy", "service": "PDF Text Extraction Daemon"}
    return payload


@app.get("/")
async def root():
    """Describe the service: its name, version and available endpoints."""
    endpoints = {
        "/extract": {"method": "GET", "description": "Extract text from PDF URL"},
        "/health": {"method": "GET", "description": "Health check"},
    }
    return {
        "name": "PDF Text Extraction API",
        "version": "1.0.0",
        "endpoints": endpoints,
    }


if __name__ == "__main__":
    # Direct-execution entry point: parse --host/--port and serve the app
    # with uvicorn (no auto-reload, unlike the command in the module docstring).
    import argparse
    import uvicorn

    cli = argparse.ArgumentParser(description="PDF Text Extraction Daemon")
    cli.add_argument("--host", default="0.0.0.0", help="Host to bind to (default: 0.0.0.0)")
    cli.add_argument("--port", type=int, default=8000, help="Port to listen on (default: 8000)")
    options = cli.parse_args()

    uvicorn.run(app, host=options.host, port=options.port)