#!/usr/bin/env python3
"""
PDF Text Extraction Daemon - Fast API service for PDF text extraction.

Run with: uvicorn pdf_daemon:app --host 0.0.0.0 --port 8000 --reload
"""

import os
import tempfile
import time
from typing import Optional

import aiohttp
import fitz  # PyMuPDF
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel

# Directory that receives the optional text dump requested via `output_file`.
OUTPUT_DIR = "/tmp"

app = FastAPI(
    title="PDF Text Extraction API",
    description="Fast PDF text extraction service using PyMuPDF",
    version="1.0.0",
)


class ExtractResponse(BaseModel):
    """Response model with extracted text and metadata."""

    # Always True on the success path; failures are reported as HTTP errors.
    success: bool
    # Text of all pages, joined with a newline between pages.
    text: str
    # Size of the downloaded PDF in KiB, rounded to two decimals.
    file_size_kb: float
    # Number of pages in the document.
    pages: int
    # Wall-clock time spent in text extraction only (download excluded).
    extraction_time_ms: float
    # Human-readable status message.
    message: str


async def download_pdf(session: aiohttp.ClientSession, url: str) -> bytes:
    """Download PDF from URL using aiohttp session.

    Args:
        session: Open aiohttp client session used for the request.
        url: Address to fetch.

    Returns:
        The complete response body.

    Raises:
        HTTPException: With the upstream status code when the response
            status is anything other than 200.
    """
    # A 60 s total timeout bounds connect + transfer for the whole request.
    async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
        if response.status != 200:
            raise HTTPException(
                status_code=response.status,
                detail=f"Failed to download PDF: {response.status}",
            )
        return await response.read()


def extract_text_from_path(pdf_path: str) -> tuple[str, int]:
    """Extract text from PDF file and return (text, page_count).

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        A tuple of the page texts joined by newlines and the page count.

    Raises:
        HTTPException: 500 if the document cannot be opened or parsed.
    """
    try:
        # The context manager closes the document even when a page fails
        # to parse, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            text = "\n".join(page.get_text() for page in doc)
        return text, page_count
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e


def _resolve_output_path(output_file: str) -> str:
    """Map a client-supplied filename to a path directly inside OUTPUT_DIR.

    Raises:
        HTTPException: 400 if the name contains path components or a NUL
            byte, which would let a caller write outside OUTPUT_DIR.
    """
    is_plain_name = (
        output_file not in (".", "..")
        and "\x00" not in output_file
        and os.path.basename(output_file) == output_file
    )
    if not is_plain_name:
        raise HTTPException(
            status_code=400,
            detail="output_file must be a plain filename without path components",
        )
    return os.path.join(OUTPUT_DIR, output_file)


@app.get("/extract", response_model=ExtractResponse)
async def extract_pdf_from_url(
    url: str = Query(..., description="Direct link to PDF file (must start with http:// or https://)"),
    output_file: Optional[str] = Query(None, description="Optional custom output filename"),
):
    """
    Extract text from a PDF hosted at URL.

    - **url**: Direct link to PDF file (required query parameter)
    - **output_file**: Optional custom output filename
    """
    # Validate URL format
    if not url.startswith(("http://", "https://")):
        raise HTTPException(status_code=400, detail="URL must start with http:// or https://")

    # Validate the output name before doing any network or disk work.
    # An empty string is treated the same as "not provided".
    output_path = _resolve_output_path(output_file) if output_file else None

    try:
        # Download PDF
        async with aiohttp.ClientSession() as session:
            pdf_content = await download_pdf(session, url)

        file_size_kb = len(pdf_content) / 1024

        # Each request gets its own scratch directory so concurrent requests
        # cannot overwrite each other's PDF; it is removed on exit.
        with tempfile.TemporaryDirectory() as work_dir:
            pdf_path = os.path.join(work_dir, "downloaded.pdf")
            with open(pdf_path, "wb") as f:
                f.write(pdf_content)

            # Extract text (perf_counter is monotonic, unlike time.time).
            extract_start = time.perf_counter()
            text, page_count = extract_text_from_path(pdf_path)
            extraction_time = (time.perf_counter() - extract_start) * 1000

        # Save to output file if specified
        if output_path is not None:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)

        return ExtractResponse(
            success=True,
            text=text,
            file_size_kb=round(file_size_kb, 2),
            pages=page_count,
            extraction_time_ms=round(extraction_time, 2),
            message=f"Successfully extracted {page_count} page(s)",
        )
    except HTTPException:
        # Already carries the intended status code; pass through untouched.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


@app.get("/health")
async def health_check():
    """Health check endpoint."""
    return {"status": "healthy", "service": "PDF Text Extraction Daemon"}


@app.get("/")
async def root():
    """API info endpoint."""
    return {
        "name": "PDF Text Extraction API",
        "version": "1.0.0",
        "endpoints": {
            "/extract": {"method": "GET", "description": "Extract text from PDF URL"},
            "/health": {"method": "GET", "description": "Health check"},
        },
    }


if __name__ == "__main__":
    import argparse

    import uvicorn

    parser = argparse.ArgumentParser(description="PDF Text Extraction Daemon")
    parser.add_argument("--host", default="0.0.0.0", help="Host to bind to (default: 0.0.0.0)")
    parser.add_argument("--port", type=int, default=8000, help="Port to listen on (default: 8000)")
    args = parser.parse_args()

    uvicorn.run(app, host=args.host, port=args.port)