pdf_tool/pdf_daemon.py

157 lines
4.8 KiB
Python

#!/usr/bin/env python3
"""
PDF Text Extraction Daemon - Fast API service for PDF text extraction.
Run with: uvicorn pdf_daemon:app --host 0.0.0.0 --port 8000 --reload
"""
import asyncio
import os
import tempfile
import time
from typing import Optional

import aiohttp
import fitz  # PyMuPDF
from fastapi import FastAPI, HTTPException, Query
from pydantic import BaseModel
# Application object that the @app.get(...) decorators in this module register
# their handlers on; the keyword arguments are the API's title, description
# and version metadata.
app = FastAPI(
    title="PDF Text Extraction API",
    description="Fast PDF text extraction service using PyMuPDF",
    version="1.0.0"
)
class ExtractResponse(BaseModel):
    """Response model with extracted text and metadata."""
    # Declared as the response_model of the /extract endpoint in this module.

    # The /extract handler sets this to True on its success path.
    success: bool
    # Extracted text of all pages, joined with newlines.
    text: str
    # Size of the downloaded PDF in KiB (bytes / 1024), rounded by the handler.
    file_size_kb: float
    # Number of pages in the PDF document.
    pages: int
    # Wall-clock time spent in text extraction only (download excluded), in ms.
    extraction_time_ms: float
    # Human-readable status summary.
    message: str
async def download_pdf(session: aiohttp.ClientSession, url: str) -> bytes:
    """Download the resource at ``url`` and return its raw bytes.

    Args:
        session: Open aiohttp client session used to issue the GET request.
        url: Address of the PDF to fetch.

    Returns:
        The complete response body.

    Raises:
        HTTPException: With the upstream status code when the upstream answers
            with an error status (>= 400); 502 when the upstream answers with
            any other non-200 status or the connection fails; 504 when the
            60 second total timeout expires.
    """
    try:
        async with session.get(url, timeout=aiohttp.ClientTimeout(total=60)) as response:
            if response.status != 200:
                # Only genuine error codes are forwarded. Re-using a non-error
                # upstream status (e.g. 204 or 206) as our own status would
                # make a failed download look like a success to the client.
                status_code = response.status if response.status >= 400 else 502
                raise HTTPException(
                    status_code=status_code,
                    detail=f"Failed to download PDF: {response.status}"
                )
            # NOTE(review): the whole body is buffered in memory with no size
            # cap; consider a limit if untrusted clients can choose the URL.
            return await response.read()
    except asyncio.TimeoutError:
        # str() of a timeout error is empty, so give an explicit message.
        raise HTTPException(
            status_code=504,
            detail="Failed to download PDF: timed out"
        ) from None
    except aiohttp.ClientError as exc:
        # DNS failures, refused connections, broken transfers, invalid URLs...
        raise HTTPException(
            status_code=502,
            detail=f"Failed to download PDF: {exc}"
        ) from exc
def extract_text_from_path(pdf_path: str) -> tuple[str, int]:
    """Extract the text of every page of the PDF stored at ``pdf_path``.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        A ``(text, page_count)`` tuple, where ``text`` is the text of each
        page joined with newline characters.

    Raises:
        HTTPException: 500 when the document cannot be opened or read.
    """
    try:
        # The context manager closes the document even if a page fails to
        # extract, so the file handle is never leaked on the error path.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            text = "\n".join(page.get_text() for page in doc)
        return text, page_count
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Extraction failed: {e}") from e
def _resolve_output_path(output_file: str) -> str:
    """Map a client-supplied file name onto a path directly inside /tmp.

    Raises:
        HTTPException: 400 when the name contains path components, is a
            directory reference, or contains a NUL byte.
    """
    is_plain_name = (
        output_file not in ("", ".", "..")
        and "\x00" not in output_file
        and os.path.basename(output_file) == output_file
    )
    if not is_plain_name:
        raise HTTPException(
            status_code=400,
            detail="output_file must be a plain file name without path components"
        )
    return os.path.join("/tmp", output_file)


@app.get("/extract", response_model=ExtractResponse)
async def extract_pdf_from_url(
    url: str = Query(..., description="Direct link to PDF file (must start with http:// or https://)"),
    output_file: Optional[str] = Query(None, description="Optional custom output filename")
):
    """
    Extract text from a PDF hosted at URL.

    - **url**: Direct link to PDF file (required query parameter)
    - **output_file**: Optional custom output filename; must be a plain file
      name (no directories). When given, the extracted text is also saved
      under /tmp with that name.
    """
    # Validate URL format
    if not url.startswith(("http://", "https://")):
        raise HTTPException(status_code=400, detail="URL must start with http:// or https://")

    # Reject unsafe output names before doing any network work. The name comes
    # straight from the query string, so values such as "../../etc/passwd"
    # must never reach open().
    output_path = _resolve_output_path(output_file) if output_file else None

    # NOTE(security): the URL is fetched server-side without any restriction
    # on the target host, so internal addresses are reachable through this
    # endpoint (SSRF). Restrict allowed hosts if clients are untrusted.
    try:
        # Download PDF
        async with aiohttp.ClientSession() as session:
            pdf_content = await download_pdf(session, url)

        # Same value the on-disk file size would give, without a stat call.
        file_size_kb = len(pdf_content) / 1024

        # A private per-request directory keeps concurrent requests from
        # overwriting each other's download and is removed automatically.
        with tempfile.TemporaryDirectory() as work_dir:
            pdf_path = os.path.join(work_dir, "downloaded.pdf")
            with open(pdf_path, "wb") as f:
                f.write(pdf_content)

            # Extract text. This call is synchronous and runs on the event
            # loop; perf_counter is monotonic, unlike time.time().
            extract_start = time.perf_counter()
            text, page_count = extract_text_from_path(pdf_path)
            extraction_time = (time.perf_counter() - extract_start) * 1000

        # Save to output file if specified
        if output_path is not None:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)

        return ExtractResponse(
            success=True,
            text=text,
            file_size_kb=round(file_size_kb, 2),
            pages=page_count,
            extraction_time_ms=round(extraction_time, 2),
            message=f"Successfully extracted {page_count} page(s)"
        )
    except HTTPException:
        # Already carries the intended status code and detail.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/health")
async def health_check():
    """Health check endpoint."""
    # Constant payload: the handler performs no checks of its own, so a
    # response only shows that the process is up and serving requests.
    status_report = dict(
        status="healthy",
        service="PDF Text Extraction Daemon",
    )
    return status_report
@app.get("/")
async def root():
    """API info endpoint."""
    # Hand-maintained index of the public routes, keyed by path.
    endpoint_index = {
        "/extract": {"method": "GET", "description": "Extract text from PDF URL"},
        "/health": {"method": "GET", "description": "Health check"},
    }
    api_info = {
        "name": "PDF Text Extraction API",
        "version": "1.0.0",
        "endpoints": endpoint_index,
    }
    return api_info
if __name__ == "__main__":
    # Script entry point: parse --host/--port and serve the app with uvicorn.
    # Both imports live inside the guard, so they are only needed when the
    # module is executed directly.
    import argparse
    import uvicorn

    cli = argparse.ArgumentParser(description="PDF Text Extraction Daemon")
    cli.add_argument(
        "--host",
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)",
    )
    cli.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to listen on (default: 8000)",
    )
    options = cli.parse_args()

    bind_host = options.host
    bind_port = options.port
    uvicorn.run(app, host=bind_host, port=bind_port)