157 lines
4.8 KiB
Python
157 lines
4.8 KiB
Python
#!/usr/bin/env python3
"""
PDF Text Extraction Daemon - Fast API service for PDF text extraction.

Run with: uvicorn pdf_daemon:app --host 0.0.0.0 --port 8000 --reload
"""
|
|
|
|
import os
|
|
import time
|
|
import aiohttp
|
|
import fitz # PyMuPDF
|
|
from fastapi import FastAPI, HTTPException, Query
|
|
from pydantic import BaseModel
|
|
from typing import Optional
|
|
|
|
|
|
# Application object; the @app.get decorators further down register their
# handlers on it. The metadata below is passed straight to FastAPI.
app = FastAPI(
    title="PDF Text Extraction API",
    description="Fast PDF text extraction service using PyMuPDF",
    version="1.0.0",
)
|
|
|
|
|
|
class ExtractResponse(BaseModel):
    """Response model with extracted text and metadata."""

    # Set to True by the /extract handler; failures are raised as
    # HTTPException instead of being returned in this model.
    success: bool
    # Text of all pages, joined with newlines by extract_text_from_path.
    text: str
    # Size of the downloaded PDF in KiB (bytes / 1024), rounded to 2 decimals.
    file_size_kb: float
    # Number of pages in the document.
    pages: int
    # Time spent in the text-extraction step only, in milliseconds
    # (the download is not included), rounded to 2 decimals.
    extraction_time_ms: float
    # Human-readable summary, e.g. "Successfully extracted 3 page(s)".
    message: str
|
|
|
|
|
|
async def download_pdf(session: aiohttp.ClientSession, url: str) -> bytes:
    """Fetch ``url`` with the given aiohttp session and return the body bytes.

    The whole request is bounded by a 60 second total timeout.

    Raises:
        HTTPException: if the server answers with anything other than 200;
            the upstream status code is reused as the error status.
    """
    request_timeout = aiohttp.ClientTimeout(total=60)
    async with session.get(url, timeout=request_timeout) as resp:
        # Happy path first: only a plain 200 is treated as a usable download.
        if resp.status == 200:
            return await resp.read()

        raise HTTPException(
            status_code=resp.status,
            detail=f"Failed to download PDF: {resp.status}",
        )
|
|
|
|
|
|
def extract_text_from_path(pdf_path: str) -> tuple[str, int]:
    """Extract the text of every page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Filesystem path of the PDF to open with PyMuPDF.

    Returns:
        A ``(text, page_count)`` tuple where ``text`` is the per-page text
        joined with newlines and ``page_count`` is the number of pages.

    Raises:
        HTTPException: 500 with an "Extraction failed: ..." detail if the
            document cannot be opened or any page fails to extract.
    """
    try:
        # The context manager closes the document even when a page raises
        # part-way through; a bare doc.close() after the loop would be
        # skipped on error and leak the open document.
        with fitz.open(pdf_path) as doc:
            page_count = len(doc)
            text = "\n".join(page.get_text() for page in doc)
    except Exception as e:
        # Keep the original cause attached for server-side tracebacks.
        raise HTTPException(status_code=500, detail=f"Extraction failed: {str(e)}") from e

    return text, page_count
|
|
|
|
|
|
def _resolve_output_path(output_file: str, base_dir: str = "/tmp") -> str:
    """Return the path of ``output_file`` inside ``base_dir``.

    ``output_file`` arrives straight from a query parameter, so it is only
    accepted when it is a bare filename. Anything carrying a directory
    component (``../x``, ``/etc/passwd``, ``a/b``) is rejected rather than
    silently rewritten.

    Raises:
        HTTPException: 400 if ``output_file`` is not a plain filename.
    """
    name = os.path.basename(output_file)
    if name != output_file or name in ("", ".", "..") or "\x00" in name:
        raise HTTPException(
            status_code=400,
            detail="output_file must be a plain filename without path components",
        )
    return os.path.join(base_dir, name)


@app.get("/extract", response_model=ExtractResponse)
async def extract_pdf_from_url(
    url: str = Query(..., description="Direct link to PDF file (must start with http:// or https://)"),
    output_file: Optional[str] = Query(None, description="Optional custom output filename")
):
    """
    Extract text from a PDF hosted at URL.

    - **url**: Direct link to PDF file (required query parameter)
    - **output_file**: Optional custom output filename. Must be a plain
      filename (no directory components); when given, the extracted text is
      also written to that file under /tmp.

    Returns the extracted text together with the PDF size, page count and
    extraction time. Responds with 400 for an invalid `url` or `output_file`,
    with the upstream status code when the download fails, and with 500 for
    any other failure.
    """
    # Function-scope import so this handler does not depend on the module's
    # top-level import list.
    import tempfile

    # Validate URL format
    if not url.startswith(("http://", "https://")):
        raise HTTPException(status_code=400, detail="URL must start with http:// or https://")

    # Reject a bad output filename before doing any network work.
    output_path = _resolve_output_path(output_file) if output_file else None

    try:
        # Download PDF
        async with aiohttp.ClientSession() as session:
            pdf_content = await download_pdf(session, url)

        file_size_kb = len(pdf_content) / 1024

        # Each request gets its own temp file: a fixed shared path would let
        # concurrent requests overwrite each other's download mid-extraction.
        tmp = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
        try:
            with tmp:
                tmp.write(pdf_content)

            # NOTE: extract_text_from_path is a plain (synchronous) function,
            # so this call runs inline in the async handler.
            extract_start = time.perf_counter()
            text, page_count = extract_text_from_path(tmp.name)
            extraction_time = (time.perf_counter() - extract_start) * 1000
        finally:
            # Always remove the download, whether extraction succeeded or not.
            os.unlink(tmp.name)

        # Save to output file if specified
        if output_path is not None:
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(text)

        return ExtractResponse(
            success=True,
            text=text,
            file_size_kb=round(file_size_kb, 2),
            pages=page_count,
            extraction_time_ms=round(extraction_time, 2),
            message=f"Successfully extracted {page_count} page(s)"
        )

    except HTTPException:
        # Already carries the intended status/detail; pass it through as-is.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
|
|
|
|
@app.get("/health")
async def health_check():
    """Health check endpoint."""
    # Static payload: no dependency is probed, the service just reports itself.
    payload = {
        "status": "healthy",
        "service": "PDF Text Extraction Daemon",
    }
    return payload
|
|
|
|
|
|
@app.get("/")
async def root():
    """API info endpoint."""
    # Catalogue of the documented routes, keyed by path.
    endpoints = {
        "/extract": {"method": "GET", "description": "Extract text from PDF URL"},
        "/health": {"method": "GET", "description": "Health check"},
    }
    return {
        "name": "PDF Text Extraction API",
        "version": "1.0.0",
        "endpoints": endpoints,
    }
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse --host/--port and serve the app with uvicorn.
    import argparse
    import uvicorn

    cli = argparse.ArgumentParser(description="PDF Text Extraction Daemon")
    cli.add_argument(
        "--host",
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)",
    )
    cli.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to listen on (default: 8000)",
    )
    options = cli.parse_args()

    uvicorn.run(app, host=options.host, port=options.port)
|