pdf_tool/pdf_extractor.py

#!/usr/bin/env python3
"""
PDF to Text Extractor - Fast text extraction from PDF files or URLs.

Uses PyMuPDF for extremely fast text extraction.
Requires: pip install pymupdf

Usage:
    pdf_extractor <pdf_file_or_url> [--output output.txt]

Options:
    --output, -o  Output file path (default: <name>.txt next to a local PDF,
                  or <name>_extracted.txt in the current directory for a URL)
    --help, -h    Show this help message
"""

import argparse
import os
import sys
import urllib.request


def download_pdf(url):
    """Download a PDF from ``url`` into the current working directory.

    The local file name is taken from the last segment of the URL *path*:
    the query string and fragment are ignored and percent-escapes are
    decoded. If that segment does not end in ``.pdf`` (compared
    case-insensitively) the file is saved as ``downloaded.pdf``.

    Args:
        url: Location of the PDF to fetch.

    Returns:
        The name of the saved file, relative to the current directory.
        The returned name always ends in lowercase ``.pdf``.

    Exits:
        Prints a message to stderr and terminates the process with status 1
        if anything goes wrong while downloading.
    """
    # Function-scope import so the block does not depend on ``urllib.parse``
    # being reachable only as a side effect of ``import urllib.request``.
    from urllib.parse import unquote, urlparse

    try:
        # Only the path component names the document; "?query" and
        # "#fragment" must not leak into the file name.
        url_path = unquote(urlparse(url).path)
        # Take the basename *after* decoding so an encoded separator
        # ("%2F", "%5C") cannot steer the write outside the current directory.
        filename = os.path.basename(url_path.replace("\\", "/"))

        if filename.lower().endswith(".pdf"):
            # Normalise the extension so the result always ends in ".pdf".
            filename = filename[:-4] + ".pdf"
        else:
            filename = "downloaded.pdf"

        urllib.request.urlretrieve(url, filename)
        print(f"Downloaded to: {filename}")
        return filename
    except Exception as e:
        # Top-level CLI boundary: report any failure and stop.
        print(f"Error downloading PDF: {e}", file=sys.stderr)
        sys.exit(1)


def extract_text(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages separated by a blank line, with leading and
        trailing whitespace stripped. An empty string if no text was found.

    Exits:
        Prints a message to stderr and terminates the process with status 1
        if PyMuPDF is not installed or the document cannot be read.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("Error: pymupdf not installed.", file=sys.stderr)
        print("Install with: pip install pymupdf", file=sys.stderr)
        sys.exit(1)

    try:
        doc = fitz.open(pdf_path)
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            # Release the document even when a page fails to parse.
            doc.close()
        # Joining with a blank line and stripping yields the same result as
        # appending "\n\n" after every page and stripping afterwards.
        return "\n\n".join(page_texts).strip()
    except Exception as e:
        # Top-level CLI boundary: report any failure and stop.
        print(f"Error extracting text from {pdf_path}: {e}", file=sys.stderr)
        sys.exit(1)


def get_output_filename(input_path):
    """Return the ``.txt`` path that sits next to ``input_path``.

    The final extension of the input's file name is dropped and ``.txt`` is
    appended. When ``input_path`` has no directory part, ``.`` is used.
    """
    folder, leaf = os.path.split(input_path)
    stem, _extension = os.path.splitext(leaf)
    if not folder:
        folder = "."
    return os.path.join(folder, stem + ".txt")


def main():
    """Command-line entry point: extract text from a PDF file or URL.

    Parses the command line, downloads the PDF first when the input is an
    ``http://`` or ``https://`` URL, extracts its text and writes the result
    to a UTF-8 text file.

    The output path is ``--output`` when given. Otherwise it is
    ``<name>.txt`` next to a local input file, or ``<name>_extracted.txt``
    in the current working directory for a downloaded file.

    Exits:
        Terminates the process with status 1 if a local input file does not
        exist or the output file cannot be written.
    """
    # Function-scope import: ``time`` is not among the module-level imports.
    import time

    parser = argparse.ArgumentParser(
        description="Extract text from PDF files or URLs (fast extraction using PyMuPDF).",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  pdf_extractor document.pdf
  pdf_extractor https://example.com/doc.pdf
  pdf_extractor file.pdf --output output.txt

Requires: pip install pymupdf
        """
    )

    parser.add_argument(
        "input",
        help="PDF file path or URL to extract text from"
    )

    parser.add_argument(
        "-o", "--output",
        help="Output file path (default: same dir with .txt extension)"
    )

    args = parser.parse_args()

    # URL schemes are case-insensitive, so compare in lowercase.
    if args.input.lower().startswith(("http://", "https://")):
        print("Downloading PDF from URL...")
        pdf_path = download_pdf(args.input)
        # Strip only the final extension; a blanket str.replace(".pdf", ...)
        # would also rewrite ".pdf" occurring earlier in the name.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        default_output = os.path.join(os.getcwd(), f"{stem}_extracted.txt")
    else:
        if not os.path.exists(args.input):
            print(f"Error: File '{args.input}' does not exist.", file=sys.stderr)
            sys.exit(1)
        pdf_path = args.input
        default_output = get_output_filename(args.input)

    # An explicit --output wins; the default is never empty, so the text is
    # always written to a file.
    output_path = args.output or default_output

    print(f"Extracting text from {pdf_path}...")
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by wall-clock adjustments.
    start_time = time.perf_counter()
    text = extract_text(pdf_path)
    elapsed = time.perf_counter() - start_time

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
    except (OSError, UnicodeError) as e:
        print(f"Error writing to file: {e}", file=sys.stderr)
        sys.exit(1)

    print("Text extracted successfully!")
    print(f"Output saved to: {output_path}")
    print(f"\nExtraction completed in {elapsed:.3f} seconds.")


if __name__ == "__main__":
    main()