#!/usr/bin/env python3
"""
PDF to Text Extractor - Fast text extraction from PDF files or URLs.

Uses PyMuPDF for extremely fast text extraction.

Requires: pip install pymupdf

Usage:
    pdf_extractor <input> [--output output.txt]

Options:
    --output, -o    Output file path. Defaults to <name>.txt next to a local
                    input file, or <name>_extracted.txt in the current
                    directory when the input is a URL.
    --help, -h      Show this help message
"""

import argparse
import os
import sys
import time
import urllib.parse
import urllib.request

# Local filename used when the URL path does not end in a usable ".pdf" name.
_FALLBACK_DOWNLOAD_NAME = "downloaded.pdf"


def _filename_from_url(url):
    """Return the local filename to use for a PDF downloaded from *url*."""
    # Only the URL path names the document; a query string or fragment
    # (e.g. "doc.pdf?token=abc#page=2") must not defeat the ".pdf" check.
    candidate = urllib.parse.urlsplit(url).path.rsplit("/", 1)[-1]
    if candidate.lower().endswith(".pdf"):
        return candidate
    return _FALLBACK_DOWNLOAD_NAME


def download_pdf(url):
    """Download a PDF from *url* into the current directory.

    The local filename is the last segment of the URL path when it ends in
    ".pdf" (case-insensitive); otherwise "downloaded.pdf" is used. An
    existing file with the same name is overwritten.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        The filename (relative to the current directory) that was written.

    Exits:
        Prints a message to stderr and exits with status 1 if the URL cannot
        be parsed or the download fails.
    """
    try:
        filename = _filename_from_url(url)
        urllib.request.urlretrieve(url, filename)
    except Exception as e:  # CLI boundary: report any failure and stop.
        print(f"Error downloading PDF: {e}", file=sys.stderr)
        sys.exit(1)
    print(f"Downloaded to: {filename}")
    return filename


def extract_text(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages, separated by a blank line, with leading and
        trailing whitespace stripped.

    Exits:
        Prints a message to stderr and exits with status 1 if PyMuPDF is not
        installed or the document cannot be opened or read.
    """
    # Imported lazily so a missing dependency produces a friendly message
    # instead of a traceback at module import time.
    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("Error: pymupdf not installed.", file=sys.stderr)
        print("Install with: pip install pymupdf", file=sys.stderr)
        sys.exit(1)

    try:
        doc = fitz.open(pdf_path)
        try:
            pages = [page.get_text() for page in doc]
        finally:
            # Release the document even if a page fails to extract.
            doc.close()
    except Exception as e:  # CLI boundary: report any failure and stop.
        print(f"Error extracting text from {pdf_path}: {e}", file=sys.stderr)
        sys.exit(1)

    return "\n\n".join(pages).strip()


def get_output_filename(input_path, suffix=".txt", directory=None):
    """Build an output path from the input file's base name.

    Args:
        input_path: Path of the input file; only its final extension is
            removed, so dots elsewhere in the name are preserved.
        suffix: Text appended to the base name (default ".txt").
        directory: Directory for the output file. When None, the directory
            of *input_path* is used, or "." if it has none.

    Returns:
        The output path as a string.
    """
    base_name = os.path.splitext(os.path.basename(input_path))[0]
    if directory is None:
        directory = os.path.dirname(input_path) or "."
    return os.path.join(directory, f"{base_name}{suffix}")


def _build_parser():
    """Create the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Extract text from PDF files or URLs (fast extraction using PyMuPDF).",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  pdf_extractor document.pdf
  pdf_extractor https://example.com/doc.pdf
  pdf_extractor file.pdf --output output.txt

Requires: pip install pymupdf
        """,
    )
    parser.add_argument(
        "input",
        help="PDF file path or URL to extract text from",
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (default: same dir with .txt extension)",
    )
    return parser


def _resolve_input(source):
    """Return (pdf_path, default_output) for a local path or an http(s) URL."""
    if source.startswith(("http://", "https://")):
        print("Downloading PDF from URL...")
        pdf_path = download_pdf(source)
        # Strip only the final extension; a blanket str.replace(".pdf", ...)
        # would also rewrite ".pdf" appearing in the middle of the name.
        default_output = get_output_filename(
            pdf_path, suffix="_extracted.txt", directory=os.getcwd()
        )
        return pdf_path, default_output

    if not os.path.exists(source):
        print(f"Error: File '{source}' does not exist.", file=sys.stderr)
        sys.exit(1)
    return source, get_output_filename(source)


def main():
    """Run the command-line interface.

    Parses the arguments, downloads the PDF first when the input is a URL,
    extracts its text and writes it as UTF-8 to the chosen output file.
    Exits with status 1 on any failure.
    """
    args = _build_parser().parse_args()

    pdf_path, default_output = _resolve_input(args.input)
    # default_output is never empty, so there is always a file to write to.
    output_path = args.output or default_output

    print(f"Extracting text from {pdf_path}...")
    # perf_counter is monotonic, so the timing is immune to clock changes.
    start_time = time.perf_counter()
    text = extract_text(pdf_path)
    elapsed = time.perf_counter() - start_time

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
    except Exception as e:  # CLI boundary: report any failure and stop.
        print(f"Error writing to file: {e}", file=sys.stderr)
        sys.exit(1)
    print("Text extracted successfully!")
    print(f"Output saved to: {output_path}")

    print(f"\nExtraction completed in {elapsed:.3f} seconds.")


if __name__ == "__main__":
    main()