pdf_tool/pdf_extractor.py

132 lines
3.8 KiB
Python
Executable File

#!/usr/bin/env python3
"""
PDF to Text Extractor - Fast text extraction from PDF files or URLs.
Uses PyMuPDF for extremely fast text extraction.
Requires: pip install pymupdf
Usage:
pdf_extractor <pdf_file_or_url> [--output output.txt]
Options:
--output, -o Output file path (default: same dir with .txt extension)
--help, -h Show this help message
"""
import argparse
import os
import sys
import urllib.request
def download_pdf(url, timeout=30):
    """Download a PDF from a URL into the current working directory.

    The local file name is the last segment of the URL *path* when it ends
    in ".pdf"; otherwise "downloaded.pdf" is used.  Query strings and
    fragments are ignored when deriving the name.

    Args:
        url: Address of the PDF to fetch.
        timeout: Seconds to wait on blocking network operations before
            giving up.

    Returns:
        The name of the downloaded file, relative to the current directory.

    Exits the process with status 1 (after printing to stderr) if the
    download fails for any reason.
    """
    import shutil
    from urllib.parse import urlsplit

    partial = None
    try:
        # Use only the path component so "doc.pdf?token=abc#page=2" still
        # yields "doc.pdf" instead of falling back to the generic name.
        filename = urlsplit(url).path.rsplit("/", 1)[-1]
        if not filename.endswith(".pdf"):
            filename = "downloaded.pdf"

        # Stream into a side file and rename at the end, so an interrupted
        # transfer never leaves a truncated file under the final name.
        partial = f"{filename}.part"
        with urllib.request.urlopen(url, timeout=timeout) as response, \
                open(partial, "wb") as out:
            shutil.copyfileobj(response, out)
        os.replace(partial, filename)

        print(f"Downloaded to: {filename}")
        return filename
    except Exception as e:
        # Best-effort cleanup of the incomplete download; a failure here
        # must not mask the original error.
        if partial is not None and os.path.exists(partial):
            try:
                os.remove(partial)
            except OSError:
                pass
        print(f"Error downloading PDF: {e}", file=sys.stderr)
        sys.exit(1)
def extract_text(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The text of all pages, separated by a blank line, with leading and
        trailing whitespace stripped.

    Exits the process with status 1 (after printing to stderr) when PyMuPDF
    is not installed or the document cannot be opened or read.
    """
    try:
        # Newer PyMuPDF releases expose the module as "pymupdf"; older ones
        # only provide the legacy "fitz" name, so fall back to it.
        try:
            import pymupdf as fitz
        except ImportError:
            import fitz  # PyMuPDF (legacy module name)
    except ImportError:
        print("Error: pymupdf not installed.", file=sys.stderr)
        print("Install with: pip install pymupdf", file=sys.stderr)
        sys.exit(1)

    try:
        # The context manager closes the document even if a page fails to
        # render its text, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            pages = [page.get_text() for page in doc]
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}", file=sys.stderr)
        sys.exit(1)

    # Joining once avoids repeated string concatenation; the final strip()
    # gives the same result as appending "\n\n" after every page.
    return "\n\n".join(pages).strip()
def get_output_filename(input_path):
    """Return the ".txt" path that sits beside *input_path*.

    The input's final extension is replaced with ".txt"; when the input has
    no directory component the result is placed under ".".
    """
    directory, leaf = os.path.split(input_path)
    stem, _extension = os.path.splitext(leaf)
    if not directory:
        directory = "."
    return os.path.join(directory, stem + ".txt")
def main():
    """Run the command-line interface: parse arguments, extract, and save.

    The positional ``input`` is either a local PDF path or an http(s) URL.
    URLs are downloaded into the current directory first and their text is
    saved there as ``<name>_extracted.txt``; local files default to a
    ``.txt`` file next to the input.  ``-o/--output`` overrides the
    destination in both cases.

    Exits with status 1 (message on stderr) if the input is missing or not a
    file, the output would overwrite the input PDF, or the output file
    cannot be written.
    """
    import time

    parser = argparse.ArgumentParser(
        description="Extract text from PDF files or URLs (fast extraction using PyMuPDF).",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
pdf_extractor document.pdf
pdf_extractor https://example.com/doc.pdf
pdf_extractor file.pdf --output output.txt
Requires: pip install pymupdf
""",
    )
    parser.add_argument(
        "input",
        help="PDF file path or URL to extract text from",
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (default: same dir with .txt extension)",
    )
    args = parser.parse_args()

    # Determine input type and handle accordingly
    if args.input.startswith(("http://", "https://")):
        print("Downloading PDF from URL...")
        pdf_path = download_pdf(args.input)
        # Strip only the final extension; str.replace would rewrite every
        # ".pdf" occurring anywhere in the name.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        default_output = os.path.join(os.getcwd(), f"{stem}_extracted.txt")
    else:
        if not os.path.isfile(args.input):
            # Distinguish a missing path from one that exists but is a
            # directory (or other non-file), which cannot be a PDF.
            reason = "is not a file" if os.path.exists(args.input) else "does not exist"
            print(f"Error: File '{args.input}' {reason}.", file=sys.stderr)
            sys.exit(1)
        pdf_path = args.input
        default_output = get_output_filename(args.input)

    # Determine output path (an empty --output falls back to the default)
    output_path = args.output or default_output

    # Refuse to clobber the source document, e.g. when the input has no
    # ".pdf" extension or --output points back at the input.
    if os.path.exists(output_path) and os.path.samefile(output_path, pdf_path):
        print(
            f"Error: Output path '{output_path}' would overwrite the input PDF.",
            file=sys.stderr,
        )
        sys.exit(1)

    # Extract text with timing; perf_counter is monotonic, so the elapsed
    # value is not affected by system clock adjustments.
    print(f"Extracting text from {pdf_path}...")
    start_time = time.perf_counter()
    text = extract_text(pdf_path)
    elapsed = time.perf_counter() - start_time

    # Write to file
    try:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
    except (OSError, UnicodeError) as e:
        print(f"Error writing to file: {e}", file=sys.stderr)
        sys.exit(1)
    print("Text extracted successfully!")
    print(f"Output saved to: {output_path}")
    print(f"\nExtraction completed in {elapsed:.3f} seconds.")
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()