132 lines
3.8 KiB
Python
Executable File
132 lines
3.8 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
PDF to Text Extractor - Fast text extraction from PDF files or URLs.
|
|
|
|
Uses PyMuPDF for extremely fast text extraction.
|
|
Requires: pip install pymupdf
|
|
|
|
Usage:
|
|
pdf_extractor <pdf_file_or_url> [--output output.txt]
|
|
|
|
Options:
|
|
--output, -o Output file path (default: same dir with .txt extension)
|
|
--help, -h Show this help message
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import urllib.request
|
|
|
|
|
|
def download_pdf(url):
    """Download the PDF at *url* into the current working directory.

    The local file name is the last segment of the URL's *path*; any query
    string or fragment is ignored, so ``.../doc.pdf?dl=1`` and
    ``.../doc.pdf#page=2`` are both saved as ``doc.pdf``. If that segment is
    empty or does not end in ``.pdf``, the file is saved as
    ``downloaded.pdf``.

    Args:
        url: Address of the PDF to fetch.

    Returns:
        The name of the downloaded file, relative to the current directory.
        It always ends in ``.pdf``.

    On any failure the error is printed to stderr and the process exits
    with status 1.
    """
    # Local import keeps this function self-contained; only urllib.request
    # is imported at file level.
    from urllib.parse import urlsplit

    try:
        # Look at the path component only, so "?query" and "#fragment"
        # suffixes cannot hide the ".pdf" extension.
        last_segment = urlsplit(url).path.split("/")[-1]
        # basename() drops any platform-specific separators left in the
        # segment, so the file always lands in the current directory.
        filename = os.path.basename(last_segment)
        if not filename.endswith(".pdf"):
            filename = "downloaded.pdf"

        urllib.request.urlretrieve(url, filename)
        print(f"Downloaded to: {filename}")
        return filename
    except Exception as e:
        # CLI boundary: report the problem and stop instead of a traceback.
        print(f"Error downloading PDF: {e}", file=sys.stderr)
        sys.exit(1)
|
|
|
|
|
|
def extract_text(pdf_path):
    """Extract the text of every page of a PDF using PyMuPDF.

    Page texts are joined with a blank line between them, and leading and
    trailing whitespace is stripped from the combined result.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The extracted text as a single string.

    If PyMuPDF is not installed, or the document cannot be opened or read,
    an error is printed to stderr and the process exits with status 1.
    """
    try:
        import fitz  # PyMuPDF
    except ImportError:
        print("Error: pymupdf not installed.", file=sys.stderr)
        print("Install with: pip install pymupdf", file=sys.stderr)
        sys.exit(1)

    try:
        # The context manager closes the document even when a page fails
        # to extract part-way through.
        with fitz.open(pdf_path) as doc:
            pages = [page.get_text() for page in doc]
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {e}", file=sys.stderr)
        sys.exit(1)

    # Joining once avoids repeated string concatenation; the final strip()
    # makes this identical to appending "\n\n" after every page.
    return "\n\n".join(pages).strip()
|
|
|
|
|
|
def get_output_filename(input_path):
    """Return the default ``.txt`` path that sits next to *input_path*.

    The input's final extension is replaced with ``.txt``. When the input
    has no directory component, the result is placed under ``"."``.
    """
    folder, file_name = os.path.split(input_path)
    stem, _extension = os.path.splitext(file_name)
    if not folder:
        folder = "."
    return os.path.join(folder, stem + ".txt")
|
|
|
|
|
|
def main():
    """Command-line entry point: extract a PDF's text into a text file.

    Parses the command line, downloads the PDF first when the input is an
    ``http://`` or ``https://`` URL, extracts the text, and writes it as
    UTF-8 to the output path.

    Default output path when ``--output`` is not given:
      * local file: same directory and base name, with a ``.txt`` extension;
      * URL: ``<downloaded name>_extracted.txt`` in the current directory.

    Exits with status 1 if the input file does not exist or the output file
    cannot be written. The download and extraction helpers exit the process
    themselves on failure.
    """
    import time  # only needed here, for timing the extraction

    parser = argparse.ArgumentParser(
        description="Extract text from PDF files or URLs (fast extraction using PyMuPDF).",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  pdf_extractor document.pdf
  pdf_extractor https://example.com/doc.pdf
  pdf_extractor file.pdf --output output.txt

Requires: pip install pymupdf
""",
    )
    parser.add_argument(
        "input",
        help="PDF file path or URL to extract text from",
    )
    parser.add_argument(
        "-o", "--output",
        help="Output file path (default: same dir with .txt extension)",
    )
    args = parser.parse_args()

    # Determine input type and handle accordingly.
    if args.input.startswith(("http://", "https://")):
        print("Downloading PDF from URL...")
        pdf_path = download_pdf(args.input)
        # Swap only the final extension. str.replace() would rewrite every
        # ".pdf" in the name, and would leave the name unchanged (so the
        # text would overwrite the PDF) if the extension differed.
        stem = os.path.splitext(os.path.basename(pdf_path))[0]
        default_output = os.path.join(os.getcwd(), f"{stem}_extracted.txt")
    else:
        if not os.path.exists(args.input):
            print(f"Error: File '{args.input}' does not exist.", file=sys.stderr)
            sys.exit(1)
        pdf_path = args.input
        default_output = get_output_filename(args.input)

    # An empty --output value falls back to the default, so the output path
    # is never empty and the text is always written to a file.
    output_path = args.output or default_output

    # Extract text with timing; perf_counter is meant for measuring
    # elapsed intervals and is unaffected by system clock adjustments.
    print(f"Extracting text from {pdf_path}...")
    start_time = time.perf_counter()
    text = extract_text(pdf_path)
    elapsed = time.perf_counter() - start_time

    try:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
    except (OSError, UnicodeError) as e:
        print(f"Error writing to file: {e}", file=sys.stderr)
        sys.exit(1)

    print("Text extracted successfully!")
    print(f"Output saved to: {output_path}")
    print(f"\nExtraction completed in {elapsed:.3f} seconds.")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|