reddit_scraper/scraper/selenium_scrapers.py

"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""

import time
import threading
from typing import Optional, Dict, Any, List
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

from .cache import get_cache


class RedditScraper:
    """Scrapes OLD Reddit pages (old.reddit.com) using a headless Firefox browser."""

    def __init__(self):
        self.driver = None
        self._initialized = False
        self._lock = threading.RLock()  # Reentrant lock to prevent deadlocks

    def _ensure_browser(self):
        """Ensure Firefox is running in optimized headless mode."""
        with self._lock:
            if not self._initialized:
                options = Options()

                # Core optimization flags
                options.add_argument('--headless')  # Run without GUI (~10% faster startup)
                options.add_argument('--disable-gpu')  # Disable GPU (not needed for headless)
                options.set_preference('permissions.default.image', 2)  # Block images (~30-50% faster loads)

                import os
                gecko_paths = [
                    '/snap/bin/geckodriver',
                    '/usr/local/bin/geckodriver',
                    '/usr/bin/geckodriver'
                ]

                service_path = None
                for path in gecko_paths:
                    if os.path.exists(path):
                        service_path = path
                        break

                if not service_path:
                    raise RuntimeError("geckodriver not found")

                service = Service(executable_path=service_path)
                self.driver = webdriver.Firefox(service=service, options=options)
                self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
                self._initialized = True

    def _wait_for_content(self, timeout: int = 15):
        """Wait for Reddit content to load using smart waits instead of fixed sleep."""
        try:
            # Wait for post containers or subreddit container
            WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, '.thing, .subreddit'))
            )

            # Additional wait for dynamic content (load more button)
            time.sleep(1)  # Small delay to allow lazy-loaded content

        except Exception:
            # Fall back to fixed wait if selectors don't appear
            time.sleep(3)

    def _ensure_helpers_injected(self):
        """Ensure JS helpers are injected into current page context."""
        try:
            # Check if helpers already exist on this page
            has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
            if not has_helpers:
                self._inject_helpers()
        except Exception:
            pass  # Helpers will be re-injected on next use

    def _inject_helpers(self):
        """Inject JavaScript helper functions into the page context."""
        helpers = '''
        window.RSScraperHelpers = {
            // Get all posts with their metadata in a single efficient call
            getPosts: function(maxCount) {
                const posts = [];
                const postElements = document.querySelectorAll('.thing');

                for (const el of Array.from(postElements)) {
                    if (posts.length >= maxCount) break;

                    const titleLink = el.querySelector('a.title.may-blank');
                    if (!titleLink) continue;

                    const title = titleLink.textContent.trim();
                    if (title.length < 3 ||
                        title.includes('Microsoft') ||
                        title.startsWith('r/')) continue;

                    let score = 0;
                    try {
                        const scoreEl = el.querySelector('.score.unvoted');
                        if (scoreEl) {
                            score = parseInt(scoreEl.textContent.replace(/[^0-9]/g, '')) || 0;
                        }
                    } catch(e) {}

                    let author = null;
                    try {
                        const authorLink = el.querySelector('.midcol .author, .author');
                        if (authorLink && authorLink.textContent.trim()) {
                            author = authorLink.textContent.trim();
                        }
                    } catch(e) {}

                    let url = titleLink.href || '';
                    try {
                        const cleanUrl = new URL(url);
                        ['utm_source', 'utm_medium', 'utm_campaign'].forEach(param => {
                            cleanUrl.searchParams.delete(param);
                        });
                        url = cleanUrl.toString();
                    } catch(e) {}

                    posts.push({
                        title: title,
                        author: author,
                        score: score,
                        url: url
                    });
                }

                return posts;
            },

            // Expand all "more comments" expanders recursively before scraping
            expandAllComments: function() {
                let expanded = true;
                let iterations = 0;
                const maxIterations = 5;

                while (expanded && iterations < maxIterations) {
                    expanded = false;
                    iterations++;

                    // Find all "more comments" buttons/links
                    const moreLinks = document.querySelectorAll('a.morecomments, .more');
                    for (const link of moreLinks) {
                        if (link.style.display !== 'none' && !link.classList.contains('done')) {
                            try {
                                link.click();
                                expanded = true;
                            } catch(e) {}
                        }
                    }

                    // Small delay for dynamic content to load
                    if (expanded) {
                        this.sleep(300);
                    }
                }

                return iterations;
            },

            // Get comments with nested replies in a single call
            getComments: function(maxDepth, maxCount) {
                const comments = [];
                const processedIds = new Set();

                // Find all comment containers
                const commentContainers = document.querySelectorAll('.comment, .thing.comment');

                for (const container of Array.from(commentContainers)) {
                    if (comments.length >= maxCount) break;

                    const id = container.getAttribute('data-comment-id') ||
                               container.querySelector('[id$="-container"]')?.id ||
                               Math.random().toString(36).substr(2, 9);

                    if (processedIds.has(id)) continue;
                    processedIds.add(id);

                    const authorEl = container.querySelector('a.author, .author');
                    const bodyEl = container.querySelector('.usertext-body, .md, div.comment-body');
                    const scoreEl = container.querySelector('.score');
                    const createdEl = container.querySelector('.timestamp a');

                    if (!bodyEl || !bodyEl.textContent.trim()) continue;

                    const text = bodyEl.textContent.trim();
                    const lowerText = text.toLowerCase();

                    // Skip UI elements and short content
                    if (text.length < 10 ||
                        text.includes('open menu') ||
                        text.includes('reddit home') ||
                        text.includes('permalink')) continue;

                    comments.push({
                        author: authorEl?.textContent.trim() || 'unknown',
                        body: text,
                        score: scoreEl ? parseInt(scoreEl.textContent.replace(/[^0-9]/g, '')) || 0 : null,
                        created_utc: createdEl?.getAttribute('title') || null,
                        depth: this._getNestedDepth(container),
                        replies: []
                    });
                }

                // Sort by depth to handle nesting
                comments.sort((a, b) => a.depth - b.depth);

                // Build nested structure up to maxDepth
                const buildHierarchy = (comments, maxD) => {
                    return comments.filter(c => c.depth <= maxD).map(c => ({
                        author: c.author,
                        body: c.body,
                        score: c.score,
                        created_utc: c.created_utc,
                        replies: buildHierarchy(
                            comments.filter(r =>
                                r.depth === c.depth + 1 &&
                                c.replies.length < 5 // Limit replies per comment
                            ), maxD - (c.depth + 1) > 0 ? maxD - (c.depth + 1) : 0
                        )
                    }));
                };

                return buildHierarchy(comments, maxDepth);
            },

            // Helper to get nesting depth of a comment element
            _getNestedDepth: function(element) {
                let depth = 0;
                while (element && element.parentElement) {
                    if (element.classList.contains('child')) depth++;
                    else if (element.classList.contains('comment')) break;
                    element = element.parentElement;
                }
                return Math.min(depth, 10); // Cap at 10 levels
            },

            sleep: function(ms) {
                const start = Date.now();
                while (Date.now() - start < ms) {}
            }
        };
        '''
        self.driver.execute_script(helpers)

    def scrape_subreddit_top(
        self,
        subreddit: str,
        limit: int = 10,
        time_range: str = "week",
        depth: int = 3,
        include_comments: bool = True
    ) -> Dict[str, Any]:
        """
        Scrape top posts from a subreddit using Selenium on old.reddit.com.

        Args:
            subreddit: Name of the subreddit (without 'r/')
            limit: Number of top posts to retrieve
            time_range: Time filter ('hour', 'day', 'week', 'month', 'year', 'all')
            depth: Maximum comment nesting depth
            include_comments: Whether to scrape comments

        Returns:
            Dict containing scraped data or error information
        """
        # Check cache first (skip if comments requested, as they change frequently)
        if not include_comments:
            cached_result = get_cache().get(
                subreddit=subreddit,
                limit=limit,
                time_range=time_range,
                depth=depth
            )
            if cached_result is not None:
                return cached_result

        self._ensure_browser()

        try:
            # Navigate to OLD Reddit for cleaner DOM structure
            url = f"https://old.reddit.com/r/{subreddit}/top/?t={time_range}"
            self.driver.get(url)

            # Use smart wait instead of fixed sleep (adapts to actual page load speed)
            self._wait_for_content(timeout=15)

            # Ensure helper functions are available before scraping
            self._ensure_helpers_injected()

            # Extract post data using pre-injected helper function (executed once per page load)
            posts_data = self.driver.execute_script('return window.RSScraperHelpers.getPosts(arguments[0]);', limit)

            # Build result structure
            posts = []
            for post in posts_data:
                # Only scrape comments if URL is a Reddit post (not external link)
                is_reddit_post = 'reddit.com' in post['url'] and '/comments/' in post['url']

                if include_comments and is_reddit_post:
                    try:
                        comments = self._scrape_post_comments(post['url'], depth)
                    except Exception as e:
                        print(f"Warning: Failed to scrape comments for {post['title'][:50]}: {e}")
                        comments = []
                else:
                    comments = []

                post_obj = {
                    "title": post['title'],
                    "author": post['author'] or None,
                    "score": post['score'],
                    "created_utc": None,
                    "url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
                    "permalink": post['url'].replace('https://old.reddit.com', ''),
                    "comments": comments
                }
                posts.append(post_obj)

            # Build final result structure
            result = {
                "subreddit": subreddit,
                "time_range": time_range,
                "limit": len(posts),
                "posts_count": len(posts),
                "data": posts
            }

            # Cache the result (only if no comments requested)
            if not include_comments:
                get_cache().set(result, subreddit=subreddit, limit=limit, time_range=time_range, depth=depth)

            return result

        except Exception as e:
            import traceback
            print(f"Error during scraping: {e}")
            print(traceback.format_exc())
            return {"Error": f"Unexpected error during scraping: {str(e)}"}

    def _scrape_post_comments(self, post_url: str, max_depth: int = 3, attempts=0) -> List[Dict[str, Any]]:
        """Scrape comments from a specific post with retry logic."""
        if attempts > 3:
            return []

        try:
            self.driver.get(post_url)

            # Use smart wait instead of fixed sleep
            self._wait_for_content(timeout=10)
            self._ensure_helpers_injected()

            # Expand all "more comments" links before scraping (batched operation)
            expanded_iterations = self.driver.execute_script('return window.RSScraperHelpers.expandAllComments();')

            # Additional wait for dynamically loaded content
            if expanded_iterations > 0:
                time.sleep(1.5)

            # Extract all comments with nested structure in single call (batched)
            raw_comments = self.driver.execute_script(
                'return window.RSScraperHelpers.getComments(arguments[0], arguments[1]);',
                max_depth, 20  # max_depth, max_count
            )

            return raw_comments[:15]  # Limit to ~15 comments total

        except Exception as e:
            if "connection refused" in str(e).lower() or "timeout" in str(e).lower():
                print(f"Connection error, retrying... (attempt {attempts+1})")
                time.sleep(0.5 * (2 ** attempts))  # Exponential backoff
                return self._scrape_post_comments(post_url, max_depth, attempts + 1)
            print(f"Comment scraping error: {e}")
            import traceback
            traceback.print_exc()
            return []

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text content."""
        if not text:
            return ""
        lines = text.split("\n")
        cleaned = [line.strip() for line in lines if line.strip()]
        return " ".join(cleaned)

    def close(self):
        """Close browser resources."""
        with self._lock:
            if self._initialized and self.driver:
                try:
                    self.driver.quit()
                except:
                    pass
                self._initialized = False


# Create new scraper instance per request for reliability
def get_scraper():
    """Get fresh scraper instance."""
    return RedditScraper()