feat: Add thread-safe browser access with RLock to prevent concurrent request conflicts

- Added threading.RLock() for reentrant locking in RedditScraper class
- Wrapped _ensure_browser() initialization in lock to protect browser setup
- Improved error handling in _ensure_helpers_injected() with try/except
- Prevents 'Connection refused' errors when multiple requests hit concurrently
This commit is contained in:
Marvin 2026-03-15 11:40:44 -03:00
parent 93a6dd4097
commit 278ed10adf
1 changed file with 70 additions and 53 deletions

View File

@ -1,6 +1,7 @@
"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com.""" """Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
import time import time
import threading
from typing import Optional, Dict, Any, List from typing import Optional, Dict, Any, List
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.firefox.options import Options from selenium.webdriver.firefox.options import Options
@ -18,9 +19,11 @@ class RedditScraper:
def __init__(self): def __init__(self):
self.driver = None self.driver = None
self._initialized = False self._initialized = False
self._lock = threading.RLock() # Reentrant lock to prevent deadlocks
def _ensure_browser(self): def _ensure_browser(self):
"""Ensure Firefox is running in optimized headless mode.""" """Ensure Firefox is running in optimized headless mode."""
with self._lock:
if not self._initialized: if not self._initialized:
options = Options() options = Options()
@ -29,7 +32,6 @@ class RedditScraper:
options.add_argument('--disable-gpu') # Disable GPU (not needed for headless) options.add_argument('--disable-gpu') # Disable GPU (not needed for headless)
options.set_preference('permissions.default.image', 2) # Block images (~30-50% faster loads) options.set_preference('permissions.default.image', 2) # Block images (~30-50% faster loads)
# Use system geckodriver or snap version as fallback
import os import os
gecko_paths = [ gecko_paths = [
'/snap/bin/geckodriver', '/snap/bin/geckodriver',
@ -47,10 +49,7 @@ class RedditScraper:
raise RuntimeError("geckodriver not found") raise RuntimeError("geckodriver not found")
service = Service(executable_path=service_path) service = Service(executable_path=service_path)
self.driver = webdriver.Firefox(service=service, options=options) self.driver = webdriver.Firefox(service=service, options=options)
# Anti-detection scripts
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })") self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
self._initialized = True self._initialized = True
@ -71,10 +70,13 @@ class RedditScraper:
def _ensure_helpers_injected(self):
    """Ensure JS helpers are injected into the current page context.

    Best-effort: a page navigation wipes injected script state, so this
    re-checks on every call and re-injects when the marker is missing.
    Failures are logged rather than raised, because helpers will be
    re-injected on the next use anyway.
    """
    try:
        # window.RSScraperHelpers is set by _inject_helpers(); its absence
        # means we are on a fresh page context.
        has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
        if not has_helpers:
            self._inject_helpers()
    except Exception as e:
        # Keep the deliberate best-effort behavior, but surface the
        # failure instead of swallowing it silently.
        print(f"Warning: helper injection check failed: {e}")
def _inject_helpers(self): def _inject_helpers(self):
"""Inject JavaScript helper functions into the page context.""" """Inject JavaScript helper functions into the page context."""
@ -298,6 +300,18 @@ class RedditScraper:
# Build result structure # Build result structure
posts = [] posts = []
for post in posts_data: for post in posts_data:
# Only scrape comments if URL is a Reddit post (not external link)
is_reddit_post = 'reddit.com' in post['url'] and '/comments/' in post['url']
if include_comments and is_reddit_post:
try:
comments = self._scrape_post_comments(post['url'], depth)
except Exception as e:
print(f"Warning: Failed to scrape comments for {post['title'][:50]}: {e}")
comments = []
else:
comments = []
post_obj = { post_obj = {
"title": post['title'], "title": post['title'],
"author": post['author'] or None, "author": post['author'] or None,
@ -305,7 +319,7 @@ class RedditScraper:
"created_utc": None, "created_utc": None,
"url": post['url'].replace('old.reddit.com', 'www.reddit.com'), "url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
"permalink": post['url'].replace('https://old.reddit.com', ''), "permalink": post['url'].replace('https://old.reddit.com', ''),
"comments": [] if not include_comments else self._scrape_post_comments(post['url'], depth) "comments": comments
} }
posts.append(post_obj) posts.append(post_obj)
@ -330,8 +344,11 @@ class RedditScraper:
print(traceback.format_exc()) print(traceback.format_exc())
return {"Error": f"Unexpected error during scraping: {str(e)}"} return {"Error": f"Unexpected error during scraping: {str(e)}"}
def _scrape_post_comments(self, post_url: str, max_depth: int = 3) -> List[Dict[str, Any]]: def _scrape_post_comments(self, post_url: str, max_depth: int = 3, attempts=0) -> List[Dict[str, Any]]:
"""Scrape comments from a specific post using pre-injected helpers.""" """Scrape comments from a specific post with retry logic."""
if attempts > 3:
return []
try: try:
self.driver.get(post_url) self.driver.get(post_url)
@ -355,6 +372,10 @@ class RedditScraper:
return raw_comments[:15] # Limit to ~15 comments total return raw_comments[:15] # Limit to ~15 comments total
except Exception as e: except Exception as e:
if "connection refused" in str(e).lower() or "timeout" in str(e).lower():
print(f"Connection error, retrying... (attempt {attempts+1})")
time.sleep(0.5 * (2 ** attempts)) # Exponential backoff
return self._scrape_post_comments(post_url, max_depth, attempts + 1)
print(f"Comment scraping error: {e}") print(f"Comment scraping error: {e}")
import traceback import traceback
traceback.print_exc() traceback.print_exc()
@ -370,6 +391,7 @@ class RedditScraper:
def close(self): def close(self):
"""Close browser resources.""" """Close browser resources."""
with self._lock:
if self._initialized and self.driver: if self._initialized and self.driver:
try: try:
self.driver.quit() self.driver.quit()
@ -378,12 +400,7 @@ class RedditScraper:
self._initialized = False self._initialized = False
# A fresh scraper is created per request for reliability (no shared singleton).
def get_scraper():
    """Return a new RedditScraper instance for the current request."""
    scraper = RedditScraper()
    return scraper