feat: Add thread-safe browser access with RLock to prevent concurrent request conflicts

- Added threading.RLock() for reentrant locking in RedditScraper class
- Wrapped _ensure_browser() initialization in lock to protect browser setup
- Improved error handling in _ensure_helpers_injected() with try/except
- Prevents 'Connection refused' errors when multiple requests hit concurrently
This commit is contained in:
Marvin 2026-03-15 11:40:44 -03:00
parent 93a6dd4097
commit 278ed10adf
1 changed file with 70 additions and 53 deletions

View File

@ -1,6 +1,7 @@
"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
import time
import threading
from typing import Optional, Dict, Any, List
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
@ -18,41 +19,39 @@ class RedditScraper:
def __init__(self):
    """Set up an idle scraper; the browser itself is launched lazily."""
    # Reentrant lock: a thread already holding it may re-enter guarded
    # methods (e.g. close() called from a locked section) without deadlock.
    self._lock = threading.RLock()
    self.driver = None          # Selenium WebDriver, created by _ensure_browser()
    self._initialized = False   # flips to True once the Firefox session is live
def _ensure_browser(self):
    """Ensure Firefox is running in optimized headless mode.

    Thread-safe: the whole check-and-launch sequence runs under the
    instance RLock so concurrent callers cannot race to start two
    browser sessions (source of 'Connection refused' errors).

    Raises:
        RuntimeError: if no geckodriver binary is found on this host.
    """
    with self._lock:
        if not self._initialized:
            options = Options()
            # Core optimization flags
            options.add_argument('--headless')     # Run without GUI (~10% faster startup)
            options.add_argument('--disable-gpu')  # Disable GPU (not needed for headless)
            options.set_preference('permissions.default.image', 2)  # Block images (~30-50% faster loads)
            # Use system geckodriver, preferring the snap location as fallback order.
            import os
            gecko_paths = [
                '/snap/bin/geckodriver',
                '/usr/local/bin/geckodriver',
                '/usr/bin/geckodriver'
            ]
            service_path = None
            for path in gecko_paths:
                if os.path.exists(path):
                    service_path = path
                    break
            if not service_path:
                raise RuntimeError("geckodriver not found")
            service = Service(executable_path=service_path)
            self.driver = webdriver.Firefox(service=service, options=options)
            # Anti-detection: mask the navigator.webdriver automation flag.
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
            # Set last, inside the lock, so other threads never see a
            # half-constructed driver.
            self._initialized = True
def _wait_for_content(self, timeout: int = 15):
"""Wait for Reddit content to load using smart waits instead of fixed sleep."""
@ -71,10 +70,13 @@ class RedditScraper:
def _ensure_helpers_injected(self):
    """Ensure JS helpers are injected into the current page context.

    Best-effort by design: any WebDriver error (stale page, navigation
    in flight, dead session) is swallowed, since helpers will simply be
    re-injected on the next use.
    """
    try:
        # Check if helpers already exist on this page
        has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
        if not has_helpers:
            self._inject_helpers()
    except Exception:
        pass  # Helpers will be re-injected on next use
def _inject_helpers(self):
"""Inject JavaScript helper functions into the page context."""
@ -298,6 +300,18 @@ class RedditScraper:
# Build result structure
posts = []
for post in posts_data:
# Only scrape comments if URL is a Reddit post (not external link)
is_reddit_post = 'reddit.com' in post['url'] and '/comments/' in post['url']
if include_comments and is_reddit_post:
try:
comments = self._scrape_post_comments(post['url'], depth)
except Exception as e:
print(f"Warning: Failed to scrape comments for {post['title'][:50]}: {e}")
comments = []
else:
comments = []
post_obj = {
"title": post['title'],
"author": post['author'] or None,
@ -305,7 +319,7 @@ class RedditScraper:
"created_utc": None,
"url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
"permalink": post['url'].replace('https://old.reddit.com', ''),
"comments": [] if not include_comments else self._scrape_post_comments(post['url'], depth)
"comments": comments
}
posts.append(post_obj)
@ -330,8 +344,11 @@ class RedditScraper:
print(traceback.format_exc())
return {"Error": f"Unexpected error during scraping: {str(e)}"}
def _scrape_post_comments(self, post_url: str, max_depth: int = 3) -> List[Dict[str, Any]]:
"""Scrape comments from a specific post using pre-injected helpers."""
def _scrape_post_comments(self, post_url: str, max_depth: int = 3, attempts=0) -> List[Dict[str, Any]]:
"""Scrape comments from a specific post with retry logic."""
if attempts > 3:
return []
try:
self.driver.get(post_url)
@ -355,6 +372,10 @@ class RedditScraper:
return raw_comments[:15] # Limit to ~15 comments total
except Exception as e:
if "connection refused" in str(e).lower() or "timeout" in str(e).lower():
print(f"Connection error, retrying... (attempt {attempts+1})")
time.sleep(0.5 * (2 ** attempts)) # Exponential backoff
return self._scrape_post_comments(post_url, max_depth, attempts + 1)
print(f"Comment scraping error: {e}")
import traceback
traceback.print_exc()
@ -370,20 +391,16 @@ class RedditScraper:
def close(self):
    """Close browser resources.

    Thread-safe and idempotent: runs under the instance lock, and a
    second call after teardown is a no-op. Errors raised by quit()
    are suppressed so shutdown never propagates an exception.
    """
    with self._lock:
        if self._initialized and self.driver:
            try:
                self.driver.quit()
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit are not silently swallowed during shutdown.
                pass
            self._initialized = False
# Create a new scraper instance per request for reliability — the previous
# module-level singleton shared one browser session across all callers.
def get_scraper():
    """Return a fresh RedditScraper instance.

    One instance per request avoids cross-request contamination of the
    shared Selenium session; each scraper manages its own lifecycle.
    """
    return RedditScraper()