From 278ed10adf4b6cdbc8da8c02d90896f6c3f5dc6b Mon Sep 17 00:00:00 2001
From: Marvin
Date: Sun, 15 Mar 2026 11:40:44 -0300
Subject: [PATCH] feat: Add thread-safe browser access with RLock to prevent
 concurrent request conflicts

- Added threading.RLock() for reentrant locking in RedditScraper class
- Wrapped _ensure_browser() initialization in lock to protect browser setup
- Improved error handling in _ensure_helpers_injected() with try/except
- Prevents 'Connection refused' errors when multiple requests hit concurrently
---
 scraper/selenium_scrapers.py | 123 ++++++++++++++++++++---------------
 1 file changed, 70 insertions(+), 53 deletions(-)

diff --git a/scraper/selenium_scrapers.py b/scraper/selenium_scrapers.py
index 8e9d9f9..589c8a7 100644
--- a/scraper/selenium_scrapers.py
+++ b/scraper/selenium_scrapers.py
@@ -1,6 +1,7 @@
 """Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
 
 import time
+import threading
 from typing import Optional, Dict, Any, List
 from selenium import webdriver
 from selenium.webdriver.firefox.options import Options
@@ -18,41 +19,39 @@ class RedditScraper:
     def __init__(self):
         self.driver = None
         self._initialized = False
+        self._lock = threading.RLock()  # Reentrant lock to prevent deadlocks
 
     def _ensure_browser(self):
         """Ensure Firefox is running in optimized headless mode."""
-        if not self._initialized:
-            options = Options()
-
-            # Core optimization flags
-            options.add_argument('--headless')  # Run without GUI (~10% faster startup)
-            options.add_argument('--disable-gpu')  # Disable GPU (not needed for headless)
-            options.set_preference('permissions.default.image', 2)  # Block images (~30-50% faster loads)
-
-            # Use system geckodriver or snap version as fallback
-            import os
-            gecko_paths = [
-                '/snap/bin/geckodriver',
-                '/usr/local/bin/geckodriver',
-                '/usr/bin/geckodriver'
-            ]
-
-            service_path = None
-            for path in gecko_paths:
-                if os.path.exists(path):
-                    service_path = path
-                    break
-
-            if not service_path:
-                raise RuntimeError("geckodriver not found")
-
-            service = Service(executable_path=service_path)
-
-            self.driver = webdriver.Firefox(service=service, options=options)
-
-            # Anti-detection scripts
-            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
-            self._initialized = True
+        with self._lock:
+            if not self._initialized:
+                options = Options()
+
+                # Core optimization flags
+                options.add_argument('--headless')  # Run without GUI (~10% faster startup)
+                options.add_argument('--disable-gpu')  # Disable GPU (not needed for headless)
+                options.set_preference('permissions.default.image', 2)  # Block images (~30-50% faster loads)
+
+                import os
+                gecko_paths = [
+                    '/snap/bin/geckodriver',
+                    '/usr/local/bin/geckodriver',
+                    '/usr/bin/geckodriver'
+                ]
+
+                service_path = None
+                for path in gecko_paths:
+                    if os.path.exists(path):
+                        service_path = path
+                        break
+
+                if not service_path:
+                    raise RuntimeError("geckodriver not found")
+
+                service = Service(executable_path=service_path)
+                self.driver = webdriver.Firefox(service=service, options=options)
+                self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
+                self._initialized = True
 
     def _wait_for_content(self, timeout: int = 15):
         """Wait for Reddit content to load using smart waits instead of fixed sleep."""
@@ -71,10 +70,13 @@ class RedditScraper:
 
     def _ensure_helpers_injected(self):
         """Ensure JS helpers are injected into current page context."""
-        # Check if helpers already exist on this page
-        has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
-        if not has_helpers:
-            self._inject_helpers()
+        try:
+            # Check if helpers already exist on this page
+            has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
+            if not has_helpers:
+                self._inject_helpers()
+        except Exception:
+            pass  # Helpers will be re-injected on next use
 
     def _inject_helpers(self):
         """Inject JavaScript helper functions into the page context."""
@@ -298,6 +300,18 @@ class RedditScraper:
             # Build result structure
             posts = []
             for post in posts_data:
+                # Only scrape comments if URL is a Reddit post (not external link)
+                is_reddit_post = 'reddit.com' in post['url'] and '/comments/' in post['url']
+
+                if include_comments and is_reddit_post:
+                    try:
+                        comments = self._scrape_post_comments(post['url'], depth)
+                    except Exception as e:
+                        print(f"Warning: Failed to scrape comments for {post['title'][:50]}: {e}")
+                        comments = []
+                else:
+                    comments = []
+
                 post_obj = {
                     "title": post['title'],
                     "author": post['author'] or None,
@@ -305,7 +319,7 @@ class RedditScraper:
                     "created_utc": None,
                     "url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
                     "permalink": post['url'].replace('https://old.reddit.com', ''),
-                    "comments": [] if not include_comments else self._scrape_post_comments(post['url'], depth)
+                    "comments": comments
                 }
                 posts.append(post_obj)
 
@@ -330,8 +344,11 @@ class RedditScraper:
             print(traceback.format_exc())
             return {"Error": f"Unexpected error during scraping: {str(e)}"}
 
-    def _scrape_post_comments(self, post_url: str, max_depth: int = 3) -> List[Dict[str, Any]]:
-        """Scrape comments from a specific post using pre-injected helpers."""
+    def _scrape_post_comments(self, post_url: str, max_depth: int = 3, attempts=0) -> List[Dict[str, Any]]:
+        """Scrape comments from a specific post with retry logic."""
+        if attempts > 3:
+            return []
+
         try:
             self.driver.get(post_url)
 
@@ -355,6 +372,10 @@ class RedditScraper:
             return raw_comments[:15]  # Limit to ~15 comments total
 
         except Exception as e:
+            if "connection refused" in str(e).lower() or "timeout" in str(e).lower():
+                print(f"Connection error, retrying... (attempt {attempts+1})")
+                time.sleep(0.5 * (2 ** attempts))  # Exponential backoff
+                return self._scrape_post_comments(post_url, max_depth, attempts + 1)
             print(f"Comment scraping error: {e}")
             import traceback
             traceback.print_exc()
@@ -370,20 +391,16 @@ class RedditScraper:
 
     def close(self):
         """Close browser resources."""
-        if self._initialized and self.driver:
-            try:
-                self.driver.quit()
-            except:
-                pass
-            self._initialized = False
+        with self._lock:
+            if self._initialized and self.driver:
+                try:
+                    self.driver.quit()
+                except:
+                    pass
+                self._initialized = False
 
 
-# Singleton pattern for scraper instance
-_scraper_instance = None
-
+# Create new scraper instance per request for reliability
 def get_scraper():
-    """Get singleton scraper instance."""
-    global _scraper_instance
-    if _scraper_instance is None:
-        _scraper_instance = RedditScraper()
-    return _scraper_instance
+    """Get fresh scraper instance."""
+    return RedditScraper()