feat: Add thread-safe browser access with RLock to prevent concurrent request conflicts

- Added threading.RLock() for reentrant locking in RedditScraper class
- Wrapped _ensure_browser() initialization in lock to protect browser setup
- Improved error handling in _ensure_helpers_injected() with try/except
- Prevents 'Connection refused' errors when multiple requests hit concurrently
This commit is contained in:
Marvin 2026-03-15 11:40:44 -03:00
parent 93a6dd4097
commit 278ed10adf
1 changed file with 70 additions and 53 deletions

View File

@ -1,6 +1,7 @@
"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com.""" """Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
import time import time
import threading
from typing import Optional, Dict, Any, List from typing import Optional, Dict, Any, List
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.firefox.options import Options from selenium.webdriver.firefox.options import Options
@ -18,41 +19,39 @@ class RedditScraper:
def __init__(self): def __init__(self):
self.driver = None self.driver = None
self._initialized = False self._initialized = False
self._lock = threading.RLock() # Reentrant lock to prevent deadlocks
def _ensure_browser(self): def _ensure_browser(self):
"""Ensure Firefox is running in optimized headless mode.""" """Ensure Firefox is running in optimized headless mode."""
if not self._initialized: with self._lock:
options = Options() if not self._initialized:
options = Options()
# Core optimization flags
options.add_argument('--headless') # Run without GUI (~10% faster startup) # Core optimization flags
options.add_argument('--disable-gpu') # Disable GPU (not needed for headless) options.add_argument('--headless') # Run without GUI (~10% faster startup)
options.set_preference('permissions.default.image', 2) # Block images (~30-50% faster loads) options.add_argument('--disable-gpu') # Disable GPU (not needed for headless)
options.set_preference('permissions.default.image', 2) # Block images (~30-50% faster loads)
# Use system geckodriver or snap version as fallback
import os import os
gecko_paths = [ gecko_paths = [
'/snap/bin/geckodriver', '/snap/bin/geckodriver',
'/usr/local/bin/geckodriver', '/usr/local/bin/geckodriver',
'/usr/bin/geckodriver' '/usr/bin/geckodriver'
] ]
service_path = None service_path = None
for path in gecko_paths: for path in gecko_paths:
if os.path.exists(path): if os.path.exists(path):
service_path = path service_path = path
break break
if not service_path: if not service_path:
raise RuntimeError("geckodriver not found") raise RuntimeError("geckodriver not found")
service = Service(executable_path=service_path) service = Service(executable_path=service_path)
self.driver = webdriver.Firefox(service=service, options=options)
self.driver = webdriver.Firefox(service=service, options=options) self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
self._initialized = True
# Anti-detection scripts
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
self._initialized = True
def _wait_for_content(self, timeout: int = 15): def _wait_for_content(self, timeout: int = 15):
"""Wait for Reddit content to load using smart waits instead of fixed sleep.""" """Wait for Reddit content to load using smart waits instead of fixed sleep."""
@ -71,10 +70,13 @@ class RedditScraper:
def _ensure_helpers_injected(self): def _ensure_helpers_injected(self):
"""Ensure JS helpers are injected into current page context.""" """Ensure JS helpers are injected into current page context."""
# Check if helpers already exist on this page try:
has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;") # Check if helpers already exist on this page
if not has_helpers: has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
self._inject_helpers() if not has_helpers:
self._inject_helpers()
except Exception:
pass # Helpers will be re-injected on next use
def _inject_helpers(self): def _inject_helpers(self):
"""Inject JavaScript helper functions into the page context.""" """Inject JavaScript helper functions into the page context."""
@ -298,6 +300,18 @@ class RedditScraper:
# Build result structure # Build result structure
posts = [] posts = []
for post in posts_data: for post in posts_data:
# Only scrape comments if URL is a Reddit post (not external link)
is_reddit_post = 'reddit.com' in post['url'] and '/comments/' in post['url']
if include_comments and is_reddit_post:
try:
comments = self._scrape_post_comments(post['url'], depth)
except Exception as e:
print(f"Warning: Failed to scrape comments for {post['title'][:50]}: {e}")
comments = []
else:
comments = []
post_obj = { post_obj = {
"title": post['title'], "title": post['title'],
"author": post['author'] or None, "author": post['author'] or None,
@ -305,7 +319,7 @@ class RedditScraper:
"created_utc": None, "created_utc": None,
"url": post['url'].replace('old.reddit.com', 'www.reddit.com'), "url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
"permalink": post['url'].replace('https://old.reddit.com', ''), "permalink": post['url'].replace('https://old.reddit.com', ''),
"comments": [] if not include_comments else self._scrape_post_comments(post['url'], depth) "comments": comments
} }
posts.append(post_obj) posts.append(post_obj)
@ -330,8 +344,11 @@ class RedditScraper:
print(traceback.format_exc()) print(traceback.format_exc())
return {"Error": f"Unexpected error during scraping: {str(e)}"} return {"Error": f"Unexpected error during scraping: {str(e)}"}
def _scrape_post_comments(self, post_url: str, max_depth: int = 3) -> List[Dict[str, Any]]: def _scrape_post_comments(self, post_url: str, max_depth: int = 3, attempts=0) -> List[Dict[str, Any]]:
"""Scrape comments from a specific post using pre-injected helpers.""" """Scrape comments from a specific post with retry logic."""
if attempts > 3:
return []
try: try:
self.driver.get(post_url) self.driver.get(post_url)
@ -355,6 +372,10 @@ class RedditScraper:
return raw_comments[:15] # Limit to ~15 comments total return raw_comments[:15] # Limit to ~15 comments total
except Exception as e: except Exception as e:
if "connection refused" in str(e).lower() or "timeout" in str(e).lower():
print(f"Connection error, retrying... (attempt {attempts+1})")
time.sleep(0.5 * (2 ** attempts)) # Exponential backoff
return self._scrape_post_comments(post_url, max_depth, attempts + 1)
print(f"Comment scraping error: {e}") print(f"Comment scraping error: {e}")
import traceback import traceback
traceback.print_exc() traceback.print_exc()
@ -370,20 +391,16 @@ class RedditScraper:
def close(self): def close(self):
"""Close browser resources.""" """Close browser resources."""
if self._initialized and self.driver: with self._lock:
try: if self._initialized and self.driver:
self.driver.quit() try:
except: self.driver.quit()
pass except:
self._initialized = False pass
self._initialized = False
# Singleton pattern for scraper instance # Create new scraper instance per request for reliability
_scraper_instance = None
def get_scraper(): def get_scraper():
"""Get singleton scraper instance.""" """Get fresh scraper instance."""
global _scraper_instance return RedditScraper()
if _scraper_instance is None:
_scraper_instance = RedditScraper()
return _scraper_instance