feat: Add thread-safe browser access with RLock to prevent concurrent request conflicts

- Added threading.RLock() for reentrant locking in RedditScraper class
- Wrapped _ensure_browser() initialization in lock to protect browser setup
- Improved error handling in _ensure_helpers_injected() with try/except
- Prevents 'Connection refused' errors when multiple requests hit concurrently
This commit is contained in:
Marvin 2026-03-15 11:40:44 -03:00
parent 93a6dd4097
commit 278ed10adf
1 changed file with 70 additions and 53 deletions

View File

@ -1,6 +1,7 @@
"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
import time
import threading
from typing import Optional, Dict, Any, List
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
@ -18,41 +19,39 @@ class RedditScraper:
def __init__(self):
    """Set up an idle scraper; the browser itself is launched lazily."""
    # Reentrant lock: a thread already holding it may re-enter guarded
    # methods (e.g. close() called from a locked section) without deadlock.
    self._lock = threading.RLock()
    self.driver = None          # Selenium WebDriver, created by _ensure_browser()
    self._initialized = False   # flips to True once the Firefox session is live
def _ensure_browser(self):
    """Ensure Firefox is running in optimized headless mode.

    Thread-safe: the whole check-and-launch sequence runs under the
    instance RLock so concurrent callers cannot race to start two
    browser sessions (source of 'Connection refused' errors).

    Raises:
        RuntimeError: if no geckodriver binary is found on this host.
    """
    with self._lock:
        if not self._initialized:
            options = Options()
            # Core optimization flags
            options.add_argument('--headless')     # Run without GUI (~10% faster startup)
            options.add_argument('--disable-gpu')  # Disable GPU (not needed for headless)
            options.set_preference('permissions.default.image', 2)  # Block images (~30-50% faster loads)
            # Use system geckodriver, preferring the snap location as fallback order.
            import os
            gecko_paths = [
                '/snap/bin/geckodriver',
                '/usr/local/bin/geckodriver',
                '/usr/bin/geckodriver'
            ]
            service_path = None
            for path in gecko_paths:
                if os.path.exists(path):
                    service_path = path
                    break
            if not service_path:
                raise RuntimeError("geckodriver not found")
            service = Service(executable_path=service_path)
            self.driver = webdriver.Firefox(service=service, options=options)
            # Anti-detection: mask the navigator.webdriver automation flag.
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
            # Set last, inside the lock, so other threads never see a
            # half-constructed driver.
            self._initialized = True
def _wait_for_content(self, timeout: int = 15):
"""Wait for Reddit content to load using smart waits instead of fixed sleep."""
@ -71,10 +70,13 @@ class RedditScraper:
def _ensure_helpers_injected(self):
    """Ensure JS helpers are injected into the current page context.

    Best-effort by design: any WebDriver error (stale page, navigation
    in flight, dead session) is swallowed, since helpers will simply be
    re-injected on the next use.
    """
    try:
        # Check if helpers already exist on this page
        has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
        if not has_helpers:
            self._inject_helpers()
    except Exception:
        pass  # Helpers will be re-injected on next use
def _inject_helpers(self):
"""Inject JavaScript helper functions into the page context."""
@ -298,6 +300,18 @@ class RedditScraper:
# Build result structure
posts = []
for post in posts_data:
# Only scrape comments if URL is a Reddit post (not external link)
is_reddit_post = 'reddit.com' in post['url'] and '/comments/' in post['url']
if include_comments and is_reddit_post:
try:
comments = self._scrape_post_comments(post['url'], depth)
except Exception as e:
print(f"Warning: Failed to scrape comments for {post['title'][:50]}: {e}")
comments = []
else:
comments = []
post_obj = {
"title": post['title'],
"author": post['author'] or None,
@ -305,7 +319,7 @@ class RedditScraper:
"created_utc": None,
"url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
"permalink": post['url'].replace('https://old.reddit.com', ''),
"comments": [] if not include_comments else self._scrape_post_comments(post['url'], depth)
"comments": comments
}
posts.append(post_obj)
@ -330,8 +344,11 @@ class RedditScraper:
print(traceback.format_exc())
return {"Error": f"Unexpected error during scraping: {str(e)}"}
def _scrape_post_comments(self, post_url: str, max_depth: int = 3) -> List[Dict[str, Any]]:
"""Scrape comments from a specific post using pre-injected helpers."""
def _scrape_post_comments(self, post_url: str, max_depth: int = 3, attempts=0) -> List[Dict[str, Any]]:
"""Scrape comments from a specific post with retry logic."""
if attempts > 3:
return []
try:
self.driver.get(post_url)
@ -355,6 +372,10 @@ class RedditScraper:
return raw_comments[:15] # Limit to ~15 comments total
except Exception as e:
if "connection refused" in str(e).lower() or "timeout" in str(e).lower():
print(f"Connection error, retrying... (attempt {attempts+1})")
time.sleep(0.5 * (2 ** attempts)) # Exponential backoff
return self._scrape_post_comments(post_url, max_depth, attempts + 1)
print(f"Comment scraping error: {e}")
import traceback
traceback.print_exc()
@ -370,20 +391,16 @@ class RedditScraper:
def close(self):
    """Close browser resources.

    Thread-safe and idempotent: runs under the instance lock, and a
    second call after teardown is a no-op. Errors raised by quit()
    are suppressed so shutdown never propagates an exception.
    """
    with self._lock:
        if self._initialized and self.driver:
            try:
                self.driver.quit()
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt /
                # SystemExit are not silently swallowed during shutdown.
                pass
            self._initialized = False
# Create a new scraper instance per request for reliability — the previous
# module-level singleton shared one browser session across all callers.
def get_scraper():
    """Return a fresh RedditScraper instance.

    One instance per request avoids cross-request contamination of the
    shared Selenium session; each scraper manages its own lifecycle.
    """
    return RedditScraper()