feat: Add thread-safe browser access with RLock to prevent concurrent request conflicts
- Added threading.RLock() for reentrant locking in RedditScraper class
- Wrapped _ensure_browser() initialization in lock to protect browser setup
- Improved error handling in _ensure_helpers_injected() with try/except
- Prevents 'Connection refused' errors when multiple requests hit concurrently
This commit is contained in:
parent
93a6dd4097
commit
278ed10adf
|
|
@ -1,6 +1,7 @@
|
|||
"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
|
||||
|
||||
import time
|
||||
import threading
|
||||
from typing import Optional, Dict, Any, List
|
||||
from selenium import webdriver
|
||||
from selenium.webdriver.firefox.options import Options
|
||||
|
|
@ -18,41 +19,39 @@ class RedditScraper:
|
|||
def __init__(self):
|
||||
self.driver = None
|
||||
self._initialized = False
|
||||
self._lock = threading.RLock() # Reentrant lock to prevent deadlocks
|
||||
|
||||
def _ensure_browser(self):
|
||||
"""Ensure Firefox is running in optimized headless mode."""
|
||||
if not self._initialized:
|
||||
options = Options()
|
||||
with self._lock:
|
||||
if not self._initialized:
|
||||
options = Options()
|
||||
|
||||
# Core optimization flags
|
||||
options.add_argument('--headless') # Run without GUI (~10% faster startup)
|
||||
options.add_argument('--disable-gpu') # Disable GPU (not needed for headless)
|
||||
options.set_preference('permissions.default.image', 2) # Block images (~30-50% faster loads)
|
||||
# Core optimization flags
|
||||
options.add_argument('--headless') # Run without GUI (~10% faster startup)
|
||||
options.add_argument('--disable-gpu') # Disable GPU (not needed for headless)
|
||||
options.set_preference('permissions.default.image', 2) # Block images (~30-50% faster loads)
|
||||
|
||||
# Use system geckodriver or snap version as fallback
|
||||
import os
|
||||
gecko_paths = [
|
||||
'/snap/bin/geckodriver',
|
||||
'/usr/local/bin/geckodriver',
|
||||
'/usr/bin/geckodriver'
|
||||
]
|
||||
import os
|
||||
gecko_paths = [
|
||||
'/snap/bin/geckodriver',
|
||||
'/usr/local/bin/geckodriver',
|
||||
'/usr/bin/geckodriver'
|
||||
]
|
||||
|
||||
service_path = None
|
||||
for path in gecko_paths:
|
||||
if os.path.exists(path):
|
||||
service_path = path
|
||||
break
|
||||
service_path = None
|
||||
for path in gecko_paths:
|
||||
if os.path.exists(path):
|
||||
service_path = path
|
||||
break
|
||||
|
||||
if not service_path:
|
||||
raise RuntimeError("geckodriver not found")
|
||||
if not service_path:
|
||||
raise RuntimeError("geckodriver not found")
|
||||
|
||||
service = Service(executable_path=service_path)
|
||||
|
||||
self.driver = webdriver.Firefox(service=service, options=options)
|
||||
|
||||
# Anti-detection scripts
|
||||
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
|
||||
self._initialized = True
|
||||
service = Service(executable_path=service_path)
|
||||
self.driver = webdriver.Firefox(service=service, options=options)
|
||||
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
|
||||
self._initialized = True
|
||||
|
||||
def _wait_for_content(self, timeout: int = 15):
|
||||
"""Wait for Reddit content to load using smart waits instead of fixed sleep."""
|
||||
|
|
@ -71,10 +70,13 @@ class RedditScraper:
|
|||
|
||||
def _ensure_helpers_injected(self):
|
||||
"""Ensure JS helpers are injected into current page context."""
|
||||
# Check if helpers already exist on this page
|
||||
has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
|
||||
if not has_helpers:
|
||||
self._inject_helpers()
|
||||
try:
|
||||
# Check if helpers already exist on this page
|
||||
has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
|
||||
if not has_helpers:
|
||||
self._inject_helpers()
|
||||
except Exception:
|
||||
pass # Helpers will be re-injected on next use
|
||||
|
||||
def _inject_helpers(self):
|
||||
"""Inject JavaScript helper functions into the page context."""
|
||||
|
|
@ -298,6 +300,18 @@ class RedditScraper:
|
|||
# Build result structure
|
||||
posts = []
|
||||
for post in posts_data:
|
||||
# Only scrape comments if URL is a Reddit post (not external link)
|
||||
is_reddit_post = 'reddit.com' in post['url'] and '/comments/' in post['url']
|
||||
|
||||
if include_comments and is_reddit_post:
|
||||
try:
|
||||
comments = self._scrape_post_comments(post['url'], depth)
|
||||
except Exception as e:
|
||||
print(f"Warning: Failed to scrape comments for {post['title'][:50]}: {e}")
|
||||
comments = []
|
||||
else:
|
||||
comments = []
|
||||
|
||||
post_obj = {
|
||||
"title": post['title'],
|
||||
"author": post['author'] or None,
|
||||
|
|
@ -305,7 +319,7 @@ class RedditScraper:
|
|||
"created_utc": None,
|
||||
"url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
|
||||
"permalink": post['url'].replace('https://old.reddit.com', ''),
|
||||
"comments": [] if not include_comments else self._scrape_post_comments(post['url'], depth)
|
||||
"comments": comments
|
||||
}
|
||||
posts.append(post_obj)
|
||||
|
||||
|
|
@ -330,8 +344,11 @@ class RedditScraper:
|
|||
print(traceback.format_exc())
|
||||
return {"Error": f"Unexpected error during scraping: {str(e)}"}
|
||||
|
||||
def _scrape_post_comments(self, post_url: str, max_depth: int = 3) -> List[Dict[str, Any]]:
|
||||
"""Scrape comments from a specific post using pre-injected helpers."""
|
||||
def _scrape_post_comments(self, post_url: str, max_depth: int = 3, attempts=0) -> List[Dict[str, Any]]:
|
||||
"""Scrape comments from a specific post with retry logic."""
|
||||
if attempts > 3:
|
||||
return []
|
||||
|
||||
try:
|
||||
self.driver.get(post_url)
|
||||
|
||||
|
|
@ -355,6 +372,10 @@ class RedditScraper:
|
|||
return raw_comments[:15] # Limit to ~15 comments total
|
||||
|
||||
except Exception as e:
|
||||
if "connection refused" in str(e).lower() or "timeout" in str(e).lower():
|
||||
print(f"Connection error, retrying... (attempt {attempts+1})")
|
||||
time.sleep(0.5 * (2 ** attempts)) # Exponential backoff
|
||||
return self._scrape_post_comments(post_url, max_depth, attempts + 1)
|
||||
print(f"Comment scraping error: {e}")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
|
@ -370,20 +391,16 @@ class RedditScraper:
|
|||
|
||||
def close(self):
|
||||
"""Close browser resources."""
|
||||
if self._initialized and self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
except:
|
||||
pass
|
||||
self._initialized = False
|
||||
with self._lock:
|
||||
if self._initialized and self.driver:
|
||||
try:
|
||||
self.driver.quit()
|
||||
except:
|
||||
pass
|
||||
self._initialized = False
|
||||
|
||||
|
||||
# Singleton pattern for scraper instance
|
||||
_scraper_instance = None
|
||||
|
||||
# Create new scraper instance per request for reliability
|
||||
def get_scraper():
|
||||
"""Get singleton scraper instance."""
|
||||
global _scraper_instance
|
||||
if _scraper_instance is None:
|
||||
_scraper_instance = RedditScraper()
|
||||
return _scraper_instance
|
||||
"""Get fresh scraper instance."""
|
||||
return RedditScraper()
|
||||
|
|
|
|||
Loading…
Reference in New Issue