feat: Add thread-safe browser access with RLock to prevent concurrent request conflicts
- Added threading.RLock() for reentrant locking in RedditScraper class - Wrapped _ensure_browser() initialization in lock to protect browser setup - Improved error handling in _ensure_helpers_injected() with try/except - Prevents 'Connection refused' errors when multiple requests hit concurrently
This commit is contained in:
parent
93a6dd4097
commit
278ed10adf
|
|
@ -1,6 +1,7 @@
|
||||||
"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
|
"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
import threading
|
||||||
from typing import Optional, Dict, Any, List
|
from typing import Optional, Dict, Any, List
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.firefox.options import Options
|
from selenium.webdriver.firefox.options import Options
|
||||||
|
|
@ -18,41 +19,39 @@ class RedditScraper:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.driver = None
|
self.driver = None
|
||||||
self._initialized = False
|
self._initialized = False
|
||||||
|
self._lock = threading.RLock() # Reentrant lock to prevent deadlocks
|
||||||
|
|
||||||
def _ensure_browser(self):
|
def _ensure_browser(self):
|
||||||
"""Ensure Firefox is running in optimized headless mode."""
|
"""Ensure Firefox is running in optimized headless mode."""
|
||||||
if not self._initialized:
|
with self._lock:
|
||||||
options = Options()
|
if not self._initialized:
|
||||||
|
options = Options()
|
||||||
# Core optimization flags
|
|
||||||
options.add_argument('--headless') # Run without GUI (~10% faster startup)
|
# Core optimization flags
|
||||||
options.add_argument('--disable-gpu') # Disable GPU (not needed for headless)
|
options.add_argument('--headless') # Run without GUI (~10% faster startup)
|
||||||
options.set_preference('permissions.default.image', 2) # Block images (~30-50% faster loads)
|
options.add_argument('--disable-gpu') # Disable GPU (not needed for headless)
|
||||||
|
options.set_preference('permissions.default.image', 2) # Block images (~30-50% faster loads)
|
||||||
# Use system geckodriver or snap version as fallback
|
|
||||||
import os
|
import os
|
||||||
gecko_paths = [
|
gecko_paths = [
|
||||||
'/snap/bin/geckodriver',
|
'/snap/bin/geckodriver',
|
||||||
'/usr/local/bin/geckodriver',
|
'/usr/local/bin/geckodriver',
|
||||||
'/usr/bin/geckodriver'
|
'/usr/bin/geckodriver'
|
||||||
]
|
]
|
||||||
|
|
||||||
service_path = None
|
service_path = None
|
||||||
for path in gecko_paths:
|
for path in gecko_paths:
|
||||||
if os.path.exists(path):
|
if os.path.exists(path):
|
||||||
service_path = path
|
service_path = path
|
||||||
break
|
break
|
||||||
|
|
||||||
if not service_path:
|
if not service_path:
|
||||||
raise RuntimeError("geckodriver not found")
|
raise RuntimeError("geckodriver not found")
|
||||||
|
|
||||||
service = Service(executable_path=service_path)
|
service = Service(executable_path=service_path)
|
||||||
|
self.driver = webdriver.Firefox(service=service, options=options)
|
||||||
self.driver = webdriver.Firefox(service=service, options=options)
|
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
|
||||||
|
self._initialized = True
|
||||||
# Anti-detection scripts
|
|
||||||
self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
|
|
||||||
self._initialized = True
|
|
||||||
|
|
||||||
def _wait_for_content(self, timeout: int = 15):
|
def _wait_for_content(self, timeout: int = 15):
|
||||||
"""Wait for Reddit content to load using smart waits instead of fixed sleep."""
|
"""Wait for Reddit content to load using smart waits instead of fixed sleep."""
|
||||||
|
|
@ -71,10 +70,13 @@ class RedditScraper:
|
||||||
|
|
||||||
def _ensure_helpers_injected(self):
|
def _ensure_helpers_injected(self):
|
||||||
"""Ensure JS helpers are injected into current page context."""
|
"""Ensure JS helpers are injected into current page context."""
|
||||||
# Check if helpers already exist on this page
|
try:
|
||||||
has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
|
# Check if helpers already exist on this page
|
||||||
if not has_helpers:
|
has_helpers = self.driver.execute_script("return !!window.RSScraperHelpers;")
|
||||||
self._inject_helpers()
|
if not has_helpers:
|
||||||
|
self._inject_helpers()
|
||||||
|
except Exception:
|
||||||
|
pass # Helpers will be re-injected on next use
|
||||||
|
|
||||||
def _inject_helpers(self):
|
def _inject_helpers(self):
|
||||||
"""Inject JavaScript helper functions into the page context."""
|
"""Inject JavaScript helper functions into the page context."""
|
||||||
|
|
@ -298,6 +300,18 @@ class RedditScraper:
|
||||||
# Build result structure
|
# Build result structure
|
||||||
posts = []
|
posts = []
|
||||||
for post in posts_data:
|
for post in posts_data:
|
||||||
|
# Only scrape comments if URL is a Reddit post (not external link)
|
||||||
|
is_reddit_post = 'reddit.com' in post['url'] and '/comments/' in post['url']
|
||||||
|
|
||||||
|
if include_comments and is_reddit_post:
|
||||||
|
try:
|
||||||
|
comments = self._scrape_post_comments(post['url'], depth)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Warning: Failed to scrape comments for {post['title'][:50]}: {e}")
|
||||||
|
comments = []
|
||||||
|
else:
|
||||||
|
comments = []
|
||||||
|
|
||||||
post_obj = {
|
post_obj = {
|
||||||
"title": post['title'],
|
"title": post['title'],
|
||||||
"author": post['author'] or None,
|
"author": post['author'] or None,
|
||||||
|
|
@ -305,7 +319,7 @@ class RedditScraper:
|
||||||
"created_utc": None,
|
"created_utc": None,
|
||||||
"url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
|
"url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
|
||||||
"permalink": post['url'].replace('https://old.reddit.com', ''),
|
"permalink": post['url'].replace('https://old.reddit.com', ''),
|
||||||
"comments": [] if not include_comments else self._scrape_post_comments(post['url'], depth)
|
"comments": comments
|
||||||
}
|
}
|
||||||
posts.append(post_obj)
|
posts.append(post_obj)
|
||||||
|
|
||||||
|
|
@ -330,8 +344,11 @@ class RedditScraper:
|
||||||
print(traceback.format_exc())
|
print(traceback.format_exc())
|
||||||
return {"Error": f"Unexpected error during scraping: {str(e)}"}
|
return {"Error": f"Unexpected error during scraping: {str(e)}"}
|
||||||
|
|
||||||
def _scrape_post_comments(self, post_url: str, max_depth: int = 3) -> List[Dict[str, Any]]:
|
def _scrape_post_comments(self, post_url: str, max_depth: int = 3, attempts=0) -> List[Dict[str, Any]]:
|
||||||
"""Scrape comments from a specific post using pre-injected helpers."""
|
"""Scrape comments from a specific post with retry logic."""
|
||||||
|
if attempts > 3:
|
||||||
|
return []
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.driver.get(post_url)
|
self.driver.get(post_url)
|
||||||
|
|
||||||
|
|
@ -355,6 +372,10 @@ class RedditScraper:
|
||||||
return raw_comments[:15] # Limit to ~15 comments total
|
return raw_comments[:15] # Limit to ~15 comments total
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
if "connection refused" in str(e).lower() or "timeout" in str(e).lower():
|
||||||
|
print(f"Connection error, retrying... (attempt {attempts+1})")
|
||||||
|
time.sleep(0.5 * (2 ** attempts)) # Exponential backoff
|
||||||
|
return self._scrape_post_comments(post_url, max_depth, attempts + 1)
|
||||||
print(f"Comment scraping error: {e}")
|
print(f"Comment scraping error: {e}")
|
||||||
import traceback
|
import traceback
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
@ -370,20 +391,16 @@ class RedditScraper:
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
"""Close browser resources."""
|
"""Close browser resources."""
|
||||||
if self._initialized and self.driver:
|
with self._lock:
|
||||||
try:
|
if self._initialized and self.driver:
|
||||||
self.driver.quit()
|
try:
|
||||||
except:
|
self.driver.quit()
|
||||||
pass
|
except:
|
||||||
self._initialized = False
|
pass
|
||||||
|
self._initialized = False
|
||||||
|
|
||||||
|
|
||||||
# Singleton pattern for scraper instance
|
# Create new scraper instance per request for reliability
|
||||||
_scraper_instance = None
|
|
||||||
|
|
||||||
def get_scraper():
|
def get_scraper():
|
||||||
"""Get singleton scraper instance."""
|
"""Get fresh scraper instance."""
|
||||||
global _scraper_instance
|
return RedditScraper()
|
||||||
if _scraper_instance is None:
|
|
||||||
_scraper_instance = RedditScraper()
|
|
||||||
return _scraper_instance
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue