"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com.""" import time from typing import Optional, Dict, Any, List from selenium import webdriver from selenium.webdriver.firefox.options import Options from selenium.webdriver.firefox.service import Service class RedditScraper: """Scrapes OLD Reddit pages (old.reddit.com) using a headless Firefox browser.""" def __init__(self): self.driver = None self._initialized = False def _ensure_browser(self): """Ensure Firefox is running in headless mode.""" if not self._initialized: options = Options() options.add_argument('--headless') options.set_preference('dom.webdriver.enabled', False) # Custom user agent to appear more human-like options.set_preference('general.useragent.override', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36') # Use geckodriver from snap service = Service(executable_path='/snap/bin/geckodriver') self.driver = webdriver.Firefox(service=service, options=options) # Anti-detection scripts self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })") self._initialized = True def scrape_subreddit_top( self, subreddit: str, limit: int = 10, time_range: str = "week", depth: int = 3, include_comments: bool = True ) -> Dict[str, Any]: """ Scrape top posts from a subreddit using Selenium on old.reddit.com. Args: subreddit: Name of the subreddit (without 'r/') limit: Number of top posts to retrieve time_range: Time filter ('hour', 'day', 'week', 'month', 'year', 'all') depth: Maximum comment nesting depth include_comments: Whether to scrape comments Returns: Dict containing scraped data or error information """ self._ensure_browser() try: # Navigate to OLD Reddit for cleaner DOM structure url = f"https://old.reddit.com/r/{subreddit}/top/?t={time_range}" self.driver.get(url) # Wait for content to load time.sleep(4) # Extract post data using old.reddit.com specific selectors (tested and working!) posts_data = self.driver.execute_script(''' const posts = []; const postElements = document.querySelectorAll('.thing'); for (const el of Array.from(postElements)) { if (posts.length >= 10) break; // Title link const titleLink = el.querySelector('a.title.may-blank'); if (!titleLink) continue; const title = titleLink.textContent.trim(); // Skip ads and very short titles if (title.length < 3 || title.includes('Microsoft') || title.startsWith('r/')) continue; // Score let score = 0; try { const scoreEl = el.querySelector('.score.unvoted'); if (scoreEl) { score = parseInt(scoreEl.textContent.replace(/[^0-9]/g, '')) || 0; } } catch(e) {} // Author - try multiple selector patterns let author = null; try { const authorLink = el.querySelector('.midcol .author, .author'); if (authorLink && authorLink.textContent.trim()) { author = authorLink.textContent.trim(); } } catch(e) {} // URL - clean tracking params let url = titleLink.href || ''; try { const cleanUrl = new URL(url); ['utm_source', 'utm_medium', 'utm_campaign'].forEach(param => { cleanUrl.searchParams.delete(param); }); url = cleanUrl.toString(); } catch(e) {} posts.push({ title: title, author: author, score: score, url: url }); } return posts; ''') # Build result structure posts = [] for post in posts_data: post_obj = { "title": post['title'], "author": post['author'] or None, "score": post['score'], "created_utc": None, # Not easily accessible via DOM scraping "url": post['url'].replace('old.reddit.com', 'www.reddit.com'), "permalink": post['url'].replace('https://old.reddit.com', ''), "comments": [] if not include_comments else self._scrape_post_comments(post['url']) } posts.append(post_obj) return { "subreddit": subreddit, "time_range": time_range, "limit": len(posts), "posts_count": len(posts), "data": posts } except Exception as e: import traceback print(f"Error during scraping: {e}") print(traceback.format_exc()) return {"Error": f"Unexpected error during scraping: {str(e)}"} def _scrape_post_comments(self, post_url: str) -> List[Dict[str, Any]]: """Scrape comments from a specific post using Selenium.""" try: self.driver.get(post_url) time.sleep(2) # Extract comments using old.reddit.com structure rawComments = self.driver.execute_script(''' const comments = []; // On old Reddit, find comment containers const candidates = document.querySelectorAll('.usertext-body, .md, div.comment'); for (const el of Array.from(candidates)) { if (comments.length >= 10) break; const text = el.textContent.trim(); // Skip empty or very short content, and UI elements const lowerText = text.toLowerCase(); if (!text || text.length < 20 || text.includes('open menu') || text.includes('reddit home')) continue; comments.push({ author: 'unknown', body: text, score: null, created_utc: null }); } return comments.slice(0, 10); ''') # Clean up comment bodies for comment in rawComments: comment['body'] = self._clean_text(comment.get('body', '')) return rawComments[:10] # Limit to 10 comments per post except Exception as e: print(f"Comment scraping error: {e}") import traceback traceback.print_exc() return [] def _clean_text(self, text: str) -> str: """Clean and normalize text content.""" if not text: return "" lines = text.split("\n") cleaned = [line.strip() for line in lines if line.strip()] return " ".join(cleaned) def close(self): """Close browser resources.""" if self._initialized and self.driver: try: self.driver.quit() except: pass self._initialized = False # Singleton pattern for scraper instance _scraper_instance = None def get_scraper(): """Get singleton scraper instance.""" global _scraper_instance if _scraper_instance is None: _scraper_instance = RedditScraper() return _scraper_instance