From f961b719925c298052d68798dfd9327578b6ed77 Mon Sep 17 00:00:00 2001
From: Marvin
Date: Sat, 14 Mar 2026 17:57:08 -0300
Subject: [PATCH] Reddit Scraper with Selenium browser automation

- Switched from API scraping to Selenium + Firefox headless browser
- Uses old.reddit.com for cleaner DOM structure and better reliability
- FastAPI server with CLI port selection (--port flag)
- Custom error format: {"Error": "The boat went on fire..."}
- Updated README with current implementation details
---
 .gitignore                   |  23 ++
 README.md                    | 233 +++++++++++++++++++++++++++++++++++
 config.py                    |  26 ++
 main.py                      | 204 ++++++++++++++++++++++++++
 models.py                    |  34 +++
 reddit-scraper.py            |  37 ++++
 requirements.txt             |   5 +
 scraper/__init__.py          |   5 +
 scraper/selenium_scrapers.py | 228 ++++++++++++++++++++++++++++++++++
 test_api.py                  |  63 ++++++++++
 10 files changed, 858 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100644 config.py
 create mode 100755 main.py
 create mode 100644 models.py
 create mode 100755 reddit-scraper.py
 create mode 100644 requirements.txt
 create mode 100644 scraper/__init__.py
 create mode 100644 scraper/selenium_scrapers.py
 create mode 100644 test_api.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0abf4b7
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,23 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+venv/
+env/
+.env
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Session files and docs from this session
+session
+reddit_scraper-first_session.md
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..fc11ac8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,233 @@
+# Reddit Super Duper Scraper šŸ”
+
+A powerful tool to scrape public Reddit data using **Selenium browser automation** - no authentication required. Accessible via local network only.
+
+## Features
+
+- **No Authentication Required**: Uses Selenium + Firefox to scrape old.reddit.com directly
+- **Browser-Based Scraping**: Avoids API rate limits by simulating real browser behavior
+- **Flexible Scraping**: Extract posts from subreddits with optional nested comments
+- **Configurable Depth**: Control comment nesting level (1-10 levels)
+- **Local Network Only**: Runs on your local network, no security overhead
+- **FastAPI Backend**: Automatic documentation at `/docs`
+
+## Installation
+
+1. Install Python dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+2. Ensure geckodriver is installed (should be auto-detected):
+```bash
+# Check if available
+which geckodriver
+# Expected path on most systems: /snap/bin/geckodriver or /usr/local/bin/geckodriver
+```
+
+3. Run the scraper:
+```bash
+# Using the main script with custom port
+python reddit-scraper.py --port 6969
+
+# Or directly
+python main.py --port 8000
+```
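+
+4. Verify the server is up - a minimal smoke test using `requests`, assuming the default port 8000:
+```python
+import requests
+
+# The /health endpoint takes no parameters and returns a small JSON status.
+print(requests.get("http://localhost:8000/health").json())
+# Expected: {"status": "healthy", "message": "The ship is sailing smoothly"}
+```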
+
+## Usage
+
+### API Endpoints
+
+#### 1. Scrape Subreddit Top Posts (Recommended)
+```bash
+GET /scrape/subreddit/{subreddit}?limit=10&time_range=week&depth=1&include_comments=true
+```
+
+**Parameters:**
+- `limit`: Number of posts to retrieve (1-100, default: 10)
+- `time_range`: Time filter (`hour`, `day`, `week`, `month`, `year`, `all`)
+- `depth`: Comment nesting depth (1-10, default: **1**)
+- `include_comments`: Enable comment scraping (`true`/`false`, default: `true`)
+
+**Example - Posts Only (Fastest):**
+```bash
+curl "http://localhost:8000/scrape/subreddit/python?limit=5&include_comments=false"
+```
+
+**Response:**
+```json
+{
+  "subreddit": "python",
+  "time_range": "week",
+  "limit": 5,
+  "posts_count": 5,
+  "data": [
+    {
+      "title": "Python tips and tricks...",
+      "author": "pythonista",
+      "score": 1234,
+      "created_utc": 1709827200,
+      "url": "https://old.reddit.com/r/python/comments/...",
+      "permalink": "/r/python/comments/..."
+    }
+  ]
+}
+```
+
+**Example - With Comments:**
+```bash
+curl "http://localhost:8000/scrape/subreddit/python?limit=3&include_comments=true"
+```
+
+#### 2. Scrape Specific Post (Browser-Based)
+```bash
+GET /scrape/post/{post_id}?depth=3
+```
+
+**Example:**
+```bash
+curl "http://localhost:8000/scrape/post/1rt20n2"
+```
+
+#### 3. Custom Scraping (Flexible)
+```bash
+POST /scrape/custom
+Content-Type: application/json
+
+{
+  "type": "subreddit",
+  "target": "programming",
+  "limit": 10,
+  "time_range": "week",
+  "depth": 3,
+  "include_comments": true
+}
+```
+
+#### 4. Health Check
+```bash
+GET /health
+```
+
+**Response:**
+```json
+{
+  "status": "healthy",
+  "message": "The ship is sailing smoothly"
+}
+```
+
+### API Documentation
+
+Once running, visit:
+- **Swagger UI**: http://localhost:{port}/docs
+- **ReDoc**: http://localhost:{port}/redoc
+
+## CLI Options
+
+```bash
+python reddit-scraper.py --help
+
+Usage: reddit-scraper.py [-h] [--port PORT]
+
+Options:
+  -h, --help   show this help message and exit
+  --port PORT  Port to run the server on (default: 8000)
+```
+
+## Output Format
+
+### Post Structure
+```json
+{
+  "title": "...",
+  "author": "...",
+  "score": ...,
+  "created_utc": ...,
+  "url": "...",
+  "permalink": "/r/{sub}/comments/...",
+  "comments": [...]  // Empty if include_comments=false or no comments available
+}
+```
+
+### Comment Structure (Nested)
+```json
+{
+  "author": "...",
+  "body": "...",
+  "score": ...,
+  "created_utc": ...,
+  "replies": [
+    {
+      "author": "...",
+      "body": "...",
+      "score": ...,
+      "replies": []  // Can nest up to depth levels
+    }
+  ]
+}
+```
+
+## Error Handling
+
+All errors return a consistent format:
+```json
+{
+  "Error": "The boat went on fire (message)"
+}
+```
+
+Common error scenarios:
+- **Browser Not Available**: geckodriver or Firefox not installed
+- **Invalid Subreddit**: Post not found or subreddit doesn't exist
+- **Network Error**: Connection issues during scraping
+- **Rate Limited**: Reddit temporarily blocked the browser session
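+
+Because failures come back as a JSON body rather than only an HTTP status code, clients should check for the `"Error"` key. A minimal sketch (endpoint and port assume the defaults above):
+```python
+import requests
+
+resp = requests.get(
+    "http://localhost:8000/scrape/subreddit/python",
+    params={"limit": 5, "include_comments": "false"},
+)
+data = resp.json()
+if "Error" in data:
+    # e.g. {"Error": "The boat went on fire (Not Found)"}
+    print("Scrape failed:", data["Error"])
+else:
+    print("Got", data["posts_count"], "posts")
+```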
+
+## How It Works
+
+This scraper uses **Selenium with headless Firefox** to scrape old.reddit.com directly:
+
+1. Launches a headless Firefox browser via geckodriver
+2. Navigates to `https://old.reddit.com/r/{subreddit}/top/`
+3. Uses JavaScript evaluation to extract post/comment data from the DOM
+4. Returns structured JSON response
+
+### Why old.reddit.com?
+
+- Cleaner, more stable DOM structure than the main Reddit site
+- Less aggressive bot detection
+- Simpler HTML that's easier to parse reliably
+
+## Limitations
+
+### Comment Extraction Notes
+Reddit's UI includes navigation elements (like "more comments" expanders) that may appear in scraped data. The scraper attempts to filter these, but some edge cases remain with deeply nested or collapsed comment threads. The current extractor also returns comments as a flat list: the `depth` parameter is accepted, but reply nesting is not yet reconstructed from the DOM.
+
+### Browser Dependencies
+- Requires Firefox browser installed
+- Requires geckodriver for Selenium automation
+- Both must be compatible versions (a quick check is sketched below)
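+
+A minimal pre-flight sketch (assumes both binaries are on `PATH`; install paths vary by system):
+```python
+import shutil
+import subprocess
+
+# Look up each binary on PATH and print its version string.
+for tool in ("geckodriver", "firefox"):
+    path = shutil.which(tool)
+    if path is None:
+        print(f"{tool}: NOT FOUND on PATH")
+        continue
+    out = subprocess.run([path, "--version"], capture_output=True, text=True)
+    print(f"{tool}: {out.stdout.strip()}")
+```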
+
+## Configuration
+
+Edit `config.py` to customize:
+- Reddit URL endpoints (uses old.reddit.com by default)
+- Default limits and depths
+- Rate limiting delays
+- Maximum depth constraints
+- Server defaults (host, port)
+
+```python
+# config.py examples
+REDDIT_SUBREDDIT_TOP_URL = "https://old.reddit.com/r/{}/top.json"
+DEFAULT_DEPTH = 3
+MAX_DEPTH = 10
+DEFAULT_PORT = 8000
+```
+
+## Security Note
+
+This service is designed for local network use only. No authentication tokens are required on the API itself, so ensure the server is not exposed to public networks without additional security measures.
+
+## License
+
+MIT License - Feel free to modify and use as needed!
diff --git a/config.py b/config.py
new file mode 100644
index 0000000..7558d5d
--- /dev/null
+++ b/config.py
@@ -0,0 +1,26 @@
+"""Reddit Super Duper Scraper Configuration."""
+
+
+class Config:
+    """Application configuration constants."""
+
+    # Reddit API base URLs (using old.reddit for better reliability)
+    REDDIT_SUBREDDIT_TOP_URL = "https://old.reddit.com/r/{}/top.json"
+    REDDIT_POST_COMMENTS_URL = "https://old.reddit.com/r/{}/comments/{}/.json"
+
+    # Pushshift.io API (fallback for comments without rate limits)
+    PUSHSHIFT_SEARCH_URL = "https://api.pushshift.io/reddit/search/submission/?ids={}&size=10"
+    PUSHSHIFT_COMMENT_URL = "https://api.pushshift.io/reddit/search/comment/?link_id=t3_{}&size=100"
+
+    # Default settings
+    DEFAULT_LIMIT = 10
+    DEFAULT_DEPTH = 3
+    MAX_DEPTH = 10
+
+    # Rate limiting (anonymous requests)
+    RATE_LIMIT_DELAY = 2.0  # seconds between requests
+    MAX_RETRIES = 3
+
+    # API server defaults
+    DEFAULT_HOST = "0.0.0.0"
+    DEFAULT_PORT = 8000
diff --git a/main.py b/main.py
new file mode 100755
index 0000000..5654340
--- /dev/null
+++ b/main.py
@@ -0,0 +1,204 @@
+"""FastAPI application entry point with CLI argument parsing."""
+
+import argparse
+from fastapi import FastAPI, HTTPException, Query, Request
+from fastapi.responses import JSONResponse
+from contextlib import asynccontextmanager
+
+from config import Config
+from scraper.selenium_scrapers import get_scraper
+from models import SubredditQuery, PostQuery, CustomQuery
+
+
+# Global scraper instance and lifespan management
+scraper = None
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """Initialize and cleanup resources."""
+    global scraper
+    scraper = get_scraper()
+    yield
+    # Shut down the shared headless browser on exit
+    if scraper is not None:
+        scraper.close()
+
+
+app = FastAPI(
+    title="Reddit Super Duper Scraper",
+    description="A powerful tool to scrape public Reddit data without authentication. Accessible via local network only.",
+    version="1.0.0",
+    lifespan=lifespan
+)
+
+
+@app.exception_handler(HTTPException)
+async def custom_http_exception_handler(request: Request, exc: HTTPException):
+    """Custom HTTP exception handler with friendly error messages."""
+    error_messages = {
+        400: "The boat went on fire (Bad Request)",
+        404: "The boat went on fire (Not Found)",
+        429: "The boat went on fire (Too Many Requests - Rate Limited)",
+        500: "The boat went on fire (Internal Server Error)"
+    }
+
+    message = error_messages.get(exc.status_code, f"The boat went on fire ({exc.detail})")
+    return JSONResponse(
+        status_code=exc.status_code,
+        content={"Error": message}
+    )
+
+
+@app.exception_handler(Exception)
+async def general_exception_handler(request: Request, exc: Exception):
+    """Handle unexpected exceptions with user-friendly messages."""
+    return JSONResponse(
+        status_code=500,
+        content={"Error": "The boat went on fire (Unexpected Error)"}
+    )
+
+
+@app.get("/health")
+async def health_check():
+    """Basic health check endpoint."""
+    return {"status": "healthy", "message": "The ship is sailing smoothly"}
+
+
+@app.get("/scrape/subreddit/{subreddit}")
+async def scrape_subreddit(
+    subreddit: str,
+    limit: int = Query(default=10, ge=1, le=100),
+    time_range: str = Query(default="week"),
+    depth: int = Query(default=1, ge=1, le=10),
+    include_comments: bool = Query(default=True)
+):
+    """
+    Scrape top posts from a subreddit with nested comments.
+
+    - **subreddit**: Name of the subreddit (without 'r/')
+    - **limit**: Number of top posts to retrieve (1-100)
+    - **time_range**: Time filter ('hour', 'day', 'week', 'month', 'year', 'all')
+    - **depth**: Maximum comment nesting depth (1-10)
+    - **include_comments**: Whether to scrape comments (True/False, default: True)
+    """
+    result = scraper.scrape_subreddit_top(
+        subreddit=subreddit,
+        limit=limit,
+        time_range=time_range,
+        depth=depth,
+        include_comments=include_comments  # honor the query flag; False skips comment scraping
+    )
+
+    if "Error" in result:
+        raise HTTPException(status_code=500, detail=str(result["Error"]))
+
+    return result
+
+
+@app.get("/scrape/post/{post_id}")
+async def scrape_post(post_id: str, depth: int = Query(default=3, ge=1, le=10)):
+    """
+    Scrape all comments from a specific Reddit post with nested replies.
+
+    - **post_id**: Reddit post ID (without 't3_')
+    - **depth**: Maximum comment nesting depth (1-10)
+    """
+    result = scraper.scrape_post_comments(post_id=post_id, depth=depth)
+
+    if "Error" in result:
+        raise HTTPException(status_code=500, detail=str(result["Error"]))
+
+    return result
+
+
+@app.post("/scrape/custom")
+async def scrape_custom(query: CustomQuery):
+    """
+    Flexible endpoint for custom scraping queries.
+
+    - **type**: Type of scrape ('subreddit' or 'post')
+    - **target**: Subreddit name or post ID
+    - **limit**: Number of posts (for subreddit type)
+    - **time_range**: Time filter (for subreddit type)
+    - **depth**: Maximum comment nesting depth
+    - **include_comments**: Whether to scrape comments (set False for faster results)
+    """
+    if query.type == "subreddit":
+        result = scraper.scrape_subreddit_top(
+            subreddit=query.target,
+            limit=query.limit,
+            time_range=query.time_range,
+            depth=query.depth,
+            include_comments=query.include_comments
+        )
+    elif query.type == "post":
+        if not query.include_comments:
+            # Just fetch post metadata without comments for faster response
+            result = scraper.scrape_post_comments(post_id=query.target, depth=0)
+            if "Error" not in result and "data" in result:
+                # Return an empty comments list since we're skipping them
+                result["data"] = []
+        else:
+            result = scraper.scrape_post_comments(
+                post_id=query.target,
+                depth=query.depth
+            )
+
+    if "Error" in result:
+        raise HTTPException(status_code=500, detail=str(result["Error"]))
+
+    return result
+
+
+@app.get("/")
+async def root():
+    """Root endpoint with API information."""
+    return {
+        "name": "Reddit Super Duper Scraper",
+        "version": "1.0.0",
+        "description": "Scrape public Reddit data without authentication",
+        "endpoints": {
+            "/scrape/subreddit/{subreddit}": "GET - Scrape top posts from a subreddit",
+            "/scrape/post/{post_id}": "GET - Scrape comments from a specific post",
+            "/scrape/custom": "POST - Flexible custom scraping query",
+            "/health": "GET - Health check endpoint"
+        },
+        "docs": "/docs",
+        "redoc": "/redoc"
+    }
+
+
+def parse_args():
+    """Parse command line arguments for server configuration."""
+    parser = argparse.ArgumentParser(
+        description="Reddit Super Duper Scraper - Scrape public Reddit data via local API"
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=Config.DEFAULT_PORT,
+        help=f"Port to run the server on (default: {Config.DEFAULT_PORT})"
+    )
+    parser.add_argument(
+        "--host",
+        type=str,
+        default=Config.DEFAULT_HOST,
+        help=f"Host to bind to (default: {Config.DEFAULT_HOST})"
+    )
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    import uvicorn
+
+    print(f"šŸš€ Starting Reddit Super Duper Scraper on http://{args.host}:{args.port}")
+    print(f"šŸ“– API documentation available at http://localhost:{args.port}/docs")
+    print("šŸ’” Accessible via local network only - no authentication required")
+
+    uvicorn.run(
+        "main:app",
+        host=args.host,
+        port=args.port,
+        reload=False
+    )
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..8eac060
--- /dev/null
+++ b/models.py
@@ -0,0 +1,34 @@
+"""Pydantic models for API request/response validation."""
+
+from pydantic import BaseModel, Field
+from typing import Optional, Dict, Any
+
+
+class SubredditQuery(BaseModel):
+    """Validation model for subreddit scraping queries."""
+    subreddit: str = Field(..., min_length=1, description="Subreddit name (without 'r/')")
+    limit: int = Field(default=10, ge=1, le=100, description="Number of posts to retrieve")
+    time_range: str = Field(default="week", pattern=r"^(hour|day|week|month|year|all)$", description="Time filter for top posts")
+    depth: int = Field(default=3, ge=1, le=10, description="Maximum comment nesting depth")
+    include_comments: bool = Field(default=True, description="Whether to scrape comments (set False for faster results)")
+
+
+class PostQuery(BaseModel):
+    """Validation model for post scraping queries."""
+    post_id: str = Field(..., min_length=6, max_length=10, description="Reddit post ID")
+    depth: int = Field(default=3, ge=1, le=10, description="Maximum comment nesting depth")
+
+
+class CustomQuery(BaseModel):
+    """Validation model for custom scraping queries."""
+    type: str = Field(..., pattern=r"^(subreddit|post)$", description="Type of scrape to perform")
+    target: str = Field(..., min_length=1, description="Target subreddit or post ID")
+    limit: int = Field(default=10, ge=1, le=100, description="Number of posts (for subreddit type)")
+    time_range: str = Field(default="week", pattern=r"^(hour|day|week|month|year|all)$", description="Time filter (for subreddit type)")
+    depth: int = Field(default=3, ge=1, le=10, description="Maximum comment nesting depth")
+    include_comments: bool = Field(default=True, description="Whether to scrape comments (set False for faster results)")
+
+
+class ErrorResponse(BaseModel):
+    """Standard error response model."""
+    Error: str
diff --git a/reddit-scraper.py b/reddit-scraper.py
new file mode 100755
index 0000000..419b9b4
--- /dev/null
+++ b/reddit-scraper.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+"""reddit-scraper.py - Launch the Reddit Super Duper Scraper API server."""
+
+import argparse
+from config import Config
+
+
+def main():
+    """Parse arguments and start the server."""
+    parser = argparse.ArgumentParser(
+        description="Reddit Super Duper Scraper - Scrape public Reddit data via local API"
+    )
+    parser.add_argument(
+        "--port",
+        type=int,
+        default=Config.DEFAULT_PORT,
+        help=f"Port to run the server on (default: {Config.DEFAULT_PORT})"
+    )
+
+    args = parser.parse_args()
+
+    import uvicorn
+
+    print(f"šŸš€ Starting Reddit Super Duper Scraper on http://0.0.0.0:{args.port}")
+    print(f"šŸ“– API documentation available at http://localhost:{args.port}/docs")
+    print("šŸ’” Accessible via local network only - no authentication required")
+
+    uvicorn.run(
+        "main:app",
+        host=Config.DEFAULT_HOST,
+        port=args.port,
+        reload=False
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..b24f413
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+fastapi==0.115.6
+uvicorn==0.34.0
+requests==2.32.3
+pydantic==2.10.4
+selenium>=4.0.0
diff --git a/scraper/__init__.py b/scraper/__init__.py
new file mode 100644
index 0000000..5df7bc2
--- /dev/null
+++ b/scraper/__init__.py
@@ -0,0 +1,5 @@
+"""Reddit scraping module using Selenium for page-based scraping."""
+
+from .selenium_scrapers import get_scraper, RedditScraper
+
+__all__ = ["get_scraper", "RedditScraper"]
diff --git a/scraper/selenium_scrapers.py b/scraper/selenium_scrapers.py
new file mode 100644
index 0000000..b30b477
--- /dev/null
+++ b/scraper/selenium_scrapers.py
@@ -0,0 +1,228 @@
+"""Reddit data scraping using Selenium for page-based scraping with old.reddit.com."""
+
+import shutil
+import time
+from typing import Optional, Dict, Any, List
+from selenium import webdriver
+from selenium.webdriver.firefox.options import Options
+from selenium.webdriver.firefox.service import Service
+
+
+class RedditScraper:
+    """Scrapes OLD Reddit pages (old.reddit.com) using a headless Firefox browser."""
+
+    def __init__(self):
+        self.driver = None
+        self._initialized = False
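+
+    # NOTE: the driver is created lazily on first use and then shared by every
+    # request, so the first scrape pays the browser start-up cost. The user
+    # agent override below deliberately advertises Chrome even though the
+    # actual browser is Firefox - a simple anti-bot heuristic.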
+    def _ensure_browser(self):
+        """Ensure Firefox is running in headless mode."""
+        if not self._initialized:
+            options = Options()
+            options.add_argument('--headless')
+            options.set_preference('dom.webdriver.enabled', False)
+
+            # Custom user agent to appear more human-like
+            options.set_preference('general.useragent.override',
+                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36')
+
+            # Prefer geckodriver from PATH, falling back to the snap location
+            driver_path = shutil.which('geckodriver') or '/snap/bin/geckodriver'
+            service = Service(executable_path=driver_path)
+            self.driver = webdriver.Firefox(service=service, options=options)
+
+            # Anti-detection scripts
+            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', { value: false })")
+            self._initialized = True
+
+    def scrape_subreddit_top(
+        self,
+        subreddit: str,
+        limit: int = 10,
+        time_range: str = "week",
+        depth: int = 3,
+        include_comments: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Scrape top posts from a subreddit using Selenium on old.reddit.com.
+
+        Args:
+            subreddit: Name of the subreddit (without 'r/')
+            limit: Number of top posts to retrieve
+            time_range: Time filter ('hour', 'day', 'week', 'month', 'year', 'all')
+            depth: Maximum comment nesting depth
+            include_comments: Whether to scrape comments
+
+        Returns:
+            Dict containing scraped data or error information
+        """
+        self._ensure_browser()
+
+        try:
+            # Navigate to OLD Reddit for cleaner DOM structure
+            url = f"https://old.reddit.com/r/{subreddit}/top/?t={time_range}"
+            self.driver.get(url)
+
+            # Wait for content to load
+            time.sleep(4)
+
+            # Extract post data using old.reddit.com specific selectors (tested and working!)
+            posts_data = self.driver.execute_script('''
+                const limit = arguments[0];  // requested post count, passed in from Python
+                const posts = [];
+
+                const postElements = document.querySelectorAll('.thing');
+
+                for (const el of Array.from(postElements)) {
+                    if (posts.length >= limit) break;
+
+                    // Title link
+                    const titleLink = el.querySelector('a.title.may-blank');
+                    if (!titleLink) continue;
+
+                    const title = titleLink.textContent.trim();
+
+                    // Skip promoted posts and very short titles
+                    if (title.length < 3 ||
+                        el.classList.contains('promoted') ||
+                        el.classList.contains('promotedlink') ||
+                        title.startsWith('r/')) continue;
+
+                    // Score
+                    let score = 0;
+                    try {
+                        const scoreEl = el.querySelector('.score.unvoted');
+                        if (scoreEl) {
+                            score = parseInt(scoreEl.textContent.replace(/[^0-9]/g, '')) || 0;
+                        }
+                    } catch(e) {}
+
+                    // Author - try multiple selector patterns
+                    let author = null;
+                    try {
+                        const authorLink = el.querySelector('.midcol .author, .author');
+                        if (authorLink && authorLink.textContent.trim()) {
+                            author = authorLink.textContent.trim();
+                        }
+                    } catch(e) {}
+
+                    // Permalink - for link posts the title href points at the
+                    // external URL, so take the comments path from the thing itself
+                    const permalink = el.getAttribute('data-permalink') || '';
+
+                    // URL - clean tracking params
+                    let url = titleLink.href || '';
+                    try {
+                        const cleanUrl = new URL(url);
+                        ['utm_source', 'utm_medium', 'utm_campaign'].forEach(param => {
+                            cleanUrl.searchParams.delete(param);
+                        });
+                        url = cleanUrl.toString();
+                    } catch(e) {}
+
+                    posts.push({
+                        title: title,
+                        author: author,
+                        score: score,
+                        url: url,
+                        permalink: permalink
+                    });
+                }
+
+                return posts;
+            ''', limit)
+
+            # Build result structure
+            posts = []
+            for post in posts_data:
+                permalink = post.get('permalink') or post['url'].replace('https://old.reddit.com', '')
+                # Scrape comments from the comments page, not the (possibly external) link URL
+                comments_url = f"https://old.reddit.com{permalink}" if permalink.startswith('/') else post['url']
+                post_obj = {
+                    "title": post['title'],
+                    "author": post['author'] or None,
+                    "score": post['score'],
+                    "created_utc": None,  # Not easily accessible via DOM scraping
+                    "url": post['url'].replace('old.reddit.com', 'www.reddit.com'),
+                    "permalink": permalink,
+                    "comments": self._scrape_post_comments(comments_url) if include_comments else []
+                }
+                posts.append(post_obj)
+
+            return {
+                "subreddit": subreddit,
+                "time_range": time_range,
+                "limit": limit,
+                "posts_count": len(posts),
+                "data": posts
+            }
+
+        except Exception as e:
+            import traceback
+            print(f"Error during scraping: {e}")
+            print(traceback.format_exc())
+            return {"Error": f"Unexpected error during scraping: {str(e)}"}
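+
+    def scrape_post_comments(self, post_id: str, depth: int = 3) -> Dict[str, Any]:
+        """Public wrapper used by main.py's /scrape/post and /scrape/custom endpoints.
+
+        A minimal sketch: main.py calls this method but only the private helper
+        below was defined. It builds the old.reddit comments URL from the post
+        ID (the URL pattern is an assumption) and returns the same flat comment
+        list as the rest of this scraper; depth=0 skips extraction entirely.
+        """
+        self._ensure_browser()
+        try:
+            post_url = f"https://old.reddit.com/comments/{post_id}/"
+            comments = self._scrape_post_comments(post_url) if depth > 0 else []
+            return {
+                "post_id": post_id,
+                "depth": depth,
+                "comments_count": len(comments),
+                "data": comments
+            }
+        except Exception as e:
+            return {"Error": f"Unexpected error during scraping: {str(e)}"}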
+
+    def _scrape_post_comments(self, post_url: str) -> List[Dict[str, Any]]:
+        """Scrape comments from a specific post using Selenium."""
+        try:
+            self.driver.get(post_url)
+            time.sleep(2)
+
+            # Extract comments using old.reddit.com structure
+            raw_comments = self.driver.execute_script('''
+                const comments = [];
+
+                // Each comment body on old Reddit is a .md block inside a
+                // .comment thing, so this matches every body exactly once
+                const bodies = document.querySelectorAll('.comment .md');
+
+                for (const el of Array.from(bodies)) {
+                    if (comments.length >= 10) break;
+
+                    const text = el.textContent.trim();
+
+                    // Skip empty or very short content, and UI elements
+                    const lowerText = text.toLowerCase();
+                    if (!text ||
+                        text.length < 20 ||
+                        lowerText.includes('open menu') ||
+                        lowerText.includes('reddit home')) continue;
+
+                    // Author lives in the enclosing comment's tagline
+                    let author = 'unknown';
+                    const thing = el.closest('.comment');
+                    if (thing) {
+                        const authorEl = thing.querySelector('.tagline .author');
+                        if (authorEl) author = authorEl.textContent.trim();
+                    }
+
+                    comments.push({
+                        author: author,
+                        body: text,
+                        score: null,
+                        created_utc: null
+                    });
+                }
+
+                return comments.slice(0, 10);
+            ''')
+
+            # Clean up comment bodies
+            for comment in raw_comments:
+                comment['body'] = self._clean_text(comment.get('body', ''))
+
+            return raw_comments[:10]  # Limit to 10 comments per post
+
+        except Exception as e:
+            print(f"Comment scraping error: {e}")
+            import traceback
+            traceback.print_exc()
+            return []
+
+    def _clean_text(self, text: str) -> str:
+        """Clean and normalize text content."""
+        if not text:
+            return ""
+        lines = text.split("\n")
+        cleaned = [line.strip() for line in lines if line.strip()]
+        return " ".join(cleaned)
+
+    def close(self):
+        """Close browser resources."""
+        if self._initialized and self.driver:
+            try:
+                self.driver.quit()
+            except Exception:
+                pass
+        self._initialized = False
+
+
+# Singleton pattern for scraper instance
+_scraper_instance = None
+
+
+def get_scraper():
+    """Get singleton scraper instance."""
+    global _scraper_instance
+    if _scraper_instance is None:
+        _scraper_instance = RedditScraper()
+    return _scraper_instance
diff --git a/test_api.py b/test_api.py
new file mode 100644
index 0000000..2842ac2
--- /dev/null
+++ b/test_api.py
@@ -0,0 +1,63 @@
+#!/usr/bin/env python3
+"""Quick test script for Reddit Scraper API."""
+
+import requests
+import json
+
+
+def test_health():
+    """Test health endpoint."""
+    response = requests.get("http://localhost:8000/health")
+    print("\n=== Health Check ===")
+    print(json.dumps(response.json(), indent=2))
+    assert response.status_code == 200, "Health check failed"
+
+
+def test_subreddit():
+    """Test subreddit scraping."""
+    url = "http://localhost:8000/scrape/subreddit/python"
+    params = {"limit": 1, "time_range": "week", "depth": 1}
+
+    response = requests.get(url, params=params)
+    print("\n=== Subreddit Scraping ===")
+    data = response.json()
+    if "Error" not in data:
+        print(f"Subreddit: {data['subreddit']}")
+        print(f"Posts found: {data['posts_count']}")
+        if data.get('data'):
+            post = data['data'][0]
+            print(f"Top post: {post['title'][:50]}...")
+            print(f"Score: {post['score']}")
+    else:
+        print(f"Error: {data['Error']}")
+
+
+def test_custom():
+    """Test custom scraping."""
+    url = "http://localhost:8000/scrape/custom"
+    payload = {
+        "type": "subreddit",
+        "target": "AskReddit",
+        "limit": 1,
+        "time_range": "day",
+        "depth": 1
+    }
+
+    response = requests.post(url, json=payload)
+    print("\n=== Custom Scraping ===")
+    data = response.json()
+    if "Error" not in data:
+        print(f"Subreddit: {data['subreddit']}")
+        print(f"Posts found: {data['posts_count']}")
+    else:
+        print(f"Error: {data['Error']}")
+
+
+if __name__ == "__main__":
+    try:
+        test_health()
+        test_subreddit()
+        test_custom()
+        print("\nāœ… All tests passed!")
+    except Exception as e:
+        print(f"\nāŒ Test failed: {e}")
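+
+# NOTE: run with `python test_api.py` while the API server is already listening
+# on localhost:8000 (e.g. start `python main.py --port 8000` in another terminal).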