#!/bin/bash
#
# Comprehensive test suite for the PDF text extraction daemon.
#
# Calls the daemon's /extract endpoint with several PDF types hosted on
# sample-files.com, then checks error handling and the output_file option,
# and finally prints a pass/fail summary.
#
# Environment:
#   BASE_URL  Base URL of the daemon (default: http://localhost:8000).
# Requires:   curl, python3 (used to parse the JSON responses).
# Exit code:  0 when every test passed, 1 when at least one test failed.

set -u

BASE_URL="${BASE_URL:-http://localhost:8000}"

# File name sent as output_file, and the path where this suite expects the
# daemon to create it.
readonly OUTPUT_FILE_NAME="test_output.txt"
readonly OUTPUT_FILE_PATH="/tmp/${OUTPUT_FILE_NAME}"

# Test cases, one per entry: name|url|expected size|expected pages|description
declare -a TESTS=(
  "basic-text|https://sample-files.com/downloads/documents/pdf/basic-text.pdf|72.9 KB|1 page|Simple text document"
  "image-doc|https://sample-files.com/downloads/documents/pdf/image-doc.pdf|7.97 MB|6 pages|Image-heavy PDF"
  "fillable-form|https://sample-files.com/downloads/documents/pdf/fillable-form.pdf|52.7 KB|2 pages|Interactive form"
  "dev-example|https://sample-files.com/downloads/documents/pdf/dev-example.pdf|690 KB|6 pages|Developer example"
)

PASS=0
FAIL=0

# Count one passed test. Plain assignment is used instead of ((PASS++)),
# whose exit status is 1 whenever the counter was 0.
record_pass() {
  PASS=$((PASS + 1))
}

# Count one failed test.
record_fail() {
  FAIL=$((FAIL + 1))
}

# Print a three-line section banner.
# Arguments: $1 - banner title
banner() {
  echo "=============================================="
  echo "$1"
  echo "=============================================="
}

# Print the current time in nanoseconds since the epoch.
now_ns() {
  local ns
  ns=$(date +%s%N)
  # Some date implementations do not support %N and emit non-digits for it;
  # fall back to whole-second resolution in that case.
  if [[ ! $ns =~ ^[0-9]+$ ]]; then
    ns="$(date +%s)000000000"
  fi
  printf '%s\n' "$ns"
}

# Issue GET $BASE_URL/extract and print the response body.
# Arguments: one or more "name=value" query parameters; every value is
#            URL-encoded by curl, so URLs containing '&', '?', spaces etc.
#            reach the daemon intact.
# Globals:   BASE_URL (read)
request_extract() {
  local -a args=(-s --connect-timeout 5 -G "$BASE_URL/extract")
  local param
  for param in "$@"; do
    args+=(--data-urlencode "$param")
  done
  curl "${args[@]}"
}

# Print one top-level field of a JSON object.
# Arguments: $1 - JSON text
#            $2 - key to look up
#            $3 - text printed when the key is absent
#            $4 - optional: round a present value to this many digits
# Outputs:   the value as formatted by Python's print() (so JSON true is
#            printed as "True"); nothing when the JSON cannot be parsed.
# Returns:   non-zero when nothing was printed.
json_field() {
  local json=$1 key=$2 default=$3 digits=${4:-}
  # An empty response can never parse; skip spawning python3.
  [[ -n $json ]] || return 1
  # Parse errors are expected for non-JSON responses and are reported by the
  # caller as a failed test, so python's traceback is deliberately discarded.
  python3 -c '
import json, sys

key, default, digits = sys.argv[1:4]
missing = object()
value = json.load(sys.stdin).get(key, missing)
if value is missing:
    value = default
elif digits:
    value = round(value, int(digits))
print(value)
' "$key" "$default" "$digits" <<<"$json" 2>/dev/null
}

# Run one extraction test and record its outcome.
# Arguments: $1 - test spec "name|url|size|pages|description"
# Globals:   PASS / FAIL (written through record_pass / record_fail)
run_extraction_test() {
  local name url size pages desc
  IFS='|' read -r name url size pages desc <<<"$1"

  echo "----------------------------------------------"
  echo "Test: $name"
  echo "URL: $url"
  echo "Expected: $size, $pages ($desc)"
  echo "----------------------------------------------"

  local started finished elapsed_ms result
  started=$(now_ns)
  result=$(request_extract "url=$url")
  finished=$(now_ns)
  elapsed_ms=$(((finished - started) / 1000000))

  local success extracted_pages file_size extraction_time message
  success=$(json_field "$result" success False)
  extracted_pages=$(json_field "$result" pages 0)
  file_size=$(json_field "$result" file_size_kb 0)
  extraction_time=$(json_field "$result" extraction_time_ms 0 2)
  message=$(json_field "$result" message N/A)

  echo ""
  echo "Results:"
  echo " Status: $success"
  echo " Pages extracted: $extracted_pages"
  echo " File size: ${file_size} KB"
  echo " Extraction time: ${extraction_time}ms"
  echo " Total round-trip: ${elapsed_ms}ms"
  echo " Message: $message"

  # A test passes when the daemon reported success and the response parsed
  # far enough to yield a page value.
  if [[ $success == "True" && -n $extracted_pages ]]; then
    echo ""
    echo "✓ PASS"
    record_pass
  else
    echo ""
    echo "✗ FAIL: $result"
    record_fail
  fi
  echo ""
}

# Check that the daemon rejects a malformed URL and reports a missing PDF.
# Globals: PASS / FAIL (written through record_pass / record_fail)
run_error_tests() {
  local result

  echo "Test: Invalid URL format (no http://)"
  result=$(request_extract "url=not-a-url.pdf")
  if grep -q "must start with" <<<"$result"; then
    echo "✓ PASS (Correctly rejected invalid URL)"
    record_pass
  else
    echo "✗ FAIL (Should reject without http://)"
    record_fail
  fi
  echo ""

  echo "Test: Non-existent PDF URL"
  result=$(request_extract "url=https://example.com/nonexistent.pdf")
  if grep -q "404" <<<"$result"; then
    echo "✓ PASS (Correctly returned 404)"
    record_pass
  else
    echo "✗ FAIL (Should return 404)"
    record_fail
  fi
  echo ""
}

# Check that passing output_file makes the daemon create that file.
# Globals: PASS / FAIL (written through record_pass / record_fail)
run_output_file_test() {
  echo "Test: Extract with custom output file"

  # Remove any file left by a previous run; otherwise the existence check
  # below would pass even if the daemon wrote nothing this time.
  rm -f -- "$OUTPUT_FILE_PATH" ||
    echo "warning: could not remove stale $OUTPUT_FILE_PATH; result may be unreliable" >&2

  request_extract \
    "url=https://sample-files.com/downloads/documents/pdf/basic-text.pdf" \
    "output_file=$OUTPUT_FILE_NAME" >/dev/null

  if [[ -f $OUTPUT_FILE_PATH ]]; then
    echo "✓ PASS (Output file created)"
    # ls -lh is used only to display a human-readable size of one known file.
    echo " File size: $(ls -lh "$OUTPUT_FILE_PATH" | awk '{print $5}')"
    record_pass
  else
    echo "✗ FAIL (Output file not found)"
    record_fail
  fi
  echo ""
}

# Print the totals.
# Returns: 0 when no test failed, 1 otherwise.
print_summary() {
  banner "TEST SUMMARY"
  echo "Passed: $PASS"
  echo "Failed: $FAIL"
  echo "Total: $((PASS + FAIL))"
  echo ""
  if ((FAIL == 0)); then
    echo "✓ ALL TESTS PASSED!"
    return 0
  fi
  echo "✗ Some tests failed. Review output above."
  return 1
}

# Run every test group in order and report the overall result.
# Returns: the status of print_summary.
main() {
  banner "COMPREHENSIVE PDF EXTRACTOR TEST SUITE"
  echo ""

  local spec
  for spec in "${TESTS[@]}"; do
    run_extraction_test "$spec"
  done

  banner "ERROR HANDLING TESTS"
  echo ""
  run_error_tests

  banner "OUTPUT FILE TEST"
  echo ""
  run_output_file_test

  print_summary
}

# Must stay the last command: the script's exit status is main's return value.
main "$@"