pdf_tool/comprehensive_test.sh

132 lines
4.2 KiB
Bash
Executable File

#!/bin/bash
# Comprehensive Test Suite for PDF Text Extraction Daemon
# Tests various PDF types from sample-files.com
#
# Environment:
#   BASE_URL  Base URL of the daemon under test
#             (default: http://localhost:8000)

# Honour a BASE_URL supplied by the caller so the suite can be pointed at a
# daemon on another host/port; fall back to the local default otherwise.
BASE_URL="${BASE_URL:-http://localhost:8000}"

echo "=============================================="
echo "COMPREHENSIVE PDF EXTRACTOR TEST SUITE"
echo "=============================================="
echo ""

# Define test cases
# Each entry is one '|'-separated record:
#   name|url|expected size|expected pages|description
declare -a TESTS=(
  "basic-text|https://sample-files.com/downloads/documents/pdf/basic-text.pdf|72.9 KB|1 page|Simple text document"
  "image-doc|https://sample-files.com/downloads/documents/pdf/image-doc.pdf|7.97 MB|6 pages|Image-heavy PDF"
  "fillable-form|https://sample-files.com/downloads/documents/pdf/fillable-form.pdf|52.7 KB|2 pages|Interactive form"
  "dev-example|https://sample-files.com/downloads/documents/pdf/dev-example.pdf|690 KB|6 pages|Developer example"
)

# Running tallies of passed and failed checks.
PASS=0
FAIL=0
#######################################
# Print the current wall-clock time in milliseconds.
# `date +%s%N` yields nanoseconds where %N is supported; where it is not,
# the output is not purely numeric and would break shell arithmetic, so
# fall back to whole-second resolution in that case.
# Outputs: integer milliseconds on stdout
#######################################
now_ms() {
  local stamp
  stamp=$(date +%s%N)
  if [[ "$stamp" =~ ^[0-9]+$ ]]; then
    echo $(( stamp / 1000000 ))
  else
    echo $(( $(date +%s) * 1000 ))
  fi
}

#######################################
# Print one value derived from a JSON document.
# Arguments: $1 - JSON text
#            $2 - Python expression, evaluated with the parsed document
#                 bound to `d` (only ever called with fixed expressions)
# Outputs:   the printed value on stdout; nothing when the text is not
#            valid JSON or the expression raises. Python's stderr is
#            discarded on purpose: callers treat an empty value as failure.
#######################################
json_field() {
  printf '%s' "$1" \
    | python3 -c "import sys,json; d=json.load(sys.stdin); print($2)" 2>/dev/null
}

#######################################
# Run one extraction test case against the daemon and record the outcome.
# Globals:   BASE_URL (read), PASS / FAIL (incremented)
# Arguments: $1 - record "name|url|expected size|expected pages|description"
# Outputs:   human-readable report on stdout
#######################################
run_extraction_test() {
  local name url size pages desc
  IFS='|' read -r name url size pages desc <<< "$1"

  echo "----------------------------------------------"
  echo "Test: $name"
  echo "URL: $url"
  echo "Expected: $size, $pages ($desc)"
  echo "----------------------------------------------"

  local start_ms end_ms elapsed_ms
  local result curl_status=0
  start_ms=$(now_ms)
  # Make API call
  # -G with --data-urlencode sends the PDF URL as a percent-encoded query
  # parameter, so characters such as '&', '#' or spaces inside it cannot be
  # mistaken for query-string syntax.
  result=$(curl -s -G --data-urlencode "url=${url}" "${BASE_URL}/extract") \
    || curl_status=$?
  end_ms=$(now_ms)
  elapsed_ms=$(( end_ms - start_ms ))

  # Parse response
  local success extracted_pages file_size extraction_time message
  # A JSON `true` is printed by Python as "True", which is what the
  # validation below compares against.
  success=$(json_field "$result" "d.get('success', False)")
  extracted_pages=$(json_field "$result" "d.get('pages', 0)")
  file_size=$(json_field "$result" "d.get('file_size_kb', 0)")
  extraction_time=$(json_field "$result" "round(d.get('extraction_time_ms', 0), 2)")
  message=$(json_field "$result" "d.get('message', 'N/A')")

  echo ""
  echo "Results:"
  echo " Status: $success"
  echo " Pages extracted: $extracted_pages"
  echo " File size: ${file_size} KB"
  echo " Extraction time: ${extraction_time}ms"
  echo " Total round-trip: ${elapsed_ms}ms"
  echo " Message: $message"

  # Validate results
  if [[ "$success" == "True" && -n "$extracted_pages" ]]; then
    echo ""
    echo "✓ PASS"
    PASS=$((PASS + 1))
  else
    echo ""
    echo "✗ FAIL: $result"
    # Distinguish "daemon unreachable" from "daemon answered with an error".
    if (( curl_status != 0 )); then
      echo " (curl exited with status ${curl_status})"
    fi
    FAIL=$((FAIL + 1))
  fi
  echo ""
}

for TEST in "${TESTS[@]}"; do
  run_extraction_test "$TEST"
done
#######################################
# Send one request that the daemon is expected to reject and check that the
# response body contains a given pattern.
# Globals:   BASE_URL (read), PASS / FAIL (incremented)
# Arguments: $1 - test title
#            $2 - value placed after "url=" in the query string
#            $3 - grep pattern that must occur in the response
#            $4 - text shown in parentheses on PASS
#            $5 - text shown in parentheses on FAIL
# Outputs:   test title and verdict on stdout
#######################################
run_error_test() {
  local title=$1 target=$2 pattern=$3 pass_note=$4 fail_note=$5
  local response

  echo "Test: ${title}"
  response=$(curl -s "${BASE_URL}/extract?url=${target}")
  if grep -q -- "$pattern" <<< "$response"; then
    echo "✓ PASS (${pass_note})"
    # Count successes as well as failures so the summary totals cover
    # these checks too.
    PASS=$((PASS + 1))
  else
    echo "✗ FAIL (${fail_note})"
    FAIL=$((FAIL + 1))
  fi
  echo ""
}

# Test error handling
echo "=============================================="
echo "ERROR HANDLING TESTS"
echo "=============================================="
echo ""
# Invalid URL format
run_error_test "Invalid URL format (no http://)" \
  "not-a-url.pdf" \
  "must start with" \
  "Correctly rejected invalid URL" \
  "Should reject without http://"
# Non-existent URL
run_error_test "Non-existent PDF URL" \
  "https://example.com/nonexistent.pdf" \
  "404" \
  "Correctly returned 404" \
  "Should return 404"
#######################################
# Request an extraction with output_file=test_output.txt and check that
# /tmp/test_output.txt exists afterwards (the location this test expects
# the daemon to write to).
# Globals:   BASE_URL (read), PASS / FAIL (incremented)
# Outputs:   test title and verdict on stdout; a warning on stderr if a
#            pre-existing output file could not be removed
#######################################
run_output_file_test() {
  local out_path="/tmp/test_output.txt"
  local response

  echo "Test: Extract with custom output file"
  # Remove any leftover from an earlier run first; otherwise a stale file
  # would make the existence check pass even if nothing was written now.
  rm -f -- "$out_path" 2>/dev/null
  if [[ -e "$out_path" ]]; then
    echo "Warning: could not remove existing $out_path; result may be stale" >&2
  fi

  response=$(curl -s "$BASE_URL/extract?url=https://sample-files.com/downloads/documents/pdf/basic-text.pdf&output_file=test_output.txt")
  if [[ -f "$out_path" ]]; then
    echo "✓ PASS (Output file created)"
    # Display only: human-readable size taken from the fifth `ls -l` column.
    echo " File size: $(ls -lh "$out_path" | awk '{print $5}')"
    PASS=$((PASS + 1))
  else
    echo "✗ FAIL (Output file not found)"
    if [[ -n "$response" ]]; then
      echo " Response: $response"
    fi
    FAIL=$((FAIL + 1))
  fi
  echo ""
}

# Test with output file parameter
echo "=============================================="
echo "OUTPUT FILE TEST"
echo "=============================================="
echo ""
run_output_file_test
#######################################
# Print the pass/fail totals and an overall verdict.
# Globals:   PASS, FAIL (read), TOTAL (written)
# Outputs:   summary report on stdout
# Returns:   0 when no check failed, 1 otherwise
#######################################
print_summary() {
  echo "=============================================="
  echo "TEST SUMMARY"
  echo "=============================================="
  echo "Passed: $PASS"
  echo "Failed: $FAIL"
  TOTAL=$((PASS + FAIL))
  echo "Total: $TOTAL"
  echo ""
  if (( FAIL == 0 )); then
    echo "✓ ALL TESTS PASSED!"
    return 0
  fi
  echo "✗ Some tests failed. Review output above."
  return 1
}

# Summary
# Propagate failure through the exit status so CI jobs and calling scripts
# can detect a failed run instead of always seeing exit code 0.
if ! print_summary; then
  exit 1
fi