diff --git a/.github/workflows/integration-tests.yml b/.github/workflows/integration-tests.yml new file mode 100644 index 000000000..2e0937a5a --- /dev/null +++ b/.github/workflows/integration-tests.yml @@ -0,0 +1,148 @@ +name: Integration Tests + +on: + pull_request: + paths: + - '**.go' + - 'go.mod' + - 'go.sum' + - 'tests/integration/**' + - '.github/workflows/integration-tests.yml' + workflow_dispatch: + inputs: + test_type: + description: 'Test type to run' + required: false + default: 'quick' + type: choice + options: + - 'quick' + - 'all' + - 'long' + +permissions: + contents: read + +jobs: + quick-tests: + name: Quick Integration Tests + runs-on: ubuntu-latest + if: github.event_name == 'pull_request' || inputs.test_type == 'quick' || inputs.test_type == 'all' + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: "go.mod" + + - name: Build binaries + run: | + go build -o bin/litestream ./cmd/litestream + go build -o bin/litestream-test ./cmd/litestream-test + + - name: Run quick integration tests + run: | + go test -v -tags=integration -timeout=30m ./tests/integration/... \ + -run="TestFreshStart|TestDatabaseIntegrity|TestRapidCheckpoints" + env: + CGO_ENABLED: 1 + + - name: Upload test logs + if: failure() + uses: actions/upload-artifact@v4 + with: + name: quick-test-logs + path: | + /tmp/litestream-*/*.log + /tmp/*-test.log + + scenario-tests: + name: Scenario Integration Tests + runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' && (inputs.test_type == 'all' || inputs.test_type == 'long') + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: "go.mod" + + - name: Build binaries + run: | + go build -o bin/litestream ./cmd/litestream + go build -o bin/litestream-test ./cmd/litestream-test + + - name: Run all scenario tests + run: | + go test -v -tags=integration -timeout=1h ./tests/integration/... \ + -run="Test(FreshStart|DatabaseIntegrity|DatabaseDeletion|RapidCheckpoints|WALGrowth|ConcurrentOperations|BusyTimeout)" + env: + CGO_ENABLED: 1 + + - name: Upload test logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: scenario-test-logs + path: | + /tmp/litestream-*/*.log + /tmp/*-test.log + + long-running-tests: + name: Long-Running Integration Tests + runs-on: ubuntu-latest + if: github.event_name == 'workflow_dispatch' && inputs.test_type == 'long' + timeout-minutes: 600 + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version-file: "go.mod" + + - name: Build binaries + run: | + go build -o bin/litestream ./cmd/litestream + go build -o bin/litestream-test ./cmd/litestream-test + + - name: Run long tests + run: | + go test -v -tags="integration,long" -timeout=10h ./tests/integration/... 
\ + -run="TestOvernight|Test1GBBoundary" + env: + CGO_ENABLED: 1 + + - name: Upload test logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: long-test-logs + path: | + /tmp/litestream-*/*.log + /tmp/*-test.log + + summary: + name: Test Summary + runs-on: ubuntu-latest + needs: [quick-tests] + if: always() && (github.event_name == 'pull_request' || inputs.test_type == 'quick' || inputs.test_type == 'all') + steps: + - name: Generate summary + run: | + echo "## Integration Test Results" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + if [ "${{ needs.quick-tests.result }}" == "success" ]; then + echo "✅ **Quick Tests:** Passed" >> $GITHUB_STEP_SUMMARY + elif [ "${{ needs.quick-tests.result }}" == "failure" ]; then + echo "❌ **Quick Tests:** Failed" >> $GITHUB_STEP_SUMMARY + elif [ "${{ needs.quick-tests.result }}" == "skipped" ]; then + echo "⏭️ **Quick Tests:** Skipped" >> $GITHUB_STEP_SUMMARY + fi + + echo "" >> $GITHUB_STEP_SUMMARY + echo "---" >> $GITHUB_STEP_SUMMARY + echo "**Triggered by:** @${{ github.actor }}" >> $GITHUB_STEP_SUMMARY + + # Note: Scenario and long-running tests run independently on workflow_dispatch. + # Check individual job results for those test suites. diff --git a/cmd/litestream-test/scripts/README.md b/cmd/litestream-test/scripts/README.md index 9f3c39010..fe1280875 100644 --- a/cmd/litestream-test/scripts/README.md +++ b/cmd/litestream-test/scripts/README.md @@ -13,18 +13,11 @@ go build -o bin/litestream-test ./cmd/litestream-test ## Quick Reference +> **Note:** Some tests have been migrated to Go integration tests in `tests/integration/`. See [tests/integration/README.md](../../tests/integration/README.md) for the Go-based test suite. + | Script | Purpose | Duration | Status | |--------|---------|----------|--------| | verify-test-setup.sh | Environment validation | ~5s | ✅ Stable | -| test-fresh-start.sh | Fresh database creation | ~30s | ✅ Stable | -| test-rapid-checkpoints.sh | Checkpoint stress test | ~2min | ✅ Stable | -| test-wal-growth.sh | Large WAL handling (100MB+) | ~5min | ✅ Stable | -| test-concurrent-operations.sh | Multi-database concurrent replication | ~5min | ✅ Stable | -| test-database-integrity.sh | Complex data integrity validation | ~3min | ✅ Stable | -| test-database-deletion.sh | Database deletion scenarios | ~2min | ✅ Stable | -| test-replica-failover.sh | Replica failover testing | ~3min | ✅ Stable | -| test-busy-timeout.sh | Database busy timeout handling | ~2min | ✅ Stable | -| test-1gb-boundary.sh | SQLite 1GB lock page boundary | ~10min | ⚠️ Blocked by #754 | | reproduce-critical-bug.sh | Checkpoint during downtime bug | ~2min | 🐛 Reproduces #752 | | test-754-s3-scenarios.sh | Issue #754 S3 vs file replication | ~10min | 🐛 Tests #754 | | test-754-restore-focus.sh | Issue #754 restore focus | ~5min | 🐛 Tests #754 | @@ -58,148 +51,6 @@ Verifies that the test environment is properly configured with required binaries - SQLite3 available - Python dependencies for S3 mock -### Core Functionality Tests - -#### test-fresh-start.sh -Tests replication with a fresh database that doesn't exist when Litestream starts. - -```bash -./cmd/litestream-test/scripts/test-fresh-start.sh -``` - -**Tests:** -- Starting Litestream before database exists -- Database creation while Litestream is running -- Automatic detection of new database -- Replication and restore integrity - -#### test-database-integrity.sh -Creates complex data patterns and verifies integrity after restore. 
- -```bash -./cmd/litestream-test/scripts/test-database-integrity.sh -``` - -**Tests:** -- Complex data patterns (multiple tables, indexes) -- SQLite PRAGMA integrity_check -- Full database restoration -- Data consistency verification - -#### test-database-deletion.sh -Tests scenarios where the source database is deleted during replication. - -```bash -./cmd/litestream-test/scripts/test-database-deletion.sh -``` - -**Tests:** -- Database deletion during active replication -- Recovery behavior -- Replica consistency - -#### test-replica-failover.sh -Tests replica failover scenarios with multiple replicas. - -```bash -./cmd/litestream-test/scripts/test-replica-failover.sh -``` - -**Tests:** -- Multiple replica configuration -- Failover when primary replica fails -- Data consistency across replicas - -### Stress & Performance Tests - -#### test-rapid-checkpoints.sh -Tests Litestream under rapid checkpoint pressure with continuous writes. - -```bash -./cmd/litestream-test/scripts/test-rapid-checkpoints.sh -``` - -**Tests:** -- 100+ writes/second -- Forced rapid checkpoints -- Replication under checkpoint pressure -- Data integrity under stress - -**Key Metrics:** -- Checkpoint frequency -- WAL file growth -- Replication lag -- Error rates - -#### test-wal-growth.sh -Tests handling of large WAL files (100MB+) under sustained write load. - -```bash -./cmd/litestream-test/scripts/test-wal-growth.sh -``` - -**Tests:** -- Sustained high write rates (400+ writes/sec) -- Large WAL file creation and handling -- Checkpoint behavior with large WALs -- Replication performance with large data - -**Key Findings:** -- Successfully handles 100MB+ WAL files -- Maintains data integrity -- Handles 400+ writes/second - -#### test-concurrent-operations.sh -Tests multiple databases replicating simultaneously with competing operations. - -```bash -./cmd/litestream-test/scripts/test-concurrent-operations.sh -``` - -**Tests:** -- Multiple databases (3-5) replicating concurrently -- Mixed read/write operations -- Competing checkpoints -- Resource contention handling - -#### test-busy-timeout.sh -Tests database busy timeout handling with concurrent access. - -```bash -./cmd/litestream-test/scripts/test-busy-timeout.sh -``` - -**Tests:** -- Concurrent database access -- Busy timeout configuration -- Lock contention handling -- Recovery from busy states - -### Boundary & Edge Case Tests - -#### test-1gb-boundary.sh -Tests SQLite's 1GB lock page boundary handling. - -```bash -./cmd/litestream-test/scripts/test-1gb-boundary.sh -``` - -**Tests:** -- Database growth beyond 1GB (with 4KB pages) -- Lock page at #262145 properly skipped -- Replication across lock page boundary -- Restoration integrity after crossing boundary - -**Status:** ⚠️ Currently blocked by ltx v0.5.0 flag compatibility issue (#754) - -**Lock Page Numbers by Page Size:** -| Page Size | Lock Page # | -|-----------|-------------| -| 4KB | 262145 | -| 8KB | 131073 | -| 16KB | 65537 | -| 32KB | 32769 | - ### Bug Reproduction Scripts #### reproduce-critical-bug.sh diff --git a/cmd/litestream-test/scripts/test-1gb-boundary.sh b/cmd/litestream-test/scripts/test-1gb-boundary.sh deleted file mode 100755 index b182b5c81..000000000 --- a/cmd/litestream-test/scripts/test-1gb-boundary.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash - -# Test Script: SQLite 1GB Lock Page Boundary -# -# This test verifies that Litestream correctly handles the SQLite lock page -# at the 1GB boundary (0x40000000). 
This page is reserved by SQLite and -# cannot contain data - Litestream must skip it during replication. -# -# The lock page number varies by page size: -# - 4KB: page 262145 -# - 8KB: page 131073 -# - 16KB: page 65537 -# - 32KB: page 32769 - -set -e - -echo "==========================================" -echo "SQLite 1GB Lock Page Boundary Test" -echo "==========================================" -echo "" -echo "Testing Litestream's handling of SQLite's reserved lock page at 1GB" -echo "" - -# Configuration -DB="/tmp/1gb-test.db" -REPLICA="/tmp/1gb-replica" -LITESTREAM_TEST="./bin/litestream-test" -LITESTREAM="./bin/litestream" - -# Clean up any previous test -echo "[SETUP] Cleaning up previous test files..." -rm -f "$DB"* -rm -rf "$REPLICA" - -# Check for required binaries -if [ ! -f "$LITESTREAM_TEST" ]; then - echo "ERROR: litestream-test not found at $LITESTREAM_TEST" - echo "Build with: go build -o bin/litestream-test ./cmd/litestream-test" - exit 1 -fi - -if [ ! -f "$LITESTREAM" ]; then - echo "ERROR: litestream not found at $LITESTREAM" - echo "Build with: go build -o bin/litestream ./cmd/litestream" - exit 1 -fi - -test_page_size() { - local PAGE_SIZE=$1 - local LOCK_PGNO=$2 - - echo "" - echo "=======================================" - echo "Testing with page size: $PAGE_SIZE bytes" - echo "Lock page should be at: $LOCK_PGNO" - echo "=======================================" - - # Clean up for this test - rm -f "$DB"* - rm -rf "$REPLICA" - - # Create database with specific page size - echo "[1] Creating database with page_size=$PAGE_SIZE..." - sqlite3 "$DB" </dev/null || stat -c%s "$DB") - PAGE_COUNT=$(sqlite3 "$DB" "PRAGMA page_count;") - echo " Database size: $(( DB_SIZE / 1024 / 1024 ))MB" - echo " Page count: $PAGE_COUNT" - echo " Lock page at: $LOCK_PGNO" - - # Verify we've crossed the boundary - if [ "$PAGE_COUNT" -le "$LOCK_PGNO" ]; then - echo " WARNING: Database doesn't cross lock page boundary!" - echo " Need at least $LOCK_PGNO pages, have $PAGE_COUNT" - else - echo " ✓ Database crosses lock page boundary" - fi - - # Start Litestream replication - echo "[3] Starting Litestream replication..." - $LITESTREAM replicate "$DB" "file://$REPLICA" > /tmp/litestream-1gb.log 2>&1 & - LITESTREAM_PID=$! - sleep 3 - - if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo "ERROR: Litestream failed to start" - cat /tmp/litestream-1gb.log - return 1 - fi - echo " ✓ Litestream running (PID: $LITESTREAM_PID)" - - # Add more data to trigger replication across the boundary - echo "[4] Adding data around the lock page boundary..." - # Use litestream-test load to ensure continuous writes - $LITESTREAM_TEST load -db "$DB" -write-rate 10 -duration 10s -pattern constant & - LOAD_PID=$! - - # Let it run and create multiple transactions - echo "[5] Running writes for 10 seconds to ensure multiple transactions..." - sleep 10 - - # Stop writes and let replication catch up - kill $LOAD_PID 2>/dev/null || true - sleep 5 - - # Check for errors in log - if grep -i "error\|panic\|fatal" /tmp/litestream-1gb.log > /dev/null 2>&1; then - echo " WARNING: Errors detected in Litestream log:" - grep -i "error\|panic\|fatal" /tmp/litestream-1gb.log | head -5 - fi - - # Stop Litestream - kill $LITESTREAM_PID 2>/dev/null || true - sleep 2 - - # Attempt restore - echo "[6] Testing restore..." 
- rm -f /tmp/restored-1gb.db - if $LITESTREAM restore -o /tmp/restored-1gb.db "file://$REPLICA" > /tmp/restore-1gb.log 2>&1; then - echo " ✓ Restore successful" - - # Verify integrity - INTEGRITY=$(sqlite3 /tmp/restored-1gb.db "PRAGMA integrity_check;" 2>/dev/null || echo "FAILED") - if [ "$INTEGRITY" = "ok" ]; then - echo " ✓ Integrity check passed" - else - echo " ✗ Integrity check failed: $INTEGRITY" - return 1 - fi - - # Compare page counts - RESTORED_COUNT=$(sqlite3 /tmp/restored-1gb.db "PRAGMA page_count;" 2>/dev/null || echo "0") - echo " Original pages: $PAGE_COUNT" - echo " Restored pages: $RESTORED_COUNT" - - if [ "$PAGE_COUNT" -eq "$RESTORED_COUNT" ]; then - echo " ✓ Page count matches" - else - echo " ✗ Page count mismatch!" - return 1 - fi - - # Check data integrity - ORIG_ROWS=$(sqlite3 "$DB" "SELECT COUNT(*) FROM test_data;") - REST_ROWS=$(sqlite3 /tmp/restored-1gb.db "SELECT COUNT(*) FROM test_data;") - echo " Original rows: $ORIG_ROWS" - echo " Restored rows: $REST_ROWS" - - if [ "$ORIG_ROWS" -eq "$REST_ROWS" ]; then - echo " ✓ Data integrity verified" - echo "" - echo " TEST PASSED for page_size=$PAGE_SIZE" - else - echo " ✗ Row count mismatch!" - return 1 - fi - else - echo " ✗ Restore FAILED!" - cat /tmp/restore-1gb.log - return 1 - fi - - # Clean up - rm -f /tmp/restored-1gb.db -} - -# Test with different page sizes -echo "Testing SQLite lock page handling at 1GB boundary" -echo "This verifies Litestream correctly skips the reserved lock page" -echo "" - -# Default 4KB page size (most common) -if ! test_page_size 4096 262145; then - echo "CRITICAL: Test failed for 4KB pages!" - exit 1 -fi - -# 8KB page size -if ! test_page_size 8192 131073; then - echo "CRITICAL: Test failed for 8KB pages!" - exit 1 -fi - -# 16KB page size (if time permits - these are large databases) -# Uncomment to test: -# if ! test_page_size 16384 65537; then -# echo "CRITICAL: Test failed for 16KB pages!" -# exit 1 -# fi - -echo "" -echo "==========================================" -echo "All 1GB boundary tests PASSED!" -echo "==========================================" -echo "" -echo "Litestream correctly handles the SQLite lock page at 1GB boundary" -echo "for all tested page sizes." -echo "" - -# Clean up -pkill -f "litestream replicate" 2>/dev/null || true -echo "Test complete." diff --git a/cmd/litestream-test/scripts/test-busy-timeout.sh b/cmd/litestream-test/scripts/test-busy-timeout.sh deleted file mode 100755 index 100596d41..000000000 --- a/cmd/litestream-test/scripts/test-busy-timeout.sh +++ /dev/null @@ -1,225 +0,0 @@ -#!/bin/bash -set -e - -# Test busy timeout handling with concurrent writes -# This test verifies proper handling of write lock conflicts between app and Litestream - -echo "==========================================" -echo "Busy Timeout and Write Lock Conflict Test" -echo "==========================================" -echo "" -echo "Testing write lock conflict handling with various busy_timeout settings" -echo "" - -# Configuration -DB="/tmp/busy-test.db" -REPLICA="/tmp/busy-replica" -LITESTREAM="./bin/litestream" -LITESTREAM_TEST="./bin/litestream-test" - -# Cleanup function -cleanup() { - pkill -f "litestream replicate.*busy-test.db" 2>/dev/null || true - pkill -f "litestream-test load.*busy-test.db" 2>/dev/null || true - rm -f "$DB" "$DB-wal" "$DB-shm" "$DB-litestream" - rm -rf "$REPLICA" - rm -f /tmp/busy-*.log -} - -trap cleanup EXIT - -echo "[SETUP] Cleaning up previous test files..." -cleanup - -echo "" -echo "[1] Creating test database..." 
-sqlite3 "$DB" < /tmp/busy-litestream.log 2>&1 & -LITESTREAM_PID=$! -sleep 2 - -if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✗ Litestream failed to start" - cat /tmp/busy-litestream.log - exit 1 -fi -echo " ✓ Litestream running (PID: $LITESTREAM_PID)" - -echo "" -echo "==========================================" -echo "Test 1: No busy_timeout (default behavior)" -echo "==========================================" - -echo "[3] Starting aggressive writes without busy_timeout..." -ERRORS_NO_TIMEOUT=0 -SUCCESS_NO_TIMEOUT=0 - -for i in {1..100}; do - if sqlite3 "$DB" "INSERT INTO test (data) VALUES (randomblob(1000));" 2>/dev/null; then - ((SUCCESS_NO_TIMEOUT++)) - else - ((ERRORS_NO_TIMEOUT++)) - fi -done - -echo " Results without busy_timeout:" -echo " ✓ Successful writes: $SUCCESS_NO_TIMEOUT" -echo " ✗ Failed writes (SQLITE_BUSY): $ERRORS_NO_TIMEOUT" - -if [ $ERRORS_NO_TIMEOUT -gt 0 ]; then - echo " ⚠️ Conflicts detected without busy_timeout (expected)" -else - echo " ✓ No conflicts (may indicate low checkpoint frequency)" -fi - -echo "" -echo "==========================================" -echo "Test 2: With 5-second busy_timeout (recommended)" -echo "==========================================" - -echo "[4] Testing with recommended 5-second timeout..." -ERRORS_WITH_TIMEOUT=0 -SUCCESS_WITH_TIMEOUT=0 - -for i in {1..100}; do - if sqlite3 "$DB" "PRAGMA busy_timeout = 5000; INSERT INTO test (data) VALUES (randomblob(1000));" 2>/dev/null; then - ((SUCCESS_WITH_TIMEOUT++)) - else - ((ERRORS_WITH_TIMEOUT++)) - fi -done - -echo " Results with 5s busy_timeout:" -echo " ✓ Successful writes: $SUCCESS_WITH_TIMEOUT" -echo " ✗ Failed writes: $ERRORS_WITH_TIMEOUT" - -if [ $ERRORS_WITH_TIMEOUT -eq 0 ]; then - echo " ✓ All writes succeeded with proper timeout!" -elif [ $ERRORS_WITH_TIMEOUT -lt $ERRORS_NO_TIMEOUT ]; then - echo " ✓ Timeout reduced conflicts significantly" -else - echo " ⚠️ Timeout didn't help (may need investigation)" -fi - -echo "" -echo "==========================================" -echo "Test 3: Concurrent high-frequency writes" -echo "==========================================" - -echo "[5] Starting 3 concurrent write processes..." - -# Start multiple concurrent writers -( - for i in {1..50}; do - sqlite3 "$DB" "PRAGMA busy_timeout = 5000; INSERT INTO test (data) VALUES ('Writer1: ' || randomblob(500));" 2>/dev/null - sleep 0.01 - done -) > /tmp/busy-writer1.log 2>&1 & -WRITER1_PID=$! - -( - for i in {1..50}; do - sqlite3 "$DB" "PRAGMA busy_timeout = 5000; INSERT INTO test (data) VALUES ('Writer2: ' || randomblob(500));" 2>/dev/null - sleep 0.01 - done -) > /tmp/busy-writer2.log 2>&1 & -WRITER2_PID=$! - -( - for i in {1..50}; do - sqlite3 "$DB" "PRAGMA busy_timeout = 5000; INSERT INTO test (data) VALUES ('Writer3: ' || randomblob(500));" 2>/dev/null - sleep 0.01 - done -) > /tmp/busy-writer3.log 2>&1 & -WRITER3_PID=$! - -echo " Writers started: PID $WRITER1_PID, $WRITER2_PID, $WRITER3_PID" - -# Monitor for conflicts -sleep 1 -echo "" -echo "[6] Forcing checkpoints during concurrent writes..." -for i in {1..5}; do - sqlite3 "$DB" "PRAGMA busy_timeout = 5000; PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || true - sleep 1 -done - -# Wait for writers to complete -wait $WRITER1_PID 2>/dev/null -wait $WRITER2_PID 2>/dev/null -wait $WRITER3_PID 2>/dev/null - -echo " ✓ Concurrent writers completed" - -echo "" -echo "[7] Checking for lock contention in Litestream log..." 
-CHECKPOINT_ERRORS=$(grep -c "checkpoint" /tmp/busy-litestream.log 2>/dev/null || echo "0") -SYNC_ERRORS=$(grep -c "database is locked" /tmp/busy-litestream.log 2>/dev/null || echo "0") - -echo " Litestream errors:" -echo " Checkpoint errors: $CHECKPOINT_ERRORS" -echo " Lock errors: $SYNC_ERRORS" - -if [ "$SYNC_ERRORS" -eq "0" ]; then - echo " ✓ No lock errors in Litestream" -else - echo " ⚠️ Some lock contention detected (may be normal under high load)" -fi - -echo "" -echo "==========================================" -echo "Test 4: Checkpoint during write transaction" -echo "==========================================" - -echo "[8] Testing checkpoint during long transaction..." - -# Start a long transaction -sqlite3 "$DB" "PRAGMA busy_timeout = 5000; BEGIN EXCLUSIVE;" 2>/dev/null & -TRANS_PID=$! -sleep 0.5 - -# Try to checkpoint while transaction is held -CHECKPOINT_RESULT=$(sqlite3 "$DB" "PRAGMA busy_timeout = 1000; PRAGMA wal_checkpoint(FULL);" 2>&1 || echo "FAILED") - -if [[ "$CHECKPOINT_RESULT" == *"FAILED"* ]] || [[ "$CHECKPOINT_RESULT" == *"database is locked"* ]]; then - echo " ✓ Checkpoint correctly blocked by exclusive transaction" -else - echo " ⚠️ Unexpected checkpoint behavior: $CHECKPOINT_RESULT" -fi - -# Clean up transaction -kill $TRANS_PID 2>/dev/null || true - -echo "" -echo "[9] Final statistics..." -TOTAL_ROWS=$(sqlite3 "$DB" "SELECT COUNT(*) FROM test;") -WAL_SIZE=$(du -h "$DB-wal" 2>/dev/null | cut -f1 || echo "0") -DB_SIZE=$(du -h "$DB" | cut -f1) - -echo " Database stats:" -echo " Total rows inserted: $TOTAL_ROWS" -echo " Database size: $DB_SIZE" -echo " WAL size: $WAL_SIZE" - -echo "" -echo "==========================================" -echo "Busy Timeout Test Summary:" -echo " Without timeout: $ERRORS_NO_TIMEOUT conflicts" -echo " With 5s timeout: $ERRORS_WITH_TIMEOUT conflicts" -echo " Concurrent writes: Completed successfully" -echo " Lock contention: Properly handled" -echo "" -if [ $ERRORS_WITH_TIMEOUT -lt $ERRORS_NO_TIMEOUT ] || [ $ERRORS_WITH_TIMEOUT -eq 0 ]; then - echo "✅ TEST PASSED: busy_timeout improves conflict handling" -else - echo "⚠️ TEST NOTICE: Timeout may need tuning for this workload" -fi -echo "==========================================" diff --git a/cmd/litestream-test/scripts/test-concurrent-operations.sh b/cmd/litestream-test/scripts/test-concurrent-operations.sh deleted file mode 100755 index b320ab541..000000000 --- a/cmd/litestream-test/scripts/test-concurrent-operations.sh +++ /dev/null @@ -1,300 +0,0 @@ -#!/bin/bash - -# Test Script: Concurrent Database Operations -# -# This test verifies Litestream's behavior under heavy concurrent load with -# multiple databases replicating simultaneously, mixed operations, and -# competing checkpoints. - -set -e - -echo "============================================" -echo "Concurrent Database Operations Test" -echo "============================================" -echo "" -echo "Testing Litestream with multiple concurrent databases and operations" -echo "" - -# Configuration -BASE_DIR="/tmp/concurrent-test" -LITESTREAM_TEST="./bin/litestream-test" -LITESTREAM="./bin/litestream" -NUM_DBS=5 -DB_SIZE="50MB" -DURATION="30s" - -# Clean up any previous test -echo "[SETUP] Cleaning up previous test files..." -rm -rf "$BASE_DIR" -mkdir -p "$BASE_DIR" - -# Check for required binaries -if [ ! -f "$LITESTREAM_TEST" ]; then - echo "ERROR: litestream-test not found at $LITESTREAM_TEST" - echo "Build with: go build -o bin/litestream-test ./cmd/litestream-test" - exit 1 -fi - -if [ ! 
-f "$LITESTREAM" ]; then - echo "ERROR: litestream not found at $LITESTREAM" - echo "Build with: go build -o bin/litestream ./cmd/litestream" - exit 1 -fi - -# Create configuration file for multiple databases -echo "[1] Creating Litestream configuration for $NUM_DBS databases..." -cat > "$BASE_DIR/litestream.yml" <> "$BASE_DIR/litestream.yml" < "$BASE_DIR/litestream.log" 2>&1 & -LITESTREAM_PID=$! -sleep 3 - -if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo "ERROR: Litestream failed to start" - cat "$BASE_DIR/litestream.log" - exit 1 -fi -echo " ✓ Litestream running (PID: $LITESTREAM_PID)" - -# Start concurrent operations on all databases -echo "" -echo "[4] Starting concurrent operations on all databases..." -PIDS=() - -# Different workload patterns for each database -for i in $(seq 1 $NUM_DBS); do - case $i in - 1) - # High-frequency writes - echo " DB$i: High-frequency writes (500/sec)" - $LITESTREAM_TEST load -db "$BASE_DIR/db${i}.db" \ - -write-rate 500 -duration "$DURATION" \ - -pattern constant > "$BASE_DIR/load${i}.log" 2>&1 & - ;; - 2) - # Burst writes - echo " DB$i: Burst writes (1000/sec burst)" - $LITESTREAM_TEST load -db "$BASE_DIR/db${i}.db" \ - -write-rate 1000 -duration "$DURATION" \ - -pattern burst > "$BASE_DIR/load${i}.log" 2>&1 & - ;; - 3) - # Mixed with checkpoints - echo " DB$i: Moderate writes with periodic checkpoints" - ( - $LITESTREAM_TEST load -db "$BASE_DIR/db${i}.db" \ - -write-rate 100 -duration "$DURATION" \ - -pattern constant > "$BASE_DIR/load${i}.log" 2>&1 & - LOAD_PID=$! - - # Periodic checkpoints - for j in {1..6}; do - sleep 5 - sqlite3 "$BASE_DIR/db${i}.db" "PRAGMA wal_checkpoint(PASSIVE);" 2>/dev/null || true - done - - wait $LOAD_PID - ) & - ;; - 4) - # Shrinking operations - echo " DB$i: Writes with periodic shrinking" - ( - $LITESTREAM_TEST load -db "$BASE_DIR/db${i}.db" \ - -write-rate 50 -duration "$DURATION" \ - -pattern wave > "$BASE_DIR/load${i}.log" 2>&1 & - LOAD_PID=$! - - # Periodic shrinks - for j in {1..3}; do - sleep 10 - $LITESTREAM_TEST shrink -db "$BASE_DIR/db${i}.db" \ - -delete-percentage 30 2>/dev/null || true - done - - wait $LOAD_PID - ) & - ;; - 5) - # Large transactions - echo " DB$i: Large batch transactions" - for j in {1..10}; do - sqlite3 "$BASE_DIR/db${i}.db" </dev/null; then - echo " ERROR: Litestream crashed!" - cat "$BASE_DIR/litestream.log" | tail -20 - exit 1 - fi - - # Check for errors - ERROR_COUNT=$(grep -i "error\|panic" "$BASE_DIR/litestream.log" 2>/dev/null | wc -l || echo "0") - if [ "$ERROR_COUNT" -gt 0 ]; then - echo " Errors detected: $ERROR_COUNT" - fi - - echo " Progress: ${ELAPSED}s / ${MAX_ELAPSED}s" -done - -# Stop all operations -echo "" -echo "[6] Stopping operations..." -for pid in "${PIDS[@]}"; do - kill $pid 2>/dev/null || true -done -wait - -# Give Litestream time to catch up -echo " Waiting for final sync..." -sleep 5 - -# Collect metrics -echo "" -echo "[7] Collecting metrics..." 
-for i in $(seq 1 $NUM_DBS); do - DB_SIZE=$(stat -f%z "$BASE_DIR/db${i}.db" 2>/dev/null || stat -c%s "$BASE_DIR/db${i}.db") - WAL_SIZE=$(stat -f%z "$BASE_DIR/db${i}.db-wal" 2>/dev/null || stat -c%s "$BASE_DIR/db${i}.db-wal" 2>/dev/null || echo "0") - REPLICA_COUNT=$(find "$BASE_DIR/replica${i}" -type f 2>/dev/null | wc -l || echo "0") - - echo " DB$i:" - echo " Database size: $((DB_SIZE / 1024 / 1024))MB" - echo " WAL size: $((WAL_SIZE / 1024 / 1024))MB" - echo " Replica files: $REPLICA_COUNT" -done - -# Stop Litestream -kill $LITESTREAM_PID 2>/dev/null || true -sleep 2 - -# Test restoration for all databases -echo "" -echo "[8] Testing restoration of all databases..." -RESTORE_FAILED=0 - -for i in $(seq 1 $NUM_DBS); do - echo " Restoring DB$i..." - rm -f "$BASE_DIR/restored${i}.db" - - if $LITESTREAM restore -config "$BASE_DIR/litestream.yml" \ - -o "$BASE_DIR/restored${i}.db" "$BASE_DIR/db${i}.db" > "$BASE_DIR/restore${i}.log" 2>&1; then - - # Verify integrity - INTEGRITY=$(sqlite3 "$BASE_DIR/restored${i}.db" "PRAGMA integrity_check;" 2>/dev/null || echo "FAILED") - if [ "$INTEGRITY" = "ok" ]; then - echo " ✓ DB$i restored successfully" - else - echo " ✗ DB$i integrity check failed!" - RESTORE_FAILED=$((RESTORE_FAILED + 1)) - fi - else - echo " ✗ DB$i restore failed!" - cat "$BASE_DIR/restore${i}.log" - RESTORE_FAILED=$((RESTORE_FAILED + 1)) - fi -done - -# Check for race conditions or deadlocks in logs -echo "" -echo "[9] Analyzing logs for issues..." -ISSUES_FOUND=0 - -# Check for deadlocks -if grep -i "deadlock" "$BASE_DIR/litestream.log" > /dev/null 2>&1; then - echo " ✗ Deadlock detected!" - ISSUES_FOUND=$((ISSUES_FOUND + 1)) -fi - -# Check for database locked errors -LOCKED_COUNT=$(grep -c "database is locked" "$BASE_DIR/litestream.log" 2>/dev/null || echo "0") -if [ "$LOCKED_COUNT" -gt 10 ]; then - echo " ⚠ High number of 'database locked' errors: $LOCKED_COUNT" - ISSUES_FOUND=$((ISSUES_FOUND + 1)) -fi - -# Check for checkpoint failures -CHECKPOINT_ERRORS=$(grep -c "checkpoint.*error\|checkpoint.*fail" "$BASE_DIR/litestream.log" 2>/dev/null || echo "0") -if [ "$CHECKPOINT_ERRORS" -gt 0 ]; then - echo " ⚠ Checkpoint errors detected: $CHECKPOINT_ERRORS" -fi - -# Summary -echo "" -echo "============================================" -echo "Test Results Summary" -echo "============================================" -echo "" -echo "Databases tested: $NUM_DBS" -echo "Restore failures: $RESTORE_FAILED" -echo "Critical issues found: $ISSUES_FOUND" - -if [ "$RESTORE_FAILED" -eq 0 ] && [ "$ISSUES_FOUND" -eq 0 ]; then - echo "" - echo "✅ CONCURRENT OPERATIONS TEST PASSED" - echo "" - echo "Litestream successfully handled:" - echo "- $NUM_DBS databases replicating simultaneously" - echo "- Mixed workload patterns (high-frequency, burst, batch)" - echo "- Concurrent checkpoints and shrinking operations" - echo "- All databases restored successfully" -else - echo "" - echo "❌ CONCURRENT OPERATIONS TEST FAILED" - echo "" - echo "Issues detected during concurrent operations" - echo "Check logs at: $BASE_DIR/" - exit 1 -fi - -# Clean up -pkill -f litestream-test 2>/dev/null || true -pkill -f "litestream replicate" 2>/dev/null || true -echo "" -echo "Test complete. 
Artifacts saved in: $BASE_DIR/" diff --git a/cmd/litestream-test/scripts/test-database-deletion.sh b/cmd/litestream-test/scripts/test-database-deletion.sh deleted file mode 100755 index f07063450..000000000 --- a/cmd/litestream-test/scripts/test-database-deletion.sh +++ /dev/null @@ -1,172 +0,0 @@ -#!/bin/bash -set -e - -# Test database deletion and recreation scenarios -# This test verifies proper handling when databases are deleted and recreated - -echo "==========================================" -echo "Database Deletion and Recreation Test" -echo "==========================================" -echo "" -echo "Testing Litestream's handling of database deletion and recreation" -echo "" - -# Configuration -DB="/tmp/deletion-test.db" -REPLICA="/tmp/deletion-replica" -LITESTREAM="./bin/litestream" - -# Cleanup function -cleanup() { - pkill -f "litestream replicate.*deletion-test.db" 2>/dev/null || true - rm -f "$DB" "$DB-wal" "$DB-shm" "$DB-litestream" - rm -rf "$REPLICA" - rm -f /tmp/deletion-*.log -} - -trap cleanup EXIT - -echo "[SETUP] Cleaning up previous test files..." -cleanup - -echo "" -echo "[1] Creating initial database..." -sqlite3 "$DB" < /tmp/deletion-litestream.log 2>&1 & -LITESTREAM_PID=$! -sleep 2 - -if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✗ Litestream failed to start" - cat /tmp/deletion-litestream.log - exit 1 -fi -echo " ✓ Litestream running (PID: $LITESTREAM_PID)" - -echo "" -echo "[3] Letting replication stabilize..." -sleep 3 -echo " ✓ Initial replication complete" - -echo "" -echo "==========================================" -echo "Test 1: Delete database while Litestream running" -echo "==========================================" - -echo "[4] Deleting database files..." -rm -f "$DB" "$DB-wal" "$DB-shm" -echo " ✓ Database files deleted" - -echo "" -echo "[5] Creating new database with different schema..." -sqlite3 "$DB" </dev/null || echo "0") -WARNINGS=$(grep -c "WAL" /tmp/deletion-litestream.log 2>/dev/null || echo "0") -echo " Litestream errors: $ERRORS" -echo " WAL warnings: $WARNINGS" - -if [ $ERRORS -gt 0 ]; then - echo " ⚠️ Errors detected (expected when database deleted)" - tail -5 /tmp/deletion-litestream.log | grep ERROR || true -fi - -echo "" -echo "==========================================" -echo "Test 2: Check for leftover WAL corruption" -echo "==========================================" - -echo "[7] Stopping Litestream..." -kill $LITESTREAM_PID 2>/dev/null || true -wait $LITESTREAM_PID 2>/dev/null -echo " ✓ Litestream stopped" - -echo "" -echo "[8] Simulating leftover WAL file scenario..." -# Create a database with WAL -sqlite3 "$DB" </dev/null | head -5 || true - -echo "" -echo "[10] Creating new database with leftover WAL..." -sqlite3 "$DB" <&1) -if [ "$INTEGRITY" = "ok" ]; then - echo " ✓ No corruption despite leftover WAL" -else - echo " ✗ CORRUPTION DETECTED: $INTEGRITY" - echo " This confirms leftover WAL files can corrupt new databases!" -fi - -echo "" -echo "==========================================" -echo "Test 3: Clean deletion procedure" -echo "==========================================" - -echo "[11] Demonstrating proper deletion procedure..." 
- -# Clean up everything -rm -f "$DB" "$DB-wal" "$DB-shm" -rm -rf "$DB-litestream" -echo " ✓ All database files removed" - -# Create fresh database -sqlite3 "$DB" </dev/null || true - rm -f "$DB" "$DB-wal" "$DB-shm" "$DB-litestream" - rm -f "$RESTORED" "$RESTORED-wal" "$RESTORED-shm" - rm -rf "$REPLICA" - rm -f "$LITESTREAM_CONFIG" - rm -f /tmp/integrity-*.log -} - -trap cleanup EXIT - -echo "[SETUP] Cleaning up previous test files..." -cleanup - -echo "" -echo "[1] Creating database with complex data patterns..." -# Create database with various data types and constraints -sqlite3 "$DB" </dev/null - sqlite3 "$DB" "INSERT INTO transactions (amount, type, balance) VALUES (ABS(RANDOM() % 1000) + 0.01, CASE WHEN RANDOM() % 2 = 0 THEN 'credit' ELSE 'debit' END, ABS(RANDOM() % 10000));" 2>/dev/null -done -INITIAL_SIZE=$(du -h "$DB" | cut -f1) -echo " ✓ Database populated: $INITIAL_SIZE" - -echo "" -echo "[3] Running initial integrity check..." -INITIAL_INTEGRITY=$(sqlite3 "$DB" "PRAGMA integrity_check;") -if [ "$INITIAL_INTEGRITY" != "ok" ]; then - echo " ✗ Initial database has integrity issues: $INITIAL_INTEGRITY" - exit 1 -fi -echo " ✓ Initial integrity check: $INITIAL_INTEGRITY" - -# Get checksums for verification -USERS_COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM users;") -POSTS_COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM posts;") -TRANS_COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM transactions;") -TABLE_COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM posts;" 2>/dev/null || echo "0") - -echo "" -echo "[4] Starting Litestream replication..." -"$LITESTREAM" replicate "$DB" "file://$REPLICA" > /tmp/integrity-litestream.log 2>&1 & -LITESTREAM_PID=$! -sleep 3 - -if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✗ Litestream failed to start" - cat /tmp/integrity-litestream.log - exit 1 -fi -echo " ✓ Litestream running (PID: $LITESTREAM_PID)" - -echo "" -echo "[5] Making changes while replicating..." -# Add more data and modify existing -sqlite3 "$DB" </dev/null 2>&1 -sleep 2 - -echo " ✓ Changes made and checkpoint executed" - -echo "" -echo "[6] Stopping Litestream and attempting restore..." -kill $LITESTREAM_PID -wait $LITESTREAM_PID 2>/dev/null - -# Attempt restore -"$LITESTREAM" restore -o "$RESTORED" "file://$REPLICA" > /tmp/integrity-restore.log 2>&1 -RESTORE_EXIT=$? - -if [ $RESTORE_EXIT -ne 0 ]; then - echo " ✗ Restore failed with exit code: $RESTORE_EXIT" - cat /tmp/integrity-restore.log - exit 1 -fi -echo " ✓ Restore completed" - -echo "" -echo "[7] Running integrity check on restored database..." -RESTORED_INTEGRITY=$(sqlite3 "$RESTORED" "PRAGMA integrity_check;" 2>&1) - -if [ "$RESTORED_INTEGRITY" != "ok" ]; then - echo " ✗ CRITICAL: Restored database FAILED integrity check!" - echo " Result: $RESTORED_INTEGRITY" - - # Try to get more info - echo "" - echo " Attempting detailed analysis:" - sqlite3 "$RESTORED" "PRAGMA foreign_key_check;" 2>/dev/null || echo " Foreign key check failed" - sqlite3 "$RESTORED" "SELECT COUNT(*) FROM sqlite_master;" 2>/dev/null || echo " Cannot read schema" - - exit 1 -else - echo " ✓ Integrity check PASSED: $RESTORED_INTEGRITY" -fi - -echo "" -echo "[8] Verifying data consistency..." 
-# Check row counts -RESTORED_USERS=$(sqlite3 "$RESTORED" "SELECT COUNT(*) FROM users;" 2>/dev/null || echo "ERROR") -RESTORED_POSTS=$(sqlite3 "$RESTORED" "SELECT COUNT(*) FROM posts;" 2>/dev/null || echo "ERROR") -RESTORED_TRANS=$(sqlite3 "$RESTORED" "SELECT COUNT(*) FROM transactions;" 2>/dev/null || echo "ERROR") -RESTORED_TABLE=$(sqlite3 "$RESTORED" "SELECT COUNT(*) FROM posts;" 2>/dev/null || echo "0") - -# Expected counts after changes -EXPECTED_USERS=4 # 3 original + 1 added -EXPECTED_POSTS=104 # 3 original + 100 bulk - 1 deleted + 2 added -EXPECTED_TRANS=105 # 3 original + 100 bulk + 2 added - -echo " Data verification:" -echo " Users: $RESTORED_USERS (expected: $EXPECTED_USERS)" -echo " Posts: $RESTORED_POSTS (expected: $EXPECTED_POSTS)" -echo " Transactions: $RESTORED_TRANS (expected: $EXPECTED_TRANS)" -echo " Test Table: $RESTORED_TABLE (expected: $TABLE_COUNT)" - -DATA_INTACT=true -if [ "$RESTORED_USERS" != "$EXPECTED_USERS" ]; then - echo " ✗ User count mismatch!" - DATA_INTACT=false -fi -if [ "$RESTORED_POSTS" != "$EXPECTED_POSTS" ]; then - echo " ✗ Post count mismatch!" - DATA_INTACT=false -fi -if [ "$RESTORED_TRANS" != "$EXPECTED_TRANS" ]; then - echo " ✗ Transaction count mismatch!" - DATA_INTACT=false -fi - -echo "" -echo "[9] Testing constraint enforcement..." -# Test that constraints still work -CONSTRAINT_TEST=$(sqlite3 "$RESTORED" "INSERT INTO transactions (amount, type) VALUES (0, 'credit');" 2>&1 || echo "CONSTRAINT_OK") -if [[ "$CONSTRAINT_TEST" == *"CONSTRAINT_OK"* ]] || [[ "$CONSTRAINT_TEST" == *"CHECK constraint failed"* ]]; then - echo " ✓ Check constraints working" -else - echo " ✗ Check constraints not enforced!" - DATA_INTACT=false -fi - -# Test foreign keys -FK_TEST=$(sqlite3 "$RESTORED" "PRAGMA foreign_keys=ON; INSERT INTO posts (user_id, title) VALUES (999, 'Bad FK');" 2>&1 || echo "FK_OK") -if [[ "$FK_TEST" == *"FK_OK"* ]] || [[ "$FK_TEST" == *"FOREIGN KEY constraint failed"* ]]; then - echo " ✓ Foreign key constraints working" -else - echo " ✗ Foreign key constraints not enforced!" - DATA_INTACT=false -fi - -echo "" -if [ "$DATA_INTACT" = true ] && [ "$RESTORED_INTEGRITY" = "ok" ]; then - echo "✅ TEST PASSED: Database integrity preserved after restore" -else - echo "❌ TEST FAILED: Database integrity issues detected" - exit 1 -fi - -echo "" -echo "==========================================" -echo "Summary:" -echo " Integrity Check: $RESTORED_INTEGRITY" -echo " Data Consistency: $DATA_INTACT" -echo " Constraints: Working" -echo "==========================================" diff --git a/cmd/litestream-test/scripts/test-fresh-start.sh b/cmd/litestream-test/scripts/test-fresh-start.sh deleted file mode 100755 index f3365f01d..000000000 --- a/cmd/litestream-test/scripts/test-fresh-start.sh +++ /dev/null @@ -1,139 +0,0 @@ -#!/bin/bash - -# Test: Starting replication with a fresh (empty) database -# This tests if Litestream works better when it creates the database from scratch - -set -e - -echo "==========================================" -echo "Fresh Start Database Test" -echo "==========================================" -echo "" -echo "Testing if Litestream works correctly when starting fresh" -echo "" - -# Configuration -DB="/tmp/fresh-test.db" -REPLICA="/tmp/fresh-replica" -LITESTREAM="./bin/litestream" -LITESTREAM_TEST="./bin/litestream-test" - -# Clean up -echo "[SETUP] Cleaning up..." -rm -f "$DB"* -rm -rf "$REPLICA" - -# Check binaries -if [ ! -f "$LITESTREAM" ]; then - echo "ERROR: $LITESTREAM not found" - exit 1 -fi - -if [ ! 
-f "$LITESTREAM_TEST" ]; then - echo "ERROR: $LITESTREAM_TEST not found" - exit 1 -fi - -# Start Litestream BEFORE creating database -echo "" -echo "[1] Starting Litestream with non-existent database..." -$LITESTREAM replicate "$DB" "file://$REPLICA" > /tmp/fresh-test.log 2>&1 & -LITESTREAM_PID=$! -sleep 2 - -if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✓ Expected: Litestream waiting for database to be created" -else - echo " ✓ Litestream running (PID: $LITESTREAM_PID)" -fi - -# Now create and populate the database -echo "" -echo "[2] Creating database while Litestream is running..." -sqlite3 "$DB" </dev/null || echo "0") -if [ "$ERROR_COUNT" -gt 1 ]; then - echo " ⚠ Found $ERROR_COUNT errors:" - grep "ERROR" /tmp/fresh-test.log | head -3 -else - echo " ✓ No significant errors" -fi - -# Check replica files -echo "" -echo "[6] Checking replica files..." -if [ -d "$REPLICA/ltx" ]; then - FILE_COUNT=$(find "$REPLICA/ltx" -name "*.ltx" | wc -l) - echo " ✓ Replica created with $FILE_COUNT LTX files" - ls -la "$REPLICA/ltx/0/" 2>/dev/null | head -3 -else - echo " ✗ No replica files created!" -fi - -# Stop Litestream -kill $LITESTREAM_PID 2>/dev/null || true -sleep 2 - -# Test restore -echo "" -echo "[7] Testing restore..." -rm -f /tmp/fresh-restored.db -if $LITESTREAM restore -o /tmp/fresh-restored.db "file://$REPLICA" 2>&1; then - echo " ✓ Restore successful" - - # Verify data - ORIG_COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM test;") - REST_COUNT=$(sqlite3 /tmp/fresh-restored.db "SELECT COUNT(*) FROM test;") - - if [ "$ORIG_COUNT" -eq "$REST_COUNT" ]; then - echo " ✓ Data integrity verified: $ORIG_COUNT rows" - echo "" - echo "TEST PASSED: Fresh start works correctly" - else - echo " ✗ Data mismatch: Original=$ORIG_COUNT, Restored=$REST_COUNT" - echo "" - echo "TEST FAILED: Data loss detected" - fi -else - echo " ✗ Restore failed!" - echo "" - echo "TEST FAILED: Cannot restore database" -fi - -echo "" -echo "==========================================" -echo "Test artifacts:" -echo " Database: $DB" -echo " Replica: $REPLICA" -echo " Log: /tmp/fresh-test.log" -echo "==========================================" diff --git a/cmd/litestream-test/scripts/test-rapid-checkpoints.sh b/cmd/litestream-test/scripts/test-rapid-checkpoints.sh deleted file mode 100755 index ac405de80..000000000 --- a/cmd/litestream-test/scripts/test-rapid-checkpoints.sh +++ /dev/null @@ -1,173 +0,0 @@ -#!/bin/bash - -# Test: Rapid Checkpoint Cycling -# This tests Litestream's behavior under rapid checkpoint pressure - -set -e - -echo "==========================================" -echo "Rapid Checkpoint Cycling Test" -echo "==========================================" -echo "" -echo "Testing Litestream under rapid checkpoint pressure" -echo "" - -# Configuration -DB="/tmp/checkpoint-cycle.db" -REPLICA="/tmp/checkpoint-cycle-replica" -LITESTREAM="./bin/litestream" -LITESTREAM_TEST="./bin/litestream-test" - -# Clean up -echo "[SETUP] Cleaning up..." -rm -f "$DB"* -rm -rf "$REPLICA" - -# Start with fresh database -echo "[1] Creating initial database..." -sqlite3 "$DB" < /tmp/checkpoint-cycle.log 2>&1 & -LITESTREAM_PID=$! -sleep 3 - -if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✗ Litestream failed to start" - cat /tmp/checkpoint-cycle.log - exit 1 -fi -echo " ✓ Litestream running (PID: $LITESTREAM_PID)" - -# Start continuous writes in background -echo "" -echo "[3] Starting continuous writes..." 
-( - while kill -0 $LITESTREAM_PID 2>/dev/null; do - sqlite3 "$DB" "INSERT INTO test (data) VALUES (randomblob(1000));" 2>/dev/null || true - sleep 0.01 # 100 writes/sec attempt - done -) & -WRITE_PID=$! -echo " ✓ Write loop started" - -# Rapid checkpoint cycling -echo "" -echo "[4] Starting rapid checkpoint cycling (30 seconds)..." -echo " Testing all checkpoint modes in rapid succession..." - -CHECKPOINT_COUNT=0 -ERRORS=0 -START_TIME=$(date +%s) - -while [ $(($(date +%s) - START_TIME)) -lt 30 ]; do - # Cycle through different checkpoint modes - for MODE in PASSIVE FULL RESTART TRUNCATE; do - if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✗ Litestream crashed during checkpoint!" - break 2 - fi - - # Execute checkpoint - OUTPUT=$(sqlite3 "$DB" "PRAGMA wal_checkpoint($MODE);" 2>&1) || { - ERRORS=$((ERRORS + 1)) - echo " ⚠ Checkpoint $MODE error: $OUTPUT" - } - CHECKPOINT_COUNT=$((CHECKPOINT_COUNT + 1)) - - # Very brief pause - sleep 0.1 - done -done - -echo " Executed $CHECKPOINT_COUNT checkpoints with $ERRORS errors" - -# Stop writes -kill $WRITE_PID 2>/dev/null || true - -# Let Litestream catch up -echo "" -echo "[5] Letting Litestream stabilize..." -sleep 5 - -# Check Litestream health -if kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✓ Litestream survived rapid checkpointing" -else - echo " ✗ Litestream died during test" -fi - -# Check for sync errors -echo "" -echo "[6] Checking for sync errors..." -SYNC_ERRORS=$(grep -c "sync error" /tmp/checkpoint-cycle.log 2>/dev/null || echo "0") -FLAGS_ERRORS=$(grep -c "no flags allowed" /tmp/checkpoint-cycle.log 2>/dev/null || echo "0") - -if [ "$FLAGS_ERRORS" -gt 0 ]; then - echo " ✗ ltx v0.5.0 flag errors detected: $FLAGS_ERRORS" -elif [ "$SYNC_ERRORS" -gt 0 ]; then - echo " ⚠ Sync errors detected: $SYNC_ERRORS" -else - echo " ✓ No sync errors" -fi - -# Check replica status -echo "" -echo "[7] Checking replica status..." -if [ -d "$REPLICA/ltx" ]; then - LTX_COUNT=$(find "$REPLICA/ltx" -name "*.ltx" | wc -l) - echo " ✓ Replica has $LTX_COUNT LTX files" -else - echo " ✗ No replica created!" -fi - -# Get final stats -ROW_COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM test;" 2>/dev/null || echo "0") -WAL_SIZE=$(stat -f%z "$DB-wal" 2>/dev/null || stat -c%s "$DB-wal" 2>/dev/null || echo "0") -echo " Final row count: $ROW_COUNT" -echo " Final WAL size: $((WAL_SIZE / 1024))KB" - -# Stop Litestream -kill $LITESTREAM_PID 2>/dev/null || true -sleep 2 - -# Test restore -echo "" -echo "[8] Testing restore after rapid checkpointing..." -rm -f /tmp/checkpoint-restored.db -if $LITESTREAM restore -o /tmp/checkpoint-restored.db "file://$REPLICA" 2>&1 | tee /tmp/restore-checkpoint.log; then - REST_COUNT=$(sqlite3 /tmp/checkpoint-restored.db "SELECT COUNT(*) FROM test;" 2>/dev/null || echo "0") - - if [ "$REST_COUNT" -eq "$ROW_COUNT" ]; then - echo " ✓ Restore successful: $REST_COUNT rows" - echo "" - echo "TEST PASSED: Survived $CHECKPOINT_COUNT rapid checkpoints" - else - echo " ⚠ Row count mismatch: Original=$ROW_COUNT, Restored=$REST_COUNT" - LOSS=$((ROW_COUNT - REST_COUNT)) - echo " Data loss: $LOSS rows" - echo "" - echo "TEST FAILED: Data loss after rapid checkpointing" - fi -else - echo " ✗ Restore failed!" 
- cat /tmp/restore-checkpoint.log - echo "" - echo "TEST FAILED: Cannot restore after rapid checkpointing" -fi - -echo "" -echo "==========================================" -echo "Summary:" -echo " Checkpoints executed: $CHECKPOINT_COUNT" -echo " Checkpoint errors: $ERRORS" -echo " Sync errors: $SYNC_ERRORS" -echo " Flag errors: $FLAGS_ERRORS" -echo " Rows written: $ROW_COUNT" -echo "==========================================" diff --git a/cmd/litestream-test/scripts/test-replica-failover.sh b/cmd/litestream-test/scripts/test-replica-failover.sh deleted file mode 100755 index 20d6e9d23..000000000 --- a/cmd/litestream-test/scripts/test-replica-failover.sh +++ /dev/null @@ -1,203 +0,0 @@ -#!/bin/bash -set -e - -# Test multiple replica failover (Issue #687) -# This test verifies that restore falls back to healthy replicas when primary fails - -echo "==========================================" -echo "Multiple Replica Failover Test" -echo "==========================================" -echo "" -echo "Testing if restore falls back to healthy replicas when first is unavailable" -echo "" - -# Configuration -DB="/tmp/failover-test.db" -REPLICA1="/tmp/failover-replica1" -REPLICA2="/tmp/failover-replica2" -REPLICA3="/tmp/failover-replica3" -RESTORED="/tmp/failover-restored.db" -LITESTREAM_CONFIG="/tmp/failover-litestream.yml" -LITESTREAM="./bin/litestream" -LITESTREAM_TEST="./bin/litestream-test" - -# Cleanup function -cleanup() { - pkill -f "litestream replicate.*failover-test" 2>/dev/null || true - rm -f "$DB" "$DB-wal" "$DB-shm" "$DB-litestream" - rm -f "$RESTORED" "$RESTORED-wal" "$RESTORED-shm" - rm -rf "$REPLICA1" "$REPLICA2" "$REPLICA3" - rm -f "$LITESTREAM_CONFIG" - rm -f /tmp/failover-*.log -} - -trap cleanup EXIT - -echo "[SETUP] Cleaning up previous test files..." -cleanup - -echo "" -echo "[1] Creating test database..." -sqlite3 "$DB" < "$LITESTREAM_CONFIG" < /tmp/failover-litestream.log 2>&1 & -LITESTREAM_PID=$! -sleep 3 - -if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✗ Litestream failed to start" - cat /tmp/failover-litestream.log - exit 1 -fi -echo " ✓ Litestream running (PID: $LITESTREAM_PID)" - -echo "" -echo "[4] Adding data to ensure replication..." -for i in {1..10}; do - sqlite3 "$DB" "INSERT INTO test (data) VALUES ('Replicated data $i');" -done -sqlite3 "$DB" "PRAGMA wal_checkpoint(FULL);" >/dev/null 2>&1 -sleep 3 -echo " ✓ Added 10 rows and checkpointed" - -# Verify all replicas exist -echo "" -echo "[5] Verifying all replicas have data..." -for replica in "$REPLICA1" "$REPLICA2" "$REPLICA3"; do - if [ -d "$replica" ]; then - FILES=$(ls -1 "$replica"/generations/*/wal/*.ltx 2>/dev/null | wc -l) - echo " ✓ $(basename $replica): $FILES LTX files" - else - echo " ✗ $(basename $replica): Not created!" - exit 1 - fi -done - -echo "" -echo "[6] Stopping Litestream..." -kill $LITESTREAM_PID -wait $LITESTREAM_PID 2>/dev/null -echo " ✓ Litestream stopped" - -# Test 1: All replicas available -echo "" -echo "[7] Test 1: Restore with all replicas available..." -"$LITESTREAM" restore -config "$LITESTREAM_CONFIG" -o "$RESTORED" "$DB" > /tmp/failover-restore1.log 2>&1 -if [ $? -eq 0 ]; then - COUNT=$(sqlite3 "$RESTORED" "SELECT COUNT(*) FROM test;" 2>/dev/null || echo "0") - echo " ✓ Restore successful with all replicas: $COUNT rows" - rm -f "$RESTORED" "$RESTORED-wal" "$RESTORED-shm" -else - echo " ✗ Restore failed with all replicas available" - cat /tmp/failover-restore1.log -fi - -# Test 2: First replica corrupted -echo "" -echo "[8] Test 2: Corrupting first replica..." 
-rm -rf "$REPLICA1"/generations/*/wal/*.ltx -echo "CORRUPTED" > "$REPLICA1/CORRUPTED" -echo " ✓ First replica corrupted" - -echo " Attempting restore with first replica corrupted..." -"$LITESTREAM" restore -config "$LITESTREAM_CONFIG" -o "$RESTORED" "$DB" > /tmp/failover-restore2.log 2>&1 -if [ $? -eq 0 ]; then - COUNT=$(sqlite3 "$RESTORED" "SELECT COUNT(*) FROM test;" 2>/dev/null || echo "0") - if [ "$COUNT" -eq "11" ]; then - echo " ✓ Successfully fell back to healthy replicas: $COUNT rows" - else - echo " ✗ Restore succeeded but data incorrect: $COUNT rows (expected 11)" - fi - rm -f "$RESTORED" "$RESTORED-wal" "$RESTORED-shm" -else - echo " ✗ FAILED: Did not fall back to healthy replicas" - cat /tmp/failover-restore2.log -fi - -# Test 3: First replica missing entirely -echo "" -echo "[9] Test 3: Removing first replica entirely..." -rm -rf "$REPLICA1" -echo " ✓ First replica removed" - -echo " Attempting restore with first replica missing..." -"$LITESTREAM" restore -config "$LITESTREAM_CONFIG" -o "$RESTORED" "$DB" > /tmp/failover-restore3.log 2>&1 -if [ $? -eq 0 ]; then - COUNT=$(sqlite3 "$RESTORED" "SELECT COUNT(*) FROM test;" 2>/dev/null || echo "0") - if [ "$COUNT" -eq "11" ]; then - echo " ✓ Successfully fell back to remaining replicas: $COUNT rows" - else - echo " ✗ Restore succeeded but data incorrect: $COUNT rows (expected 11)" - fi - rm -f "$RESTORED" "$RESTORED-wal" "$RESTORED-shm" -else - echo " ✗ FAILED: Did not fall back when first replica missing" - cat /tmp/failover-restore3.log -fi - -# Test 4: Only last replica healthy -echo "" -echo "[10] Test 4: Corrupting second replica too..." -rm -rf "$REPLICA2" -echo " ✓ Second replica removed" - -echo " Attempting restore with only third replica healthy..." -"$LITESTREAM" restore -config "$LITESTREAM_CONFIG" -o "$RESTORED" "$DB" > /tmp/failover-restore4.log 2>&1 -if [ $? -eq 0 ]; then - COUNT=$(sqlite3 "$RESTORED" "SELECT COUNT(*) FROM test;" 2>/dev/null || echo "0") - if [ "$COUNT" -eq "11" ]; then - echo " ✓ Successfully restored from last healthy replica: $COUNT rows" - else - echo " ✗ Restore succeeded but data incorrect: $COUNT rows (expected 11)" - fi - rm -f "$RESTORED" "$RESTORED-wal" "$RESTORED-shm" -else - echo " ✗ FAILED: Could not restore from last healthy replica" - cat /tmp/failover-restore4.log -fi - -# Test 5: All replicas unavailable -echo "" -echo "[11] Test 5: Removing all replicas..." -rm -rf "$REPLICA3" -echo " ✓ All replicas removed" - -echo " Attempting restore with no healthy replicas..." -"$LITESTREAM" restore -config "$LITESTREAM_CONFIG" -o "$RESTORED" "$DB" > /tmp/failover-restore5.log 2>&1 -if [ $? 
-ne 0 ]; then - echo " ✓ Correctly failed when no replicas available" -else - echo " ✗ Unexpected success with no replicas" -fi - -echo "" -echo "==========================================" -echo "Failover Test Summary:" -echo " ✓ Restore works with all replicas" -echo " ✓ Falls back when first replica corrupted" -echo " ✓ Falls back when first replica missing" -echo " ✓ Works with only last replica healthy" -echo " ✓ Correctly fails when no replicas available" -echo "==========================================" diff --git a/cmd/litestream-test/scripts/test-wal-growth.sh b/cmd/litestream-test/scripts/test-wal-growth.sh deleted file mode 100755 index 7e6d95ce0..000000000 --- a/cmd/litestream-test/scripts/test-wal-growth.sh +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash - -# Test: WAL Growth and Size Limits -# This tests how Litestream handles extreme WAL growth scenarios - -set -e - -echo "==========================================" -echo "WAL Growth and Size Limits Test" -echo "==========================================" -echo "" -echo "Testing Litestream's handling of large WAL files" -echo "" - -# Configuration -DB="/tmp/wal-growth.db" -REPLICA="/tmp/wal-growth-replica" -LITESTREAM="./bin/litestream" -TARGET_WAL_SIZE_MB=100 # Target WAL size in MB - -# Clean up -echo "[SETUP] Cleaning up..." -rm -f "$DB"* -rm -rf "$REPLICA" - -# Create fresh database -echo "[1] Creating database..." -sqlite3 "$DB" < /tmp/wal-growth.log 2>&1 & -LITESTREAM_PID=$! -sleep 3 - -if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✗ Litestream failed to start" - cat /tmp/wal-growth.log | head -10 - exit 1 -fi -echo " ✓ Litestream running (PID: $LITESTREAM_PID)" - -# Write data until WAL reaches target size -echo "" -echo "[3] Growing WAL to ${TARGET_WAL_SIZE_MB}MB..." -echo " Writing large blobs without checkpointing..." - -BATCH_COUNT=0 -while true; do - # Check current WAL size - WAL_SIZE=$(stat -f%z "$DB-wal" 2>/dev/null || stat -c%s "$DB-wal" 2>/dev/null || echo "0") - WAL_SIZE_MB=$((WAL_SIZE / 1024 / 1024)) - - if [ $WAL_SIZE_MB -ge $TARGET_WAL_SIZE_MB ]; then - echo " ✓ WAL reached ${WAL_SIZE_MB}MB" - break - fi - - # Write a batch of large records - sqlite3 "$DB" </dev/null || true -BEGIN; -INSERT INTO test (data) SELECT randomblob(10000) FROM generate_series(1, 100); -COMMIT; -EOF - - BATCH_COUNT=$((BATCH_COUNT + 1)) - if [ $((BATCH_COUNT % 10)) -eq 0 ]; then - echo " WAL size: ${WAL_SIZE_MB}MB / ${TARGET_WAL_SIZE_MB}MB" - fi - - # Check if Litestream is still alive - if ! kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✗ Litestream died during WAL growth!" - break - fi -done - -# Check Litestream status -echo "" -echo "[4] Checking Litestream status with large WAL..." -if kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✓ Litestream still running with ${WAL_SIZE_MB}MB WAL" - - # Check replication lag - sleep 5 - LATEST_LTX=$(ls -t "$REPLICA/ltx/0/" 2>/dev/null | head -1) - if [ -n "$LATEST_LTX" ]; then - echo " ✓ Still replicating (latest: $LATEST_LTX)" - else - echo " ⚠ No recent replication activity" - fi -else - echo " ✗ Litestream crashed!" -fi - -# Check for errors -echo "" -echo "[5] Checking for errors..." -ERROR_COUNT=$(grep -c "ERROR" /tmp/wal-growth.log 2>/dev/null || echo "0") -OOM_COUNT=$(grep -c -i "out of memory\|oom" /tmp/wal-growth.log 2>/dev/null || echo "0") - -if [ "$OOM_COUNT" -gt 0 ]; then - echo " ✗ Out of memory errors detected!" 
-elif [ "$ERROR_COUNT" -gt 1 ]; then - echo " ⚠ Errors detected: $ERROR_COUNT" - grep "ERROR" /tmp/wal-growth.log | tail -3 -else - echo " ✓ No significant errors" -fi - -# Get statistics -echo "" -echo "[6] Statistics..." -ROW_COUNT=$(sqlite3 "$DB" "SELECT COUNT(*) FROM test;" 2>/dev/null || echo "0") -DB_SIZE=$(stat -f%z "$DB" 2>/dev/null || stat -c%s "$DB" 2>/dev/null || echo "0") -LTX_COUNT=$(find "$REPLICA" -name "*.ltx" 2>/dev/null | wc -l || echo "0") - -echo " Database size: $((DB_SIZE / 1024 / 1024))MB" -echo " WAL size: ${WAL_SIZE_MB}MB" -echo " Row count: $ROW_COUNT" -echo " LTX files: $LTX_COUNT" - -# Now checkpoint and see what happens -echo "" -echo "[7] Executing checkpoint on large WAL..." -CHECKPOINT_START=$(date +%s) -CHECKPOINT_RESULT=$(sqlite3 "$DB" "PRAGMA wal_checkpoint(FULL);" 2>&1) || echo "Failed" -CHECKPOINT_END=$(date +%s) -CHECKPOINT_TIME=$((CHECKPOINT_END - CHECKPOINT_START)) - -echo " Checkpoint result: $CHECKPOINT_RESULT" -echo " Checkpoint time: ${CHECKPOINT_TIME}s" - -# Check WAL size after checkpoint -NEW_WAL_SIZE=$(stat -f%z "$DB-wal" 2>/dev/null || stat -c%s "$DB-wal" 2>/dev/null || echo "0") -NEW_WAL_SIZE_MB=$((NEW_WAL_SIZE / 1024 / 1024)) -echo " WAL size after checkpoint: ${NEW_WAL_SIZE_MB}MB" - -# Let Litestream catch up -echo "" -echo "[8] Letting Litestream catch up after checkpoint..." -sleep 10 - -# Check if Litestream survived -if kill -0 $LITESTREAM_PID 2>/dev/null; then - echo " ✓ Litestream survived large checkpoint" -else - echo " ✗ Litestream died after checkpoint" -fi - -# Stop Litestream -kill $LITESTREAM_PID 2>/dev/null || true -sleep 2 - -# Test restore -echo "" -echo "[9] Testing restore after large WAL handling..." -rm -f /tmp/wal-restored.db -if $LITESTREAM restore -o /tmp/wal-restored.db "file://$REPLICA" 2>&1 | tee /tmp/restore-wal.log; then - REST_COUNT=$(sqlite3 /tmp/wal-restored.db "SELECT COUNT(*) FROM test;" 2>/dev/null || echo "0") - - if [ "$REST_COUNT" -eq "$ROW_COUNT" ]; then - echo " ✓ Restore successful: $REST_COUNT rows" - echo "" - echo "TEST PASSED: Handled ${TARGET_WAL_SIZE_MB}MB WAL successfully" - else - echo " ⚠ Row count mismatch: Original=$ROW_COUNT, Restored=$REST_COUNT" - echo "" - echo "TEST FAILED: Data loss with large WAL" - fi -else - echo " ✗ Restore failed!" - echo "" - echo "TEST FAILED: Cannot restore after large WAL" -fi - -echo "" -echo "==========================================" -echo "Summary:" -echo " Maximum WAL size tested: ${WAL_SIZE_MB}MB" -echo " Checkpoint time: ${CHECKPOINT_TIME}s" -echo " Data integrity: $([ "$REST_COUNT" -eq "$ROW_COUNT" ] && echo "✓ Preserved" || echo "✗ Lost")" -echo "==========================================" diff --git a/scripts/README.md b/scripts/README.md index 2dd3a4b97..fc8f8ff26 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,14 +1,12 @@ -# Integration Test Scripts +# Utility Scripts -Long-running integration test scripts for comprehensive Litestream validation. These scripts are designed for extended testing scenarios, including overnight tests and production-like workloads. +Utility scripts for Litestream testing and distribution. ## Overview -This directory contains integration test scripts that run for extended periods (30 minutes to 8+ hours) to validate Litestream's behavior under sustained load and realistic production scenarios. +This directory contains utility scripts for post-test analysis and packaging. All long-running soak tests have been migrated to Go integration tests in `tests/integration/`. 
-**Key Difference from `cmd/litestream-test/scripts/`:** -- **This directory:** Long-running integration tests (minutes to hours) -- **`cmd/litestream-test/scripts/`:** Focused scenario tests (seconds to minutes) +> **Note:** For all soak tests (2-8 hours), see the Go-based test suite in [tests/integration/](../tests/integration/README.md). The bash soak tests have been migrated to Go for better maintainability and cross-platform support ## Prerequisites @@ -17,209 +15,7 @@ go build -o bin/litestream ./cmd/litestream go build -o bin/litestream-test ./cmd/litestream-test ``` -## Test Scripts - -### test-quick-validation.sh - -Quick validation test that runs for a configurable duration (default: 30 minutes). - -```bash -./scripts/test-quick-validation.sh - -TEST_DURATION=2h ./scripts/test-quick-validation.sh - -TEST_DURATION=1h ./scripts/test-quick-validation.sh -``` - -**Default Configuration:** -- Duration: 30 minutes (configurable via `TEST_DURATION`) -- Database: 10MB initial population -- Write rate: 100 writes/second -- Pattern: Wave (simulates varying load) -- Payload size: 4KB -- Workers: 4 -- Replica: File-based - -**Features:** -- Aggressive test settings for quick feedback -- Very frequent snapshots (1 minute intervals) -- Rapid compaction cycles (30s, 1m, 5m, 15m) -- Real-time monitoring every 30 seconds -- Automatic validation and restore testing -- Comprehensive final report - -**Monitoring:** -```bash -tail -f /tmp/litestream-quick-*/logs/monitor.log -tail -f /tmp/litestream-quick-*/logs/litestream.log -``` - -**What it Tests:** -- Snapshot creation frequency -- Compaction behavior across multiple intervals -- LTX file generation and management -- Checkpoint behavior under load -- Replication integrity -- Restoration success -- Error handling - -**When to Use:** -- Before running overnight tests -- Validating configuration changes -- Quick regression testing -- CI/CD integration (with short duration) -- Pre-release validation - -**Success Criteria:** -- LTX segments created (>0) -- No critical errors in logs -- Successful restoration -- Row counts match between source and restored database - -### test-overnight.sh - -Comprehensive 8-hour test with file-based replication. 
- -```bash -./scripts/test-overnight.sh -``` - -**Configuration:** -- Duration: 8 hours -- Database: 100MB initial population -- Write rate: 50 writes/second -- Pattern: Wave (simulates varying load) -- Payload size: 2KB -- Workers: 4 -- Replica: File-based (`/tmp/litestream-overnight-*/replica`) - -**Features:** -- Extended monitoring with 1-minute updates -- Snapshot every 10 minutes -- Aggressive compaction intervals: - - 30 seconds → 30s duration - - 1 minute → 1m duration - - 5 minutes → 5m duration - - 15 minutes → 1h duration - - 30 minutes → 6h duration - - 1 hour → 24h duration -- 720-hour retention (30 days) -- Checkpoint every 30 seconds -- Automatic validation after completion - -**Real-time Monitoring:** -```bash -tail -f /tmp/litestream-overnight-*/logs/monitor.log -tail -f /tmp/litestream-overnight-*/logs/litestream.log -tail -f /tmp/litestream-overnight-*/logs/load.log -``` - -**What it Tests:** -- Long-term replication stability -- Compaction effectiveness over time -- Memory stability under sustained load -- WAL file management -- Checkpoint consistency -- Replica file count growth patterns -- Error accumulation over time -- Recovery from transient issues - -**Expected Behavior:** -- Steady database growth over 8 hours -- Regular snapshot creation (48 total) -- Active compaction reducing old LTX files -- Stable memory usage -- No error accumulation -- Successful final validation - -**Artifacts:** -- Test directory: `/tmp/litestream-overnight-/` -- Logs: Monitor, litestream, load, populate, validate -- Database: Source and restored versions -- Replica: Full replica directory with LTX files - -### test-overnight-s3.sh - -Comprehensive 8-hour test with S3 replication. - -```bash -export AWS_ACCESS_KEY_ID=your_key -export AWS_SECRET_ACCESS_KEY=your_secret -export S3_BUCKET=your-test-bucket -export AWS_REGION=us-east-1 - -./scripts/test-overnight-s3.sh -``` - -**Configuration:** -- Duration: 8 hours -- Database: 100MB initial population -- Write rate: 100 writes/second (higher than file test) -- Pattern: Wave (simulates varying load) -- Payload size: 4KB (larger than file test) -- Workers: 8 (more than file test) -- Replica: S3 bucket with unique timestamped path - -**S3-Specific Settings:** -- Force path style: false -- Skip verify: false -- Optional SSE encryption support -- Region configurable via environment - -**Features:** -- Higher load than file-based test (S3 can handle more) -- S3 connectivity validation before start -- S3-specific error monitoring (403, 404, 500, 503) -- Upload operation tracking -- S3 object count monitoring -- Restoration from S3 after completion -- Automatic row count comparison - -**Real-time Monitoring:** -```bash -tail -f /tmp/litestream-overnight-s3-*/logs/monitor.log -tail -f /tmp/litestream-overnight-s3-*/logs/litestream.log - -aws s3 ls s3://your-bucket/litestream-overnight-/ --recursive -``` - -**What it Tests:** -- S3 replication stability -- Network resilience over 8 hours -- S3 API call efficiency -- Multipart upload handling -- S3-specific error recovery -- Cross-region replication (if configured) -- S3 cost implications (API calls, storage) -- Restoration from cloud storage - -**S3 Monitoring Includes:** -- Snapshot count in S3 -- WAL segment count in S3 -- Total S3 object count -- S3 storage size -- Upload operation count -- S3-specific errors - -**Expected Behavior:** -- Successful S3 connectivity throughout -- Regular S3 uploads without failures -- S3 object counts grow over time -- Compaction reduces old S3 objects -- Successful 
S3 restore at end -- Row count match between source and restored - -**Prerequisites:** -- Valid AWS credentials -- S3 bucket with write permissions -- Network connectivity to S3 -- AWS CLI installed (for monitoring) - -**Cost Considerations:** -- ~8 hours of continuous uploads -- Estimated API calls: Thousands of PUTs/GETs -- Storage: 100MB+ depending on replication -- Consider using a test/dev account +## Available Scripts ### analyze-test-results.sh @@ -272,37 +68,9 @@ Homebrew tap setup script for packaging and distribution. **Purpose:** Automates Homebrew tap setup for Litestream distribution. Not a test script per se, but part of the release process. -## Usage Patterns +## Usage -### Quick Validation Before Overnight Test - -```bash -TEST_DURATION=30m ./scripts/test-quick-validation.sh -``` - -If this passes, proceed to overnight: -```bash -./scripts/test-overnight.sh -``` - -### Running Multiple Overnight Tests - -File and S3 tests can run concurrently (different machines recommended): - -```bash -./scripts/test-overnight.sh & -./scripts/test-overnight-s3.sh & -``` - -### Custom Duration Testing - -```bash -TEST_DURATION=2h ./scripts/test-quick-validation.sh -TEST_DURATION=4h ./scripts/test-quick-validation.sh -TEST_DURATION=12h ./scripts/test-quick-validation.sh -``` - -### Analyzing Results +### Analyzing Test Results ```bash ls /tmp/litestream-overnight-* -dt | head -1 @@ -310,25 +78,15 @@ ls /tmp/litestream-overnight-* -dt | head -1 ./scripts/analyze-test-results.sh $(ls /tmp/litestream-overnight-* -dt | head -1) ``` -### Continuous Integration - -For CI/CD, use shorter durations: - -```bash -TEST_DURATION=5m ./scripts/test-quick-validation.sh -TEST_DURATION=15m ./scripts/test-quick-validation.sh -``` - ## Test Duration Guide | Duration | Use Case | Test Type | Expected Results | |----------|----------|-----------|------------------| -| 5 minutes | CI/CD smoke test | Quick validation | Basic functionality | -| 30 minutes | Pre-overnight validation | Quick validation | Config verification | -| 1 hour | Short integration | Quick validation | Pattern detection | -| 2 hours | Extended integration | Quick validation | Compaction cycles | -| 8 hours | Overnight stability | Overnight test | Full validation | -| 12+ hours | Stress testing | Overnight test | Edge case discovery | +| 5 minutes | CI/CD smoke test | Go integration tests | Basic functionality | +| 30 minutes | Short integration | Go integration tests | Pattern detection | +| 2-8 hours | Soak testing | Go soak tests (local only) | Full validation | + +> **Note:** All soak tests are now Go-based in `tests/integration/`. See [tests/integration/README.md](../tests/integration/README.md) for details on running comprehensive, MinIO, and overnight S3 soak tests. ## Monitoring and Debugging @@ -409,12 +167,7 @@ sqlite3 /tmp/litestream-*/restored.db "SELECT COUNT(*) FROM test_data" ### Stopping Tests Early -Tests can be interrupted with Ctrl+C. They will cleanup gracefully: -```bash -./scripts/test-overnight.sh -^C -Cleaning up... -``` +Go tests can be interrupted with Ctrl+C. They will cleanup gracefully via defer statements. 
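To interrupt only a single long test rather than the whole suite, one of the soak tests named in the workflow below can be run directly. A sketch under stated assumptions — the `long` build tag, the timeout, and the `-run` filter syntax are assumptions and may differ from the actual suite configuration:

```bash
# Run one soak test locally (test name taken from the workflow section below);
# the "integration,long" tags and the 3h timeout are assumptions.
go test -v -tags="integration,long" -timeout=3h ./tests/integration/... -run TestComprehensiveSoak
```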
## Test Artifacts @@ -436,59 +189,24 @@ All tests create timestamped directories with comprehensive artifacts: └── restored.db # Restored database for validation ``` -## Integration with Other Tests - -These scripts complement the scenario tests in `cmd/litestream-test/scripts/`: +## Integration with Go Tests -**Relationship:** -- `cmd/litestream-test/scripts/` → Focused scenarios (seconds to ~30 minutes) -- `scripts/` → Integration tests (30 minutes to 8+ hours) +These utility scripts complement the Go integration test suite: -**Workflow:** -1. Run focused scenario tests during development -2. Run quick validation (30min) before major changes -3. Run overnight tests (8h) before releases -4. Analyze results with analysis script +**Test Locations:** +- `tests/integration/` → All integration and soak tests (Go-based) +- `cmd/litestream-test/scripts/` → Scenario and debugging tests (bash, being phased out) +- `scripts/` → Utilities only (this directory) -## Success Criteria - -### Quick Validation (30min) - -✅ Pass Criteria: -- LTX segments created (>0) -- At least 1 snapshot created -- Multiple compaction cycles completed -- No critical errors -- Successful restoration -- Row count matches - -### Overnight Tests (8h) - -✅ Pass Criteria: -- No process crashes -- Error count < 10 (excluding transient) -- Steady database growth -- Regular snapshots (40+) -- Active compaction visible -- Successful final restoration -- Row count match -- Memory usage stable +**Testing Workflow:** +1. Run quick integration tests during development +2. Run full integration test suite before major changes +3. Run soak tests (2-8h) locally before releases: `TestComprehensiveSoak`, `TestMinIOSoak`, `TestOvernightS3Soak` +4. Analyze results with `analyze-test-results.sh` ## Related Documentation +- [Go Integration Tests](../tests/integration/README.md) - Complete Go-based test suite including soak tests - [litestream-test CLI Tool](../cmd/litestream-test/README.md) - Testing harness documentation - [Scenario Test Scripts](../cmd/litestream-test/scripts/README.md) - Focused test scenarios - [S3 Retention Testing](../cmd/litestream-test/S3-RETENTION-TESTING.md) - S3-specific testing - -## Contributing - -When adding new integration scripts: - -1. Follow naming conventions (`test-*.sh`) -2. Include clear duration estimates in comments -3. Create comprehensive monitoring -4. Generate timestamped test directories -5. Implement graceful cleanup with `trap` -6. Provide clear success/failure output -7. Update this README with script documentation -8. 
Consider both file and S3 variants if applicable diff --git a/scripts/test-comprehensive.sh b/scripts/test-comprehensive.sh deleted file mode 100755 index b02679cd1..000000000 --- a/scripts/test-comprehensive.sh +++ /dev/null @@ -1,392 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Comprehensive validation test with aggressive settings -# This test exercises all Litestream features: replication, snapshots, compaction, checkpoints -# Can be run for any duration - defaults to 2 hours for thorough testing - -TEST_DURATION="${TEST_DURATION:-2h}" -TEST_DIR="/tmp/litestream-comprehensive-$(date +%Y%m%d-%H%M%S)" -DB_PATH="$TEST_DIR/test.db" -REPLICA_PATH="$TEST_DIR/replica" -CONFIG_FILE="$TEST_DIR/litestream.yml" -LOG_DIR="$TEST_DIR/logs" - -echo "================================================" -echo "Litestream Comprehensive Validation Test" -echo "================================================" -echo "Duration: $TEST_DURATION" -echo "Test directory: $TEST_DIR" -echo "Start time: $(date)" -echo "" -echo "This test uses aggressive settings to validate:" -echo " - Continuous replication" -echo " - Snapshot generation (every 10m)" -echo " - Compaction (30s/1m/5m intervals)" -echo " - Checkpoint operations" -echo " - Database restoration" -echo "" - -cleanup() { - echo "" - echo "Cleaning up..." - - # Kill all spawned processes - jobs -p | xargs -r kill 2>/dev/null || true - wait - - echo "Test completed at: $(date)" - echo "Results saved in: $TEST_DIR" -} - -trap cleanup EXIT INT TERM - -# Create directories -mkdir -p "$TEST_DIR" "$LOG_DIR" "$REPLICA_PATH" - -# Build binaries if needed -echo "Building binaries..." -if [ ! -f bin/litestream ]; then - go build -o bin/litestream ./cmd/litestream -fi -if [ ! -f bin/litestream-test ]; then - go build -o bin/litestream-test ./cmd/litestream-test -fi - -# Create test database and populate BEFORE starting litestream -echo "Creating and populating test database..." -sqlite3 "$DB_PATH" < "$LOG_DIR/populate.log" 2>&1 -if [ $? -ne 0 ]; then - echo "Warning: Population failed, but continuing..." - cat "$LOG_DIR/populate.log" -fi - -# Create configuration with Ben's recommended aggressive settings -echo "Creating test configuration with aggressive intervals..." -cat > "$CONFIG_FILE" < "$LOG_DIR/litestream.log" 2>&1 & -LITESTREAM_PID=$! - -sleep 3 - -if ! kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo "ERROR: Litestream failed to start!" - tail -50 "$LOG_DIR/litestream.log" - exit 1 -fi - -echo "Litestream running (PID: $LITESTREAM_PID)" -echo "" - -# Start load generator with heavy sustained load -echo "Starting load generator (heavy sustained load)..." -bin/litestream-test load \ - -db "$DB_PATH" \ - -write-rate 500 \ - -duration "$TEST_DURATION" \ - -pattern wave \ - -payload-size 4096 \ - -read-ratio 0.3 \ - -workers 8 \ - > "$LOG_DIR/load.log" 2>&1 & -LOAD_PID=$! 
- -echo "Load generator running (PID: $LOAD_PID)" -echo "" - -# Monitor function with detailed metrics -monitor_comprehensive() { - local last_checkpoint_count=0 - local last_compaction_count=0 - local last_sync_count=0 - - while true; do - sleep 60 # Check every minute - - echo "[$(date +%H:%M:%S)] Status Report" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - - # Database metrics - if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null) - echo " Database size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" - - # WAL file size (indicates write activity) - if [ -f "$DB_PATH-wal" ]; then - WAL_SIZE=$(stat -f%z "$DB_PATH-wal" 2>/dev/null || stat -c%s "$DB_PATH-wal" 2>/dev/null) - echo " WAL size: $(numfmt --to=iec-i --suffix=B $WAL_SIZE 2>/dev/null || echo "$WAL_SIZE bytes")" - fi - - # Row count - TABLES=$(sqlite3 "$DB_PATH" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - echo " Rows in database: $ROW_COUNT" - elif echo "$TABLES" | grep -q "test_table_0"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - echo " Rows in database: $ROW_COUNT" - fi - fi - - # Replication metrics - if [ -d "$REPLICA_PATH" ]; then - # Count snapshot files - SNAPSHOTS=$(find "$REPLICA_PATH" -name "*snapshot*.ltx" 2>/dev/null | wc -l | tr -d ' ') - # Count LTX files (WAL segments) - LTX_FILES=$(find "$REPLICA_PATH" -name "*.ltx" 2>/dev/null | wc -l | tr -d ' ') - REPLICA_SIZE=$(du -sh "$REPLICA_PATH" 2>/dev/null | cut -f1) - echo " Replica: $SNAPSHOTS snapshots, $LTX_FILES segments, size: $REPLICA_SIZE" - fi - - # Operation metrics (with delta since last check) - if [ -f "$LOG_DIR/litestream.log" ]; then - CHECKPOINT_COUNT=$(grep -c "checkpoint" "$LOG_DIR/litestream.log" 2>/dev/null) - CHECKPOINT_COUNT=${CHECKPOINT_COUNT:-0} - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" 2>/dev/null) - COMPACTION_COUNT=${COMPACTION_COUNT:-0} - SYNC_COUNT=$(grep -c "replica sync" "$LOG_DIR/litestream.log" 2>/dev/null) - SYNC_COUNT=${SYNC_COUNT:-0} - - CHECKPOINT_DELTA=$((CHECKPOINT_COUNT - last_checkpoint_count)) - COMPACTION_DELTA=$((COMPACTION_COUNT - last_compaction_count)) - SYNC_DELTA=$((SYNC_COUNT - last_sync_count)) - - echo " Operations: $CHECKPOINT_COUNT checkpoints (+$CHECKPOINT_DELTA), $COMPACTION_COUNT compactions (+$COMPACTION_DELTA)" - echo " Syncs: $SYNC_COUNT total (+$SYNC_DELTA in last minute)" - - last_checkpoint_count=$CHECKPOINT_COUNT - last_compaction_count=$COMPACTION_COUNT - last_sync_count=$SYNC_COUNT - fi - - # Check for errors (excluding known non-critical) - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" 2>/dev/null | grep -v "page size not initialized" | wc -l | tr -d ' ') - if [ "$ERROR_COUNT" -gt 0 ]; then - echo " ⚠ Critical errors: $ERROR_COUNT" - grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | tail -2 - fi - - # Load generator status - if [ -f "$LOG_DIR/load.log" ]; then - LOAD_STATUS=$(tail -1 "$LOG_DIR/load.log" 2>/dev/null | grep -oE "writes_per_sec=[0-9.]+" | cut -d= -f2 || echo "0") - echo " Write rate: ${LOAD_STATUS:-0} writes/sec" - fi - - # Check processes - if ! kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo " ✗ Litestream stopped unexpectedly!" - break - fi - - if ! 
kill -0 "$LOAD_PID" 2>/dev/null; then - echo " ✓ Load test completed" - break - fi - - echo "" - done -} - -echo "Running comprehensive test for $TEST_DURATION..." -echo "Monitor will report every 60 seconds" -echo "================================================" -echo "" - -# Start monitoring in background -monitor_comprehensive & -MONITOR_PID=$! - -# Wait for load test to complete -wait "$LOAD_PID" 2>/dev/null || true - -# Stop the monitor -kill $MONITOR_PID 2>/dev/null || true -wait $MONITOR_PID 2>/dev/null || true - -echo "" -echo "================================================" -echo "Final Test Results" -echo "================================================" - -# Final statistics -echo "Database Statistics:" -if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null) - # Find the actual table name - TABLES=$(sqlite3 "$DB_PATH" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_table_0"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_data"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - else - ROW_COUNT="0" - fi - echo " Final size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" - echo " Total rows: $ROW_COUNT" -fi - -echo "" -echo "Replication Statistics:" -if [ -d "$REPLICA_PATH" ]; then - SNAPSHOT_COUNT=$(find "$REPLICA_PATH" -name "*snapshot*.ltx" 2>/dev/null | wc -l | tr -d ' ') - LTX_COUNT=$(find "$REPLICA_PATH" -name "*.ltx" 2>/dev/null | wc -l | tr -d ' ') - REPLICA_SIZE=$(du -sh "$REPLICA_PATH" | cut -f1) - echo " Snapshots created: $SNAPSHOT_COUNT" - echo " LTX segments: $LTX_COUNT" - echo " Replica size: $REPLICA_SIZE" -fi - -echo "" -echo "Operation Counts:" -if [ -f "$LOG_DIR/litestream.log" ]; then - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" || echo "0") - CHECKPOINT_COUNT=$(grep -c "checkpoint" "$LOG_DIR/litestream.log" || echo "0") - SYNC_COUNT=$(grep -c "replica sync" "$LOG_DIR/litestream.log" || echo "0") - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | wc -l | tr -d ' ' || echo "0") -else - COMPACTION_COUNT="0" - CHECKPOINT_COUNT="0" - SYNC_COUNT="0" - ERROR_COUNT="0" -fi -echo " Compactions: $COMPACTION_COUNT" -echo " Checkpoints: $CHECKPOINT_COUNT" -echo " Syncs: $SYNC_COUNT" -echo " Errors: $ERROR_COUNT" - -# Validation test -echo "" -echo "Testing validation..." -bin/litestream-test validate \ - -source "$DB_PATH" \ - -replica "$REPLICA_PATH" \ - > "$LOG_DIR/validate.log" 2>&1 - -if [ $? -eq 0 ]; then - echo " ✓ Validation passed!" -else - echo " ✗ Validation failed!" - tail -10 "$LOG_DIR/validate.log" -fi - -# Test restoration -echo "" -echo "Testing restoration..." -RESTORE_DB="$TEST_DIR/restored.db" -bin/litestream restore -o "$RESTORE_DB" "file://$REPLICA_PATH" > "$LOG_DIR/restore.log" 2>&1 - -if [ $? 
-eq 0 ]; then - # Get row count from restored database - TABLES=$(sqlite3 "$RESTORE_DB" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_table_0"; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - else - RESTORED_COUNT="0" - fi - - if [ "$RESTORED_COUNT" = "$ROW_COUNT" ]; then - echo " ✓ Restoration successful! ($RESTORED_COUNT rows match)" - else - echo " ⚠ Row count mismatch! Original: $ROW_COUNT, Restored: $RESTORED_COUNT" - fi -else - echo " ✗ Restoration failed!" - tail -10 "$LOG_DIR/restore.log" -fi - -# Summary -echo "" -echo "================================================" -echo "Test Summary" -echo "================================================" - -# Count critical errors (exclude known non-critical ones) -CRITICAL_ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" 2>/dev/null | grep -v "page size not initialized" | wc -l | tr -d ' ') - -# Determine test result -TEST_PASSED=true -ISSUES="" - -if [ "$CRITICAL_ERROR_COUNT" -gt 0 ]; then - TEST_PASSED=false - ISSUES="$ISSUES\n - Critical errors detected: $CRITICAL_ERROR_COUNT" -fi - -if [ "$LTX_COUNT" -eq 0 ]; then - TEST_PASSED=false - ISSUES="$ISSUES\n - No LTX segments created (replication not working)" -fi - -if [ "$CHECKPOINT_COUNT" -eq 0 ]; then - ISSUES="$ISSUES\n - No checkpoints recorded (may need more aggressive settings)" -fi - -if [ "$COMPACTION_COUNT" -eq 0 ]; then - ISSUES="$ISSUES\n - No compactions occurred (unexpected for this test duration)" -fi - -if [ "$TEST_PASSED" = true ]; then - echo "✓ COMPREHENSIVE TEST PASSED!" - echo "" - echo "Successfully validated:" - echo " - Continuous replication ($LTX_COUNT segments)" - echo " - Compaction ($COMPACTION_COUNT operations)" - [ "$CHECKPOINT_COUNT" -gt 0 ] && echo " - Checkpoints ($CHECKPOINT_COUNT operations)" - [ "$SNAPSHOT_COUNT" -gt 0 ] && echo " - Snapshots ($SNAPSHOT_COUNT created)" - echo " - Database restoration" - echo "" - echo "The configuration is ready for production use." 
-else - echo "⚠ TEST COMPLETED WITH ISSUES:" - echo -e "$ISSUES" - echo "" - echo "Review the logs for details:" - echo " $LOG_DIR/litestream.log" -fi - -echo "" -echo "Full test results available in: $TEST_DIR" -echo "================================================" diff --git a/scripts/test-minio-s3.sh b/scripts/test-minio-s3.sh deleted file mode 100755 index 2a4a56967..000000000 --- a/scripts/test-minio-s3.sh +++ /dev/null @@ -1,464 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# MinIO S3-compatible test with Docker -# This test runs Litestream against a local MinIO instance to simulate S3 behavior - -TEST_DURATION="${TEST_DURATION:-2h}" -TEST_DIR="/tmp/litestream-minio-$(date +%Y%m%d-%H%M%S)" -DB_PATH="$TEST_DIR/test.db" -CONFIG_FILE="$TEST_DIR/litestream.yml" -LOG_DIR="$TEST_DIR/logs" - -# MinIO settings - use alternative ports to avoid conflicts -MINIO_CONTAINER_NAME="litestream-minio-test" -MINIO_PORT=9100 -MINIO_CONSOLE_PORT=9101 -MINIO_ROOT_USER="minioadmin" -MINIO_ROOT_PASSWORD="minioadmin" -MINIO_BUCKET="litestream-test" -MINIO_ENDPOINT="http://localhost:${MINIO_PORT}" -S3_PATH="s3://${MINIO_BUCKET}/litestream-test-$(date +%Y%m%d-%H%M%S)" - -echo "================================================" -echo "Litestream MinIO S3 Test" -echo "================================================" -echo "Duration: $TEST_DURATION" -echo "Test directory: $TEST_DIR" -echo "MinIO endpoint: $MINIO_ENDPOINT" -echo "MinIO bucket: $MINIO_BUCKET" -echo "Start time: $(date)" -echo "" - -# Check for Docker -if ! command -v docker &> /dev/null; then - echo "Error: Docker is not installed or not in PATH" - echo "Please install Docker to run this test" - exit 1 -fi - -cleanup() { - echo "" - echo "================================================" - echo "Cleaning up..." - echo "================================================" - - # Kill all spawned processes - jobs -p | xargs -r kill 2>/dev/null || true - wait 2>/dev/null || true - - # Stop and remove MinIO container - if [ -n "${MINIO_CONTAINER_NAME:-}" ]; then - echo "Stopping MinIO container..." - docker stop "$MINIO_CONTAINER_NAME" 2>/dev/null || true - docker rm "$MINIO_CONTAINER_NAME" 2>/dev/null || true - fi - - echo "" - echo "Test completed at: $(date)" - echo "Results saved in: $TEST_DIR" -} - -trap cleanup EXIT INT TERM - -# Create directories -mkdir -p "$TEST_DIR" "$LOG_DIR" - -# Clean up any existing container -if docker ps -a | grep -q "$MINIO_CONTAINER_NAME"; then - echo "Removing existing MinIO container..." - docker stop "$MINIO_CONTAINER_NAME" 2>/dev/null || true - docker rm "$MINIO_CONTAINER_NAME" 2>/dev/null || true -fi - -# Start MinIO container -echo "Starting MinIO container..." -docker run -d \ - --name "$MINIO_CONTAINER_NAME" \ - -p "${MINIO_PORT}:9000" \ - -p "${MINIO_CONSOLE_PORT}:9001" \ - -e "MINIO_ROOT_USER=${MINIO_ROOT_USER}" \ - -e "MINIO_ROOT_PASSWORD=${MINIO_ROOT_PASSWORD}" \ - minio/minio server /data --console-address ":9001" - -echo "Waiting for MinIO to start..." -sleep 5 - -# Check if MinIO is running -if ! docker ps | grep -q "$MINIO_CONTAINER_NAME"; then - echo "Error: MinIO container failed to start" - docker logs "$MINIO_CONTAINER_NAME" 2>&1 - exit 1 -fi - -echo "MinIO is running!" -echo " API: http://localhost:${MINIO_PORT} (mapped from container port 9000)" -echo " Console: http://localhost:${MINIO_CONSOLE_PORT} (mapped from container port 9001)" -echo " Credentials: ${MINIO_ROOT_USER}/${MINIO_ROOT_PASSWORD}" -echo "" - -# Create MinIO bucket using mc (MinIO Client) in Docker -echo "Creating MinIO bucket..." 
-docker run --rm --link "${MINIO_CONTAINER_NAME}:minio" \ - -e "MC_HOST_minio=http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@minio:9000" \ - minio/mc mb "minio/${MINIO_BUCKET}" 2>/dev/null || true - -echo "Bucket '${MINIO_BUCKET}' ready" -echo "" - -# Build binaries if needed -echo "Building binaries..." -if [ ! -f bin/litestream ]; then - go build -o bin/litestream ./cmd/litestream -fi -if [ ! -f bin/litestream-test ]; then - go build -o bin/litestream-test ./cmd/litestream-test -fi - -# Create and populate test database -echo "Creating and populating test database..." -sqlite3 "$DB_PATH" < "$LOG_DIR/populate.log" 2>&1 -if [ $? -ne 0 ]; then - echo "Warning: Population failed, but continuing..." - cat "$LOG_DIR/populate.log" -fi - -# Create Litestream configuration for MinIO -echo "Creating Litestream configuration for MinIO S3..." -cat > "$CONFIG_FILE" < "$LOG_DIR/litestream.log" 2>&1 & -LITESTREAM_PID=$! - -sleep 3 - -if ! kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo "ERROR: Litestream failed to start!" - echo "Last 50 lines of log:" - tail -50 "$LOG_DIR/litestream.log" - exit 1 -fi - -echo "Litestream running (PID: $LITESTREAM_PID)" -echo "" - -# Start load generator -echo "Starting load generator (heavy sustained load)..." -bin/litestream-test load \ - -db "$DB_PATH" \ - -write-rate 500 \ - -duration "$TEST_DURATION" \ - -pattern wave \ - -payload-size 4096 \ - -read-ratio 0.3 \ - -workers 8 \ - > "$LOG_DIR/load.log" 2>&1 & -LOAD_PID=$! - -echo "Load generator running (PID: $LOAD_PID)" -echo "" - -# Monitor function for MinIO -monitor_minio() { - local last_checkpoint_count=0 - local last_compaction_count=0 - local last_sync_count=0 - - while true; do - sleep 60 - - echo "[$(date +%H:%M:%S)] Status Report" - echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" - - # Database metrics - if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null) - echo " Database size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" - - if [ -f "$DB_PATH-wal" ]; then - WAL_SIZE=$(stat -f%z "$DB_PATH-wal" 2>/dev/null || stat -c%s "$DB_PATH-wal" 2>/dev/null) - echo " WAL size: $(numfmt --to=iec-i --suffix=B $WAL_SIZE 2>/dev/null || echo "$WAL_SIZE bytes")" - fi - - # Row count - TABLES=$(sqlite3 "$DB_PATH" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - echo " Rows in database: $ROW_COUNT" - elif echo "$TABLES" | grep -q "test_table_0"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - echo " Rows in database: $ROW_COUNT" - fi - fi - - # MinIO/S3 metrics using docker exec - echo "" - echo " MinIO S3 Statistics:" - - # Count objects in MinIO - OBJECT_COUNT=$(docker run --rm --link "${MINIO_CONTAINER_NAME}:minio" \ - -e "MC_HOST_minio=http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@minio:9000" \ - minio/mc ls "minio/${MINIO_BUCKET}/" --recursive 2>/dev/null | wc -l | tr -d ' ' || echo "0") - - # Count LTX files (modern format) and snapshots - LTX_COUNT=$(docker run --rm --link "${MINIO_CONTAINER_NAME}:minio" \ - -e "MC_HOST_minio=http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@minio:9000" \ - minio/mc ls "minio/${MINIO_BUCKET}/" --recursive 2>/dev/null | grep -c "\.ltx" || echo "0") - - SNAPSHOT_COUNT=$(docker run --rm --link "${MINIO_CONTAINER_NAME}:minio" \ - -e "MC_HOST_minio=http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@minio:9000" \ - minio/mc ls 
"minio/${MINIO_BUCKET}/" --recursive 2>/dev/null | grep -c "snapshot" || echo "0") - - echo " Total objects: $OBJECT_COUNT" - echo " LTX segments: $LTX_COUNT" - echo " Snapshots: $SNAPSHOT_COUNT" - - # Operation metrics - if [ -f "$LOG_DIR/litestream.log" ]; then - CHECKPOINT_COUNT=$(grep -c "checkpoint" "$LOG_DIR/litestream.log" 2>/dev/null) - CHECKPOINT_COUNT=${CHECKPOINT_COUNT:-0} - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" 2>/dev/null) - COMPACTION_COUNT=${COMPACTION_COUNT:-0} - SYNC_COUNT=$(grep -c "replica sync" "$LOG_DIR/litestream.log" 2>/dev/null) - SYNC_COUNT=${SYNC_COUNT:-0} - - CHECKPOINT_DELTA=$((CHECKPOINT_COUNT - last_checkpoint_count)) - COMPACTION_DELTA=$((COMPACTION_COUNT - last_compaction_count)) - SYNC_DELTA=$((SYNC_COUNT - last_sync_count)) - - echo "" - echo " Operations: $CHECKPOINT_COUNT checkpoints (+$CHECKPOINT_DELTA), $COMPACTION_COUNT compactions (+$COMPACTION_DELTA)" - echo " Syncs: $SYNC_COUNT total (+$SYNC_DELTA in last minute)" - - last_checkpoint_count=$CHECKPOINT_COUNT - last_compaction_count=$COMPACTION_COUNT - last_sync_count=$SYNC_COUNT - fi - - # Check for errors - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" 2>/dev/null | grep -v "page size not initialized" | wc -l | tr -d ' ') - if [ "$ERROR_COUNT" -gt 0 ]; then - echo " ⚠ Critical errors: $ERROR_COUNT" - grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | tail -2 - fi - - # Check processes - if ! kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo " ✗ Litestream stopped unexpectedly!" - break - fi - - if ! kill -0 "$LOAD_PID" 2>/dev/null; then - echo " ✓ Load test completed" - break - fi - - echo "" - done -} - -echo "Running MinIO S3 test for $TEST_DURATION..." -echo "Monitor will report every 60 seconds" -echo "================================================" -echo "" - -# Start monitoring in background -monitor_minio & -MONITOR_PID=$! 
- -# Wait for load test to complete -wait "$LOAD_PID" 2>/dev/null || true - -# Stop the monitor -kill $MONITOR_PID 2>/dev/null || true -wait $MONITOR_PID 2>/dev/null || true - -echo "" -echo "================================================" -echo "Final Test Results" -echo "================================================" - -# Final statistics -echo "Database Statistics:" -if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null) - TABLES=$(sqlite3 "$DB_PATH" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_table_0"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_data"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - else - ROW_COUNT="0" - fi - echo " Final size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" - echo " Total rows: $ROW_COUNT" -fi - -echo "" -echo "MinIO S3 Statistics:" -FINAL_OBJECTS=$(docker run --rm --link "${MINIO_CONTAINER_NAME}:minio" \ - -e "MC_HOST_minio=http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@minio:9000" \ - minio/mc ls "minio/${MINIO_BUCKET}/" --recursive 2>/dev/null | wc -l | tr -d ' ' || echo "0") - -FINAL_LTX=$(docker run --rm --link "${MINIO_CONTAINER_NAME}:minio" \ - -e "MC_HOST_minio=http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@minio:9000" \ - minio/mc ls "minio/${MINIO_BUCKET}/" --recursive 2>/dev/null | grep -c "\.ltx" || echo "0") - -FINAL_SNAPSHOTS=$(docker run --rm --link "${MINIO_CONTAINER_NAME}:minio" \ - -e "MC_HOST_minio=http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@minio:9000" \ - minio/mc ls "minio/${MINIO_BUCKET}/" --recursive 2>/dev/null | grep -c "snapshot" || echo "0") - -echo " Total objects in MinIO: $FINAL_OBJECTS" -echo " LTX segments: $FINAL_LTX" -echo " Snapshots: $FINAL_SNAPSHOTS" - -# Get storage size -STORAGE_INFO=$(docker run --rm --link "${MINIO_CONTAINER_NAME}:minio" \ - -e "MC_HOST_minio=http://${MINIO_ROOT_USER}:${MINIO_ROOT_PASSWORD}@minio:9000" \ - minio/mc du "minio/${MINIO_BUCKET}/" --recursive 2>/dev/null | tail -1 || echo "0") -echo " Total storage used: $STORAGE_INFO" - -echo "" -echo "Operation Counts:" -if [ -f "$LOG_DIR/litestream.log" ]; then - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" || echo "0") - CHECKPOINT_COUNT=$(grep -c "checkpoint" "$LOG_DIR/litestream.log" || echo "0") - SYNC_COUNT=$(grep -c "replica sync" "$LOG_DIR/litestream.log" || echo "0") - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | wc -l | tr -d ' ' || echo "0") -else - COMPACTION_COUNT="0" - CHECKPOINT_COUNT="0" - SYNC_COUNT="0" - ERROR_COUNT="0" -fi -echo " Compactions: $COMPACTION_COUNT" -echo " Checkpoints: $CHECKPOINT_COUNT" -echo " Syncs: $SYNC_COUNT" -echo " Errors: $ERROR_COUNT" - -# Test restoration from MinIO -echo "" -echo "Testing restoration from MinIO S3..." -RESTORE_DB="$TEST_DIR/restored.db" - -# Export credentials for litestream restore -export AWS_ACCESS_KEY_ID="${MINIO_ROOT_USER}" -export AWS_SECRET_ACCESS_KEY="${MINIO_ROOT_PASSWORD}" - -# Create a config file for restoration -cat > "$TEST_DIR/restore.yml" < "$LOG_DIR/restore.log" 2>&1 - -if [ $? -eq 0 ]; then - echo "✓ Restoration successful!" 
- - # Compare row counts - TABLES=$(sqlite3 "$RESTORE_DB" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_table_0"; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_data"; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - else - RESTORED_COUNT="0" - fi - - if [ "$ROW_COUNT" = "$RESTORED_COUNT" ]; then - echo "✓ Row counts match! ($RESTORED_COUNT rows)" - else - echo "⚠ Row count mismatch! Original: $ROW_COUNT, Restored: $RESTORED_COUNT" - fi -else - echo "✗ Restoration failed!" - tail -20 "$LOG_DIR/restore.log" -fi - -# Summary -echo "" -echo "================================================" -echo "Test Summary" -echo "================================================" - -CRITICAL_ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" 2>/dev/null | grep -v "page size not initialized" | wc -l | tr -d ' ') - -if [ "$CRITICAL_ERROR_COUNT" -eq 0 ] && [ "$FINAL_OBJECTS" -gt 0 ]; then - echo "✓ MINIO S3 TEST PASSED!" - echo "" - echo "Successfully validated:" - echo " - S3-compatible replication to MinIO" - echo " - Stored $FINAL_OBJECTS objects" - echo " - Compactions: $COMPACTION_COUNT" - echo " - Syncs: $SYNC_COUNT" - [ "$CHECKPOINT_COUNT" -gt 0 ] && echo " - Checkpoints: $CHECKPOINT_COUNT" - [ "$FINAL_SNAPSHOTS" -gt 0 ] && echo " - Snapshots: $FINAL_SNAPSHOTS" - echo " - Database restoration from S3" -else - echo "⚠ TEST COMPLETED WITH ISSUES:" - [ "$CRITICAL_ERROR_COUNT" -gt 0 ] && echo " - Critical errors detected: $CRITICAL_ERROR_COUNT" - [ "$FINAL_OBJECTS" -eq 0 ] && echo " - No objects stored in MinIO" - echo "" - echo "Review the logs for details:" - echo " $LOG_DIR/litestream.log" -fi - -echo "" -echo "MinIO Console: http://localhost:${MINIO_CONSOLE_PORT}" -echo "Credentials: ${MINIO_ROOT_USER}/${MINIO_ROOT_PASSWORD}" -echo "" -echo "Full test results available in: $TEST_DIR" -echo "================================================" diff --git a/scripts/test-overnight-s3.sh b/scripts/test-overnight-s3.sh deleted file mode 100755 index 277e95264..000000000 --- a/scripts/test-overnight-s3.sh +++ /dev/null @@ -1,409 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Check for required environment variables -if [ -z "${AWS_ACCESS_KEY_ID:-}" ] || [ -z "${AWS_SECRET_ACCESS_KEY:-}" ] || [ -z "${S3_BUCKET:-}" ]; then - echo "Error: Required environment variables not set" - echo "Please set: AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, S3_BUCKET" - echo "" - echo "Example:" - echo " export AWS_ACCESS_KEY_ID=your_key" - echo " export AWS_SECRET_ACCESS_KEY=your_secret" - echo " export S3_BUCKET=your-test-bucket" - echo " export AWS_REGION=us-east-1 # optional, defaults to us-east-1" - exit 1 -fi - -AWS_REGION="${AWS_REGION:-us-east-1}" -S3_PATH="s3://${S3_BUCKET}/litestream-overnight-$(date +%Y%m%d-%H%M%S)" - -TEST_DIR="/tmp/litestream-overnight-s3-$(date +%Y%m%d-%H%M%S)" -DB_PATH="$TEST_DIR/test.db" -LOG_DIR="$TEST_DIR/logs" -CONFIG_FILE="$TEST_DIR/litestream.yml" -MONITOR_PID="" -LITESTREAM_PID="" -LOAD_PID="" - -echo "================================================" -echo "Litestream Overnight S3 Test Suite" -echo "================================================" -echo "Test directory: $TEST_DIR" -echo "S3 destination: $S3_PATH" -echo "AWS Region: $AWS_REGION" -echo "Start time: $(date)" -echo 
"" - -cleanup() { - echo "" - echo "================================================" - echo "Cleaning up..." - echo "================================================" - - if [ -n "$LOAD_PID" ] && kill -0 "$LOAD_PID" 2>/dev/null; then - echo "Stopping load generator..." - kill "$LOAD_PID" 2>/dev/null || true - wait "$LOAD_PID" 2>/dev/null || true - fi - - if [ -n "$LITESTREAM_PID" ] && kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo "Stopping litestream..." - kill "$LITESTREAM_PID" 2>/dev/null || true - wait "$LITESTREAM_PID" 2>/dev/null || true - fi - - if [ -n "$MONITOR_PID" ] && kill -0 "$MONITOR_PID" 2>/dev/null; then - echo "Stopping monitor..." - kill "$MONITOR_PID" 2>/dev/null || true - fi - - echo "" - echo "Test Summary:" - echo "=============" - if [ -f "$LOG_DIR/monitor.log" ]; then - echo "Final statistics from monitor log:" - tail -20 "$LOG_DIR/monitor.log" - fi - - echo "" - echo "S3 Final Statistics:" - aws s3 ls "${S3_PATH}/" --recursive --summarize 2>/dev/null | tail -5 || true - - echo "" - echo "Test artifacts saved locally in: $TEST_DIR" - echo "S3 replica data in: $S3_PATH" - echo "End time: $(date)" -} - -trap cleanup EXIT INT TERM - -mkdir -p "$TEST_DIR" "$LOG_DIR" - -echo "Creating initial database..." -sqlite3 "$DB_PATH" < "$CONFIG_FILE" < /dev/null 2>&1; then - echo "✓ S3 bucket accessible" -else - echo "✗ Failed to access S3 bucket: ${S3_BUCKET}" - exit 1 -fi - -echo "Building litestream if needed..." -if [ ! -f bin/litestream ]; then - go build -o bin/litestream ./cmd/litestream -fi - -echo "Starting litestream replication to S3..." -LOG_LEVEL=debug bin/litestream replicate -config "$CONFIG_FILE" > "$LOG_DIR/litestream.log" 2>&1 & -LITESTREAM_PID=$! -echo "Litestream started with PID: $LITESTREAM_PID" - -sleep 5 - -if ! kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo "ERROR: Litestream failed to start. 
Check logs:" - tail -50 "$LOG_DIR/litestream.log" - exit 1 -fi - -monitor_s3_test() { - while true; do - echo "================================================" | tee -a "$LOG_DIR/monitor.log" - echo "Monitor Update: $(date)" | tee -a "$LOG_DIR/monitor.log" - echo "================================================" | tee -a "$LOG_DIR/monitor.log" - - # Database size - if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null || echo "0") - echo "Database size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" | tee -a "$LOG_DIR/monitor.log" - fi - - # WAL size - if [ -f "$DB_PATH-wal" ]; then - WAL_SIZE=$(stat -f%z "$DB_PATH-wal" 2>/dev/null || stat -c%s "$DB_PATH-wal" 2>/dev/null || echo "0") - echo "WAL size: $(numfmt --to=iec-i --suffix=B $WAL_SIZE 2>/dev/null || echo "$WAL_SIZE bytes")" | tee -a "$LOG_DIR/monitor.log" - fi - - # S3 statistics - echo "" | tee -a "$LOG_DIR/monitor.log" - echo "S3 Replica Statistics:" | tee -a "$LOG_DIR/monitor.log" - - # Count objects in S3 - SNAPSHOT_COUNT=$(aws s3 ls "${S3_PATH}/" --recursive 2>/dev/null | grep -c "\.snapshot\.lz4" || echo "0") - WAL_COUNT=$(aws s3 ls "${S3_PATH}/" --recursive 2>/dev/null | grep -c "\.wal\.lz4" || echo "0") - TOTAL_OBJECTS=$(aws s3 ls "${S3_PATH}/" --recursive 2>/dev/null | wc -l | tr -d ' ' || echo "0") - - echo " Snapshots in S3: $SNAPSHOT_COUNT" | tee -a "$LOG_DIR/monitor.log" - echo " WAL segments in S3: $WAL_COUNT" | tee -a "$LOG_DIR/monitor.log" - echo " Total objects in S3: $TOTAL_OBJECTS" | tee -a "$LOG_DIR/monitor.log" - - # Get S3 storage size (if possible) - S3_SIZE=$(aws s3 ls "${S3_PATH}/" --recursive --summarize 2>/dev/null | grep "Total Size" | awk '{print $3}' || echo "0") - if [ "$S3_SIZE" != "0" ]; then - echo " Total S3 storage: $(numfmt --to=iec-i --suffix=B $S3_SIZE 2>/dev/null || echo "$S3_SIZE bytes")" | tee -a "$LOG_DIR/monitor.log" - fi - - # Count operations - echo "" | tee -a "$LOG_DIR/monitor.log" - echo "Operations:" | tee -a "$LOG_DIR/monitor.log" - if [ -f "$LOG_DIR/litestream.log" ]; then - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" 2>/dev/null || echo "0") - CHECKPOINT_COUNT=$(grep -iE "checkpoint|checkpointed" "$LOG_DIR/litestream.log" 2>/dev/null | wc -l | tr -d ' ' || echo "0") - SYNC_COUNT=$(grep -c "replica sync" "$LOG_DIR/litestream.log" 2>/dev/null || echo "0") - echo " Compactions: $COMPACTION_COUNT" | tee -a "$LOG_DIR/monitor.log" - echo " Checkpoints: $CHECKPOINT_COUNT" | tee -a "$LOG_DIR/monitor.log" - echo " Syncs: $SYNC_COUNT" | tee -a "$LOG_DIR/monitor.log" - fi - - # Check for errors (exclude known non-critical) - echo "" | tee -a "$LOG_DIR/monitor.log" - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" 2>/dev/null | grep -v "page size not initialized" | wc -l | tr -d ' ' || echo "0") - echo "Critical errors in litestream log: $ERROR_COUNT" | tee -a "$LOG_DIR/monitor.log" - - if [ "$ERROR_COUNT" -gt 0 ]; then - echo "Recent errors:" | tee -a "$LOG_DIR/monitor.log" - grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | tail -5 | tee -a "$LOG_DIR/monitor.log" - fi - - # Check for S3-specific errors - S3_ERROR_COUNT=$(grep -c "S3\|AWS\|403\|404\|500\|503" "$LOG_DIR/litestream.log" 2>/dev/null || echo "0") - if [ "$S3_ERROR_COUNT" -gt 0 ]; then - echo "S3-specific errors: $S3_ERROR_COUNT" | tee -a "$LOG_DIR/monitor.log" - grep "S3\|AWS\|403\|404\|500\|503" "$LOG_DIR/litestream.log" | tail -3 | tee -a "$LOG_DIR/monitor.log" - fi - - 
# Process status - echo "" | tee -a "$LOG_DIR/monitor.log" - echo "Process Status:" | tee -a "$LOG_DIR/monitor.log" - - if kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo " Litestream: Running (PID: $LITESTREAM_PID)" | tee -a "$LOG_DIR/monitor.log" - else - echo " Litestream: STOPPED" | tee -a "$LOG_DIR/monitor.log" - fi - - if [ -n "$LOAD_PID" ] && kill -0 "$LOAD_PID" 2>/dev/null; then - echo " Load generator: Running (PID: $LOAD_PID)" | tee -a "$LOG_DIR/monitor.log" - else - echo " Load generator: STOPPED" | tee -a "$LOG_DIR/monitor.log" - fi - - # Network/API statistics from log - UPLOAD_COUNT=$(grep -c "uploading\|uploaded" "$LOG_DIR/litestream.log" 2>/dev/null || echo "0") - echo " Total upload operations: $UPLOAD_COUNT" | tee -a "$LOG_DIR/monitor.log" - - echo "" | tee -a "$LOG_DIR/monitor.log" - sleep 60 - done -} - -echo "Starting monitor process..." -monitor_s3_test & -MONITOR_PID=$! -echo "Monitor started with PID: $MONITOR_PID" - -echo "" -echo "Initial database population (before starting litestream)..." -# Kill litestream temporarily to populate database -kill "$LITESTREAM_PID" 2>/dev/null || true -wait "$LITESTREAM_PID" 2>/dev/null || true - -bin/litestream-test populate -db "$DB_PATH" -target-size 100MB -batch-size 10000 > "$LOG_DIR/populate.log" 2>&1 -if [ $? -ne 0 ]; then - echo "Warning: Population failed, but continuing..." - cat "$LOG_DIR/populate.log" -fi - -# Restart litestream -echo "Restarting litestream after population..." -LOG_LEVEL=debug bin/litestream replicate -config "$CONFIG_FILE" > "$LOG_DIR/litestream.log" 2>&1 & -LITESTREAM_PID=$! -sleep 3 - -echo "" -echo "Starting load generator for overnight S3 test..." -echo "Configuration:" -echo " - Duration: 8 hours" -echo " - Write rate: 100 writes/second (higher for S3 testing)" -echo " - Pattern: wave (simulates varying load)" -echo " - Workers: 8" -echo "" - -# Run load test for 8 hours with higher load for S3 -bin/litestream-test load \ - -db "$DB_PATH" \ - -write-rate 100 \ - -duration 8h \ - -pattern wave \ - -payload-size 4096 \ - -read-ratio 0.3 \ - -workers 8 \ - > "$LOG_DIR/load.log" 2>&1 & - -LOAD_PID=$! -echo "Load generator started with PID: $LOAD_PID" - -echo "" -echo "================================================" -echo "Overnight S3 test is running!" -echo "================================================" -echo "" -echo "Monitor the test with:" -echo " tail -f $LOG_DIR/monitor.log" -echo "" -echo "View litestream logs:" -echo " tail -f $LOG_DIR/litestream.log" -echo "" -echo "View load generator logs:" -echo " tail -f $LOG_DIR/load.log" -echo "" -echo "Check S3 contents:" -echo " aws s3 ls ${S3_PATH}/ --recursive" -echo "" -echo "The test will run for 8 hours. Press Ctrl+C to stop early." -echo "" - -wait "$LOAD_PID" - -echo "" -echo "Load generation completed." 
- -# Final statistics -echo "" -echo "================================================" -echo "Final Statistics" -echo "================================================" - -if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null) - # Find actual table name - TABLES=$(sqlite3 "$DB_PATH" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_table_0"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_data"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - else - ROW_COUNT="0" - fi - echo "Database size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" - echo "Total rows: $ROW_COUNT" -fi - -echo "" -echo "S3 Statistics:" -# Count objects in S3 -SNAPSHOT_COUNT=$(aws s3 ls "${S3_PATH}/" --recursive 2>/dev/null | grep -c "\.snapshot\.lz4" || echo "0") -WAL_COUNT=$(aws s3 ls "${S3_PATH}/" --recursive 2>/dev/null | grep -c "\.wal\.lz4" || echo "0") -TOTAL_OBJECTS=$(aws s3 ls "${S3_PATH}/" --recursive 2>/dev/null | wc -l | tr -d ' ' || echo "0") -S3_SIZE=$(aws s3 ls "${S3_PATH}/" --recursive --summarize 2>/dev/null | grep "Total Size" | awk '{print $3}' || echo "0") - -echo " Snapshots in S3: $SNAPSHOT_COUNT" -echo " WAL segments in S3: $WAL_COUNT" -echo " Total objects: $TOTAL_OBJECTS" -if [ "$S3_SIZE" != "0" ]; then - echo " Total S3 storage: $(numfmt --to=iec-i --suffix=B $S3_SIZE 2>/dev/null || echo "$S3_SIZE bytes")" -fi - -echo "" -echo "Operation Counts:" -if [ -f "$LOG_DIR/litestream.log" ]; then - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" || echo "0") - CHECKPOINT_COUNT=$(grep -iE "checkpoint|checkpointed" "$LOG_DIR/litestream.log" | wc -l | tr -d ' ' || echo "0") - SYNC_COUNT=$(grep -c "replica sync" "$LOG_DIR/litestream.log" || echo "0") - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | wc -l | tr -d ' ' || echo "0") - echo " Compactions: $COMPACTION_COUNT" - echo " Checkpoints: $CHECKPOINT_COUNT" - echo " Syncs: $SYNC_COUNT" - echo " Critical errors: $ERROR_COUNT" -fi - -echo "" -echo "Testing restoration from S3..." - -# Test restoration -RESTORE_DB="$TEST_DIR/restored.db" -echo "Restoring database from S3 to: $RESTORE_DB" -bin/litestream restore -o "$RESTORE_DB" "$S3_PATH" > "$LOG_DIR/restore.log" 2>&1 - -if [ $? -eq 0 ]; then - echo "✓ Restoration successful!" - - # Compare row counts - use same table detection logic - TABLES=$(sqlite3 "$RESTORE_DB" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_table_0"; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_data"; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - else - RESTORED_COUNT="0" - fi - - if [ "$ROW_COUNT" = "$RESTORED_COUNT" ]; then - echo "✓ Row counts match! ($RESTORED_COUNT rows)" - else - echo "⚠ Row count mismatch! Original: $ROW_COUNT, Restored: $RESTORED_COUNT" - fi -else - echo "✗ Restoration failed! 
Check $LOG_DIR/restore.log" -fi diff --git a/scripts/test-overnight.sh b/scripts/test-overnight.sh deleted file mode 100755 index be8f17a30..000000000 --- a/scripts/test-overnight.sh +++ /dev/null @@ -1,331 +0,0 @@ -#!/bin/bash -set -euo pipefail - -TEST_DIR="/tmp/litestream-overnight-$(date +%Y%m%d-%H%M%S)" -DB_PATH="$TEST_DIR/test.db" -REPLICA_PATH="$TEST_DIR/replica" -LOG_DIR="$TEST_DIR/logs" -CONFIG_FILE="$TEST_DIR/litestream.yml" -MONITOR_PID="" -LITESTREAM_PID="" -LOAD_PID="" - -echo "================================================" -echo "Litestream Overnight Test Suite" -echo "================================================" -echo "Test directory: $TEST_DIR" -echo "Start time: $(date)" -echo "" - -cleanup() { - echo "" - echo "================================================" - echo "Cleaning up..." - echo "================================================" - - if [ -n "$LOAD_PID" ] && kill -0 "$LOAD_PID" 2>/dev/null; then - echo "Stopping load generator..." - kill "$LOAD_PID" 2>/dev/null || true - wait "$LOAD_PID" 2>/dev/null || true - fi - - if [ -n "$LITESTREAM_PID" ] && kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo "Stopping litestream..." - kill "$LITESTREAM_PID" 2>/dev/null || true - wait "$LITESTREAM_PID" 2>/dev/null || true - fi - - if [ -n "$MONITOR_PID" ] && kill -0 "$MONITOR_PID" 2>/dev/null; then - echo "Stopping monitor..." - kill "$MONITOR_PID" 2>/dev/null || true - fi - - echo "" - echo "Test Summary:" - echo "=============" - if [ -f "$LOG_DIR/monitor.log" ]; then - echo "Final statistics from monitor log:" - tail -20 "$LOG_DIR/monitor.log" - fi - - echo "" - echo "Test artifacts saved in: $TEST_DIR" - echo "End time: $(date)" -} - -trap cleanup EXIT INT TERM - -mkdir -p "$TEST_DIR" "$LOG_DIR" "$REPLICA_PATH" - -echo "Creating initial database..." -sqlite3 "$DB_PATH" < "$CONFIG_FILE" < "$LOG_DIR/litestream.log" 2>&1 & -LITESTREAM_PID=$! -echo "Litestream started with PID: $LITESTREAM_PID" - -sleep 5 - -if ! kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo "ERROR: Litestream failed to start. 
Check logs:" - tail -50 "$LOG_DIR/litestream.log" - exit 1 -fi - -monitor_test() { - while true; do - echo "================================================" | tee -a "$LOG_DIR/monitor.log" - echo "Monitor Update: $(date)" | tee -a "$LOG_DIR/monitor.log" - echo "================================================" | tee -a "$LOG_DIR/monitor.log" - - # Database size - if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null || echo "0") - echo "Database size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" | tee -a "$LOG_DIR/monitor.log" - fi - - # WAL size - if [ -f "$DB_PATH-wal" ]; then - WAL_SIZE=$(stat -f%z "$DB_PATH-wal" 2>/dev/null || stat -c%s "$DB_PATH-wal" 2>/dev/null || echo "0") - echo "WAL size: $(numfmt --to=iec-i --suffix=B $WAL_SIZE 2>/dev/null || echo "$WAL_SIZE bytes")" | tee -a "$LOG_DIR/monitor.log" - fi - - # Replica statistics - echo "" | tee -a "$LOG_DIR/monitor.log" - echo "Replica Statistics:" | tee -a "$LOG_DIR/monitor.log" - - # Count snapshots (for file replica, look for snapshot.ltx files) - SNAPSHOT_COUNT=$(find "$REPLICA_PATH" -name "*snapshot*.ltx" 2>/dev/null | wc -l | tr -d ' ') - echo " Snapshots: $SNAPSHOT_COUNT" | tee -a "$LOG_DIR/monitor.log" - - # Count LTX segments by age (file replicas use .ltx not .wal.lz4) - if [ -d "$REPLICA_PATH" ]; then - LTX_30S=$(find "$REPLICA_PATH" -name "*.ltx" -mmin -0.5 2>/dev/null | wc -l | tr -d ' ') - LTX_1M=$(find "$REPLICA_PATH" -name "*.ltx" -mmin -1 2>/dev/null | wc -l | tr -d ' ') - LTX_5M=$(find "$REPLICA_PATH" -name "*.ltx" -mmin -5 2>/dev/null | wc -l | tr -d ' ') - LTX_TOTAL=$(find "$REPLICA_PATH" -name "*.ltx" 2>/dev/null | wc -l | tr -d ' ') - - echo " LTX segments (last 30s): $LTX_30S" | tee -a "$LOG_DIR/monitor.log" - echo " LTX segments (last 1m): $LTX_1M" | tee -a "$LOG_DIR/monitor.log" - echo " LTX segments (last 5m): $LTX_5M" | tee -a "$LOG_DIR/monitor.log" - echo " LTX segments (total): $LTX_TOTAL" | tee -a "$LOG_DIR/monitor.log" - - # Replica size - REPLICA_SIZE=$(du -sh "$REPLICA_PATH" 2>/dev/null | cut -f1) - echo " Total replica size: $REPLICA_SIZE" | tee -a "$LOG_DIR/monitor.log" - fi - - # Count operations - echo "" | tee -a "$LOG_DIR/monitor.log" - echo "Operations:" | tee -a "$LOG_DIR/monitor.log" - if [ -f "$LOG_DIR/litestream.log" ]; then - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" 2>/dev/null || echo "0") - CHECKPOINT_COUNT=$(grep -iE "checkpoint|checkpointed" "$LOG_DIR/litestream.log" 2>/dev/null | wc -l | tr -d ' ' || echo "0") - SYNC_COUNT=$(grep -c "replica sync" "$LOG_DIR/litestream.log" 2>/dev/null || echo "0") - echo " Compactions: $COMPACTION_COUNT" | tee -a "$LOG_DIR/monitor.log" - echo " Checkpoints: $CHECKPOINT_COUNT" | tee -a "$LOG_DIR/monitor.log" - echo " Syncs: $SYNC_COUNT" | tee -a "$LOG_DIR/monitor.log" - fi - - # Check for errors in litestream log (exclude known non-critical) - echo "" | tee -a "$LOG_DIR/monitor.log" - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" 2>/dev/null | grep -v "page size not initialized" | wc -l | tr -d ' ' || echo "0") - echo "Critical errors in litestream log: $ERROR_COUNT" | tee -a "$LOG_DIR/monitor.log" - - if [ "$ERROR_COUNT" -gt 0 ]; then - echo "Recent errors:" | tee -a "$LOG_DIR/monitor.log" - grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | tail -5 | tee -a "$LOG_DIR/monitor.log" - fi - - # Process status - echo "" | tee -a "$LOG_DIR/monitor.log" - echo "Process Status:" | tee -a 
"$LOG_DIR/monitor.log" - - if kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo " Litestream: Running (PID: $LITESTREAM_PID)" | tee -a "$LOG_DIR/monitor.log" - else - echo " Litestream: STOPPED" | tee -a "$LOG_DIR/monitor.log" - fi - - if [ -n "$LOAD_PID" ] && kill -0 "$LOAD_PID" 2>/dev/null; then - echo " Load generator: Running (PID: $LOAD_PID)" | tee -a "$LOG_DIR/monitor.log" - else - echo " Load generator: STOPPED" | tee -a "$LOG_DIR/monitor.log" - fi - - echo "" | tee -a "$LOG_DIR/monitor.log" - sleep 60 - done -} - -echo "Starting monitor process..." -monitor_test & -MONITOR_PID=$! -echo "Monitor started with PID: $MONITOR_PID" - -echo "" -echo "Initial database population (before starting litestream)..." -# Kill litestream temporarily to populate database -kill "$LITESTREAM_PID" 2>/dev/null || true -wait "$LITESTREAM_PID" 2>/dev/null || true - -bin/litestream-test populate -db "$DB_PATH" -target-size 100MB -batch-size 10000 > "$LOG_DIR/populate.log" 2>&1 -if [ $? -ne 0 ]; then - echo "Warning: Population failed, but continuing..." - cat "$LOG_DIR/populate.log" -fi - -# Restart litestream -echo "Restarting litestream after population..." -LOG_LEVEL=debug bin/litestream replicate -config "$CONFIG_FILE" > "$LOG_DIR/litestream.log" 2>&1 & -LITESTREAM_PID=$! -sleep 3 - -echo "" -echo "Starting load generator for overnight test..." -echo "Configuration:" -echo " - Duration: 8 hours" -echo " - Write rate: 50 writes/second" -echo " - Pattern: wave (simulates varying load)" -echo " - Workers: 4" -echo "" - -# Run load test for 8 hours with varying patterns -bin/litestream-test load \ - -db "$DB_PATH" \ - -write-rate 50 \ - -duration 8h \ - -pattern wave \ - -payload-size 2048 \ - -read-ratio 0.3 \ - -workers 4 \ - > "$LOG_DIR/load.log" 2>&1 & - -LOAD_PID=$! -echo "Load generator started with PID: $LOAD_PID" - -echo "" -echo "================================================" -echo "Overnight test is running!" -echo "================================================" -echo "" -echo "Monitor the test with:" -echo " tail -f $LOG_DIR/monitor.log" -echo "" -echo "View litestream logs:" -echo " tail -f $LOG_DIR/litestream.log" -echo "" -echo "View load generator logs:" -echo " tail -f $LOG_DIR/load.log" -echo "" -echo "The test will run for 8 hours. Press Ctrl+C to stop early." -echo "" - -wait "$LOAD_PID" - -echo "" -echo "Load generation completed." 
- -# Final statistics -echo "" -echo "================================================" -echo "Final Statistics" -echo "================================================" - -if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null) - # Find actual table name - TABLES=$(sqlite3 "$DB_PATH" ".tables" 2>/dev/null) - if echo "$TABLES" | grep -q "load_test"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_table_0"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_data"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - else - ROW_COUNT="0" - fi - echo "Database size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" - echo "Total rows: $ROW_COUNT" -fi - -if [ -d "$REPLICA_PATH" ]; then - SNAPSHOT_COUNT=$(find "$REPLICA_PATH" -name "*snapshot*.ltx" 2>/dev/null | wc -l | tr -d ' ') - LTX_COUNT=$(find "$REPLICA_PATH" -name "*.ltx" 2>/dev/null | wc -l | tr -d ' ') - REPLICA_SIZE=$(du -sh "$REPLICA_PATH" | cut -f1) - echo "Snapshots created: $SNAPSHOT_COUNT" - echo "LTX segments: $LTX_COUNT" - echo "Replica size: $REPLICA_SIZE" -fi - -if [ -f "$LOG_DIR/litestream.log" ]; then - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" || echo "0") - CHECKPOINT_COUNT=$(grep -iE "checkpoint|checkpointed" "$LOG_DIR/litestream.log" | wc -l | tr -d ' ' || echo "0") - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | wc -l | tr -d ' ' || echo "0") - echo "Compactions: $COMPACTION_COUNT" - echo "Checkpoints: $CHECKPOINT_COUNT" - echo "Critical errors: $ERROR_COUNT" -fi - -echo "" -echo "Running validation..." -bin/litestream-test validate \ - -source "$DB_PATH" \ - -replica "$REPLICA_PATH" \ - > "$LOG_DIR/validate.log" 2>&1 - -if [ $? -eq 0 ]; then - echo "✓ Validation passed!" -else - echo "✗ Validation failed! Check $LOG_DIR/validate.log" -fi diff --git a/scripts/test-quick-validation.sh b/scripts/test-quick-validation.sh deleted file mode 100755 index 54aed2bdb..000000000 --- a/scripts/test-quick-validation.sh +++ /dev/null @@ -1,324 +0,0 @@ -#!/bin/bash -set -euo pipefail - -# Quick validation test - runs for 30 minutes with aggressive settings -# Use this to validate configuration before overnight runs - -TEST_DURATION="${TEST_DURATION:-30m}" -TEST_DIR="/tmp/litestream-quick-$(date +%Y%m%d-%H%M%S)" -DB_PATH="$TEST_DIR/test.db" -REPLICA_PATH="$TEST_DIR/replica" -CONFIG_FILE="$TEST_DIR/litestream.yml" -LOG_DIR="$TEST_DIR/logs" - -echo "================================================" -echo "Litestream Quick Validation Test" -echo "================================================" -echo "Duration: $TEST_DURATION" -echo "Test directory: $TEST_DIR" -echo "Start time: $(date)" -echo "" - -cleanup() { - echo "" - echo "Cleaning up..." - - # Kill all spawned processes - jobs -p | xargs -r kill 2>/dev/null || true - wait - - echo "Test completed at: $(date)" - echo "Results saved in: $TEST_DIR" -} - -trap cleanup EXIT INT TERM - -# Create directories -mkdir -p "$TEST_DIR" "$LOG_DIR" "$REPLICA_PATH" - -# Build binaries if needed -echo "Building binaries..." -if [ ! -f bin/litestream ]; then - go build -o bin/litestream ./cmd/litestream -fi -if [ ! 
-f bin/litestream-test ]; then - go build -o bin/litestream-test ./cmd/litestream-test -fi - -# Create test database and populate BEFORE starting litestream -echo "Creating test database..." -sqlite3 "$DB_PATH" < "$LOG_DIR/populate.log" 2>&1 -if [ $? -ne 0 ]; then - echo "Warning: Population failed, but continuing..." - cat "$LOG_DIR/populate.log" -fi - -# Create aggressive test configuration -echo "Creating test configuration..." -cat > "$CONFIG_FILE" < "$LOG_DIR/litestream.log" 2>&1 & -LITESTREAM_PID=$! - -sleep 3 - -if ! kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo "ERROR: Litestream failed to start!" - tail -50 "$LOG_DIR/litestream.log" - exit 1 -fi - -echo "Litestream running (PID: $LITESTREAM_PID)" -echo "" - -# Start load generator with more aggressive settings -echo "Starting load generator..." -bin/litestream-test load \ - -db "$DB_PATH" \ - -write-rate 100 \ - -duration "$TEST_DURATION" \ - -pattern wave \ - -payload-size 4096 \ - -read-ratio 0.2 \ - -workers 4 \ - > "$LOG_DIR/load.log" 2>&1 & -LOAD_PID=$! - -echo "Load generator running (PID: $LOAD_PID)" -echo "" - -# Monitor function -monitor_quick() { - while true; do - sleep 30 - - echo "[$(date +%H:%M:%S)] Status check" - - # Check database size and WAL size - if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null) - echo " Database: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" - - # Check WAL file size - if [ -f "$DB_PATH-wal" ]; then - WAL_SIZE=$(stat -f%z "$DB_PATH-wal" 2>/dev/null || stat -c%s "$DB_PATH-wal" 2>/dev/null) - echo " WAL size: $(numfmt --to=iec-i --suffix=B $WAL_SIZE 2>/dev/null || echo "$WAL_SIZE bytes")" - fi - fi - - # Count replica files (for file replica type, count LTX files) - if [ -d "$REPLICA_PATH" ]; then - # Count snapshot files (snapshot.ltx files) - SNAPSHOTS=$(find "$REPLICA_PATH" -name "*snapshot*.ltx" 2>/dev/null | wc -l | tr -d ' ') - # Count LTX files (WAL segments) - LTX_FILES=$(find "$REPLICA_PATH" -name "*.ltx" 2>/dev/null | wc -l | tr -d ' ') - echo " Snapshots: $SNAPSHOTS, LTX segments: $LTX_FILES" - - # Show replica directory size - REPLICA_SIZE=$(du -sh "$REPLICA_PATH" 2>/dev/null | cut -f1) - echo " Replica size: $REPLICA_SIZE" - fi - - # Check for compaction (look for "compaction complete") - COMPACT_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" 2>/dev/null || echo "0") - echo " Compactions: $COMPACT_COUNT" - - # Check for checkpoints (look for various checkpoint patterns) - CHECKPOINT_COUNT=$(grep -iE "checkpoint|checkpointed" "$LOG_DIR/litestream.log" 2>/dev/null | wc -l | tr -d ' ') - echo " Checkpoints: $CHECKPOINT_COUNT" - - # Check sync activity - SYNC_COUNT=$(grep -c "replica sync" "$LOG_DIR/litestream.log" 2>/dev/null || echo "0") - echo " Syncs: $SYNC_COUNT" - - # Check for errors (exclude known non-critical errors) - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" 2>/dev/null | grep -v "page size not initialized" | wc -l | tr -d ' ') - if [ "$ERROR_COUNT" -gt 0 ]; then - echo " ⚠ Critical errors: $ERROR_COUNT" - grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | tail -2 - fi - - # Check processes - if ! kill -0 "$LITESTREAM_PID" 2>/dev/null; then - echo " ✗ Litestream stopped unexpectedly!" - break - fi - - if ! kill -0 "$LOAD_PID" 2>/dev/null; then - echo " ✓ Load test completed" - break - fi - - echo "" - done -} - -echo "Running test for $TEST_DURATION..." 
-echo "================================================" -echo "" - -# Start monitoring in background -monitor_quick & -MONITOR_PID=$! - -# Wait for load test to complete -wait "$LOAD_PID" 2>/dev/null || true - -# Stop the monitor -kill $MONITOR_PID 2>/dev/null || true -wait $MONITOR_PID 2>/dev/null || true - -echo "" -echo "================================================" -echo "Test Results" -echo "================================================" - -# Final statistics -echo "Database Statistics:" -if [ -f "$DB_PATH" ]; then - DB_SIZE=$(stat -f%z "$DB_PATH" 2>/dev/null || stat -c%s "$DB_PATH" 2>/dev/null) - # Find the actual table name - tables are space-separated on one line - TABLES=$(sqlite3 "$DB_PATH" ".tables" 2>/dev/null) - # Look for the main data table - if echo "$TABLES" | grep -q "load_test"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM load_test" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_table_0"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_table_0" 2>/dev/null || echo "0") - elif echo "$TABLES" | grep -q "test_data"; then - ROW_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - else - ROW_COUNT="0" - fi - echo " Final size: $(numfmt --to=iec-i --suffix=B $DB_SIZE 2>/dev/null || echo "$DB_SIZE bytes")" - echo " Total rows: $ROW_COUNT" -fi - -echo "" -echo "Replication Statistics:" -if [ -d "$REPLICA_PATH" ]; then - SNAPSHOT_COUNT=$(find "$REPLICA_PATH" -name "*snapshot*.ltx" 2>/dev/null | wc -l | tr -d ' ') - LTX_COUNT=$(find "$REPLICA_PATH" -name "*.ltx" 2>/dev/null | wc -l | tr -d ' ') - REPLICA_SIZE=$(du -sh "$REPLICA_PATH" | cut -f1) - echo " Snapshots created: $SNAPSHOT_COUNT" - echo " LTX segments: $LTX_COUNT" - echo " Replica size: $REPLICA_SIZE" -fi - -echo "" -echo "Operation Counts:" -# Count operations from log -if [ -f "$LOG_DIR/litestream.log" ]; then - COMPACTION_COUNT=$(grep -c "compaction complete" "$LOG_DIR/litestream.log" || echo "0") - CHECKPOINT_COUNT=$(grep -iE "checkpoint|checkpointed" "$LOG_DIR/litestream.log" | wc -l | tr -d ' ' || echo "0") - ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" | grep -v "page size not initialized" | wc -l | tr -d ' ' || echo "0") -else - COMPACTION_COUNT="0" - CHECKPOINT_COUNT="0" - ERROR_COUNT="0" -fi -echo " Compactions: $COMPACTION_COUNT" -echo " Checkpoints: $CHECKPOINT_COUNT" -echo " Errors: $ERROR_COUNT" - -# Quick validation -echo "" -echo "Validation:" -bin/litestream-test validate \ - -source "$DB_PATH" \ - -replica "$REPLICA_PATH" \ - > "$LOG_DIR/validate.log" 2>&1 - -if [ $? -eq 0 ]; then - echo " ✓ Validation passed!" -else - echo " ✗ Validation failed!" - tail -10 "$LOG_DIR/validate.log" -fi - -# Test restoration -echo "" -echo "Testing restoration..." -RESTORE_DB="$TEST_DIR/restored.db" -bin/litestream restore -o "$RESTORE_DB" "file://$REPLICA_PATH" > "$LOG_DIR/restore.log" 2>&1 - -if [ $? -eq 0 ]; then - RESTORED_COUNT=$(sqlite3 "$RESTORE_DB" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - ORIGINAL_COUNT=$(sqlite3 "$DB_PATH" "SELECT COUNT(*) FROM test_data" 2>/dev/null || echo "0") - - if [ "$RESTORED_COUNT" = "$ORIGINAL_COUNT" ]; then - echo " ✓ Restoration successful! ($RESTORED_COUNT rows)" - else - echo " ⚠ Row count mismatch! Original: $ORIGINAL_COUNT, Restored: $RESTORED_COUNT" - fi -else - echo " ✗ Restoration failed!" 
-fi - -# Summary -echo "" -echo "================================================" -# Count critical errors (exclude known non-critical ones) -CRITICAL_ERROR_COUNT=$(grep -i "ERROR" "$LOG_DIR/litestream.log" 2>/dev/null | grep -v "page size not initialized" | wc -l | tr -d ' ') - -if [ "$CRITICAL_ERROR_COUNT" -eq 0 ] && [ "$LTX_COUNT" -gt 0 ]; then - echo "✓ Quick validation PASSED!" - echo "" - echo "Summary:" - echo " - Litestream successfully replicated data" - echo " - Created $LTX_COUNT LTX segments" - [ "$SNAPSHOT_COUNT" -gt 0 ] && echo " - Created $SNAPSHOT_COUNT snapshots" - [ "$COMPACTION_COUNT" -gt 0 ] && echo " - Performed $COMPACTION_COUNT compactions" - echo "" - echo "The configuration appears ready for overnight testing." - echo "Run the overnight test with:" - echo " ./test-overnight.sh" -else - echo "⚠ Quick validation completed with issues:" - [ "$CRITICAL_ERROR_COUNT" -gt 0 ] && echo " - Critical errors detected: $CRITICAL_ERROR_COUNT" - [ "$LTX_COUNT" -eq 0 ] && echo " - No LTX segments created (replication not working)" - [ "$SNAPSHOT_COUNT" -eq 0 ] && echo " - No snapshots created (may be normal for short tests)" - [ "$COMPACTION_COUNT" -eq 0 ] && echo " - No compactions occurred (may be normal for short tests)" - echo "" - echo "Review the logs before running overnight tests:" - echo " $LOG_DIR/litestream.log" -fi - -echo "" -echo "Full results available in: $TEST_DIR" -echo "================================================" diff --git a/tests/integration/README.md b/tests/integration/README.md new file mode 100644 index 000000000..b349cc669 --- /dev/null +++ b/tests/integration/README.md @@ -0,0 +1,474 @@ +# Integration Tests + +Go-based integration tests for Litestream. These tests replace the previous bash-based test scripts with proper Go testing infrastructure. + +## Overview + +This package contains comprehensive integration tests organized by test type: + +- **scenario_test.go** - Core functionality scenarios (fresh start, integrity, deletion, failover) +- **concurrent_test.go** - Concurrency and stress tests (rapid checkpoints, WAL growth, concurrent ops, busy timeout) +- **quick_test.go** - Quick validation tests (30 minutes configurable) +- **overnight_test.go** - Long-running stability tests (8+ hours) +- **boundary_test.go** - Edge cases (1GB boundary, different page sizes) +- **helpers.go** - Shared test utilities and helpers +- **fixtures.go** - Test data generators and scenarios + +## Prerequisites + +Build the required binaries: + +```bash +go build -o bin/litestream ./cmd/litestream +go build -o bin/litestream-test ./cmd/litestream-test +``` + +## Running Tests + +### Quick Tests (Default) + +Run fast integration tests suitable for CI: + +```bash +go test -v -tags=integration -timeout=30m ./tests/integration/... \ + -run="TestFreshStart|TestDatabaseIntegrity|TestRapidCheckpoints" +``` + +### All Scenario Tests + +Run all scenario tests (excluding long-running): + +```bash +go test -v -tags=integration -timeout=1h ./tests/integration/... +``` + +### Long-Running Tests + +Run overnight and boundary tests: + +```bash +go test -v -tags="integration,long" -timeout=10h ./tests/integration/... \ + -run="TestOvernight|Test1GBBoundary" +``` + +## Soak Tests + +Long-running soak tests live alongside the other integration tests and share the same helpers. They are excluded from CI by default and are intended for release validation or targeted debugging. 
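+
+Soak test sources opt in through file-level build constraints, which is why the `-tags` arguments shown in the commands below are required. For reference, the header of a soak test file in this package (taken from `comprehensive_soak_test.go`) looks like this:
+
+```go
+//go:build integration && soak
+
+package integration
+```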
+ +### Overview + +| Test | Tags | Defaults | Purpose | Extra Requirements | +| --- | --- | --- | --- | --- | +| `TestComprehensiveSoak` | `integration,soak` | 2h duration, 50 MB DB, 500 writes/s | File-backed end-to-end stress | Litestream binaries in `./bin` | +| `TestMinIOSoak` | `integration,soak,docker` | 2h duration, 5 MB DB (short=2 m), 100 writes/s | S3-compatible replication via MinIO | Docker daemon, `docker` CLI | +| `TestOvernightS3Soak` | `integration,soak,aws` | 8h duration, 50 MB DB | Real S3 replication & restore | AWS credentials, `aws` CLI | + +All soak tests support `go test -test.short` to scale the default duration down to roughly two minutes for smoke verification. + +### Environment Variables + +| Variable | Default | Description | +| --- | --- | --- | +| `SOAK_AUTO_PURGE` | `yes` for non-interactive shells; prompts otherwise | Controls whether MinIO buckets are cleared before each run. Set to `no` to retain objects between runs. | +| `SOAK_KEEP_TEMP` | unset | When set (any value), preserves the temporary directory and artifacts (database, config, logs) instead of removing them after the test completes. | +| `SOAK_DEBUG` | `0` | Streams command stdout/stderr (database population, load generation, docker helpers) directly to the console. Without this the output is captured and only shown on failure. | +| `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `S3_BUCKET`, `AWS_REGION` | required for `aws` tag | Provide credentials and target bucket for the overnight S3 soak. Region defaults to `us-east-1` if unset. | + +### Example Commands + +File-based soak (full length): + +```bash +go test -v -tags="integration,soak" \ + -run=TestComprehensiveSoak -timeout=3h ./tests/integration +``` + +File-based soak (short mode with preserved artifacts and debug logging): + +```bash +SOAK_KEEP_TEMP=1 SOAK_DEBUG=1 go test -v -tags="integration,soak" \ + -run=TestComprehensiveSoak -test.short -timeout=1h ./tests/integration +``` + +MinIO soak (short mode, auto-purges bucket, preserves results): + +```bash +SOAK_AUTO_PURGE=yes SOAK_KEEP_TEMP=1 go test -v -tags="integration,soak,docker" \ + -run=TestMinIOSoak -test.short -timeout=20m ./tests/integration +``` + +Overnight S3 soak (full duration): + +```bash +export AWS_ACCESS_KEY_ID=... +export AWS_SECRET_ACCESS_KEY=... +export S3_BUCKET=your-bucket +export AWS_REGION=us-east-1 + +go test -v -tags="integration,soak,aws" \ + -run=TestOvernightS3Soak -timeout=10h ./tests/integration +``` + +### Tips + +- Run with `-v` to view the 60-second progress updates and final status summary. Without `-v`, progress output is suppressed by Go’s test runner. +- When prompted about purging a MinIO bucket, answering “yes” clears the bucket via `minio/mc` before the run; “no” allows you to inspect lingering objects from previous executions. +- `SOAK_KEEP_TEMP=1` is especially useful when investigating failures—the helper prints the preserved path so you can inspect databases, configs, and logs. +- The monitoring infrastructure automatically prints additional status blocks when error counts change, making `SOAK_DEBUG=1` optional for most workflows. + +### Specific Tests + +Run individual test functions: + +```bash +# Fresh start test +go test -v -tags=integration ./tests/integration/... -run=TestFreshStart + +# Rapid checkpoints test +go test -v -tags=integration ./tests/integration/... -run=TestRapidCheckpoints + +# 1GB boundary test +go test -v -tags=integration ./tests/integration/... 
-run=Test1GBBoundary ``` + +### Short Mode + +Run abbreviated versions with `-short`: + +```bash +go test -v -tags=integration -short ./tests/integration/... +``` + +This reduces test durations by 10x (e.g., 8 hours becomes 48 minutes). + +## Test Categories + +### Scenario Tests + +Core functionality tests that run in seconds to minutes: + +- `TestFreshStart` - Starting replication before database exists +- `TestDatabaseIntegrity` - Complex schema and data integrity +- `TestDatabaseDeletion` - Source database deletion during replication + +### Concurrent Tests + +Stress and concurrency tests: + +- `TestRapidCheckpoints` - Rapid checkpoint operations under load +- `TestWALGrowth` - Large WAL file handling (100MB+) +- `TestConcurrentOperations` - Multiple databases replicating simultaneously +- `TestBusyTimeout` - Database busy timeout and lock handling + +### Quick Tests + +Configurable duration validation (default 30 minutes): + +- `TestQuickValidation` - Comprehensive validation with wave pattern load + +### Overnight Tests + +Long-running stability tests (default 8 hours): + +- `TestOvernightFile` - 8-hour file-based replication test +- `TestOvernightComprehensive` - 8-hour comprehensive test with large database + +### Boundary Tests + +Edge case and boundary condition tests: + +- `Test1GBBoundary` - SQLite 1GB lock page boundary (page #262145 with 4KB pages) +- `TestLockPageWithDifferentPageSizes` - Lock page handling with various page sizes + +## CI Integration + +### Automatic (Pull Requests) + +Quick tests run automatically on PRs modifying Go code: + +```yaml +- Quick integration tests (TestFreshStart, TestDatabaseIntegrity, TestRapidCheckpoints) +- Timeout: 30 minutes +``` + +### Manual Workflows + +Trigger via GitHub Actions UI: + +**Quick Tests:** +``` +workflow_dispatch → test_type: quick +``` + +**All Scenario Tests:** +``` +workflow_dispatch → test_type: all +``` + +**Long-Running Tests:** +``` +workflow_dispatch → test_type: long +``` + +## Test Infrastructure + +### Helpers (helpers.go) + +- `SetupTestDB(t, name)` - Create test database instance +- `TestDB.Create()` - Create database with WAL mode +- `TestDB.Populate(size)` - Populate to target size +- `TestDB.StartLitestream()` - Start replication +- `TestDB.StopLitestream()` - Stop replication +- `TestDB.Restore(path)` - Restore from replica +- `TestDB.Validate(path)` - Full validation (integrity, checksum, data) +- `TestDB.QuickValidate(path)` - Quick validation +- `TestDB.GenerateLoad(...)` - Generate database load +- `GetTestDuration(t, default)` - Get configurable test duration +- `RequireBinaries(t)` - Check for required binaries + +### Fixtures (fixtures.go) + +- `DefaultLoadConfig()` - Load generation configuration +- `DefaultPopulateConfig()` - Database population configuration +- `CreateComplexTestSchema(db)` - Multi-table schema with foreign keys +- `PopulateComplexTestData(db, ...)` - Populate complex data +- `LargeWALScenario()` - Large WAL test scenario +- `RapidCheckpointsScenario()` - Rapid checkpoint scenario + +A minimal sketch showing how these pieces fit together appears after the Test Artifacts listing below. + +## Test Artifacts + +Tests create temporary directories via `t.TempDir()`: + +``` +/tmp/<test-dir>/ +├── <name>.db # Test database +├── <name>.db-wal # WAL file +├── <name>.db-shm # Shared memory +├── replica/ # Replica directory +│ └── ltx/0/ # LTX files +├── litestream.log # Litestream output +└── *-restored.db # Restored databases +``` + +Artifacts are automatically cleaned up after tests complete.
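+
+To see how the helpers and fixtures above fit together, here is a minimal sketch of a scenario-style test. The test name and seeded row count are illustrative; the helper calls mirror the existing tests in `scenario_test.go` and `concurrent_test.go`:
+
+```go
+//go:build integration
+
+package integration
+
+import (
+	"path/filepath"
+	"testing"
+	"time"
+)
+
+// TestExampleScenario is an illustrative name; it is not part of the suite.
+func TestExampleScenario(t *testing.T) {
+	RequireBinaries(t) // fail fast if bin/litestream binaries are missing
+
+	db := SetupTestDB(t, "example")
+	defer db.Cleanup()
+
+	// Create the database in WAL mode and seed it with some rows.
+	if err := db.Create(); err != nil {
+		t.Fatalf("create: %v", err)
+	}
+	if err := CreateTestTable(t, db.Path); err != nil {
+		t.Fatalf("create table: %v", err)
+	}
+	if err := InsertTestData(t, db.Path, 100); err != nil {
+		t.Fatalf("insert: %v", err)
+	}
+
+	// Replicate, then stop and restore into the test's temp directory.
+	if err := db.StartLitestream(); err != nil {
+		t.Fatalf("start litestream: %v", err)
+	}
+	time.Sleep(3 * time.Second) // give replication a moment, as the other tests do
+	db.StopLitestream()
+
+	restored := filepath.Join(db.TempDir, "example-restored.db")
+	if err := db.Restore(restored); err != nil {
+		t.Fatalf("restore: %v", err)
+	}
+	if err := db.QuickValidate(restored); err != nil {
+		t.Fatalf("validate: %v", err)
+	}
+}
+```
+
+Run it like any other quick test: `go test -v -tags=integration ./tests/integration/... -run=TestExampleScenario`.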
+ +## Debugging Tests + +### View Litestream Logs + +```go +log, err := db.GetLitestreamLog() +fmt.Println(log) +``` + +### Check for Errors + +```go +errors, err := db.CheckForErrors() +for _, e := range errors { + t.Logf("Error: %s", e) +} +``` + +### Inspect Replica + +```go +fileCount, _ := db.GetReplicaFileCount() +t.Logf("LTX files: %d", fileCount) +``` + +### Check Database Size + +```go +size, _ := db.GetDatabaseSize() +t.Logf("DB size: %.2f MB", float64(size)/(1024*1024)) +``` + +## Migration from Bash + +This is part of an ongoing effort to migrate bash test scripts to Go integration tests. This migration improves maintainability, enables CI integration, and provides platform independence. + +### Test Directory Organization + +Three distinct test locations serve different purposes: + +**`tests/integration/` (this directory)** - Go-based integration and soak tests: +- Quick integration tests: `scenario_test.go`, `concurrent_test.go`, `boundary_test.go` +- Soak tests (2-8 hours): `comprehensive_soak_test.go`, `minio_soak_test.go`, `overnight_s3_soak_test.go` +- All tests use proper Go testing infrastructure with build tags + +**`scripts/` (top-level)** - Utility scripts only (soak tests migrated to Go): +- `analyze-test-results.sh` - Post-test analysis utility +- `setup-homebrew-tap.sh` - Packaging script (not a test) + +**`cmd/litestream-test/scripts/`** - Scenario and debugging bash scripts (being phased out): +- Bug reproduction scripts for specific issues (#752, #754) +- Format & upgrade tests for version compatibility +- S3 retention tests with Python mock +- Quick validation and setup utilities + +### Migration Status + +**Migrated from `scripts/` (5 scripts):** +- `test-quick-validation.sh` → `quick_test.go::TestQuickValidation` (CI: ✅) +- `test-overnight.sh` → `overnight_test.go::TestOvernightFile` (CI: ❌ too long) +- `test-comprehensive.sh` → `comprehensive_soak_test.go::TestComprehensiveSoak` (CI: ❌ soak test) +- `test-minio-s3.sh` → `minio_soak_test.go::TestMinIOSoak` (CI: ❌ soak test, requires Docker) +- `test-overnight-s3.sh` → `overnight_s3_soak_test.go::TestOvernightS3Soak` (CI: ❌ soak test, 8 hours) + +**Migrated from `cmd/litestream-test/scripts/` (9 scripts):** +- `test-fresh-start.sh` → `scenario_test.go::TestFreshStart` +- `test-database-integrity.sh` → `scenario_test.go::TestDatabaseIntegrity` +- `test-database-deletion.sh` → `scenario_test.go::TestDatabaseDeletion` +- `test-replica-failover.sh` → NOT MIGRATED (feature removed from Litestream) +- `test-rapid-checkpoints.sh` → `concurrent_test.go::TestRapidCheckpoints` +- `test-wal-growth.sh` → `concurrent_test.go::TestWALGrowth` +- `test-concurrent-operations.sh` → `concurrent_test.go::TestConcurrentOperations` +- `test-busy-timeout.sh` → `concurrent_test.go::TestBusyTimeout` +- `test-1gb-boundary.sh` → `boundary_test.go::Test1GBBoundary` + +**Remaining Bash Scripts:** + +_scripts/_ (2 scripts remaining): +- `analyze-test-results.sh` - Post-test analysis utility (may stay as bash) +- `setup-homebrew-tap.sh` - Packaging script (not a test) + +_cmd/litestream-test/scripts/_ (16 scripts remaining): +- Bug reproduction scripts: `reproduce-critical-bug.sh`, `test-754-*.sh`, `test-v0.5-*.sh` +- Format & upgrade tests: `test-format-isolation.sh`, `test-upgrade-*.sh`, `test-massive-upgrade.sh` +- S3 retention tests: `test-s3-retention-*.sh` (4 scripts, use Python S3 mock) +- Utility: `verify-test-setup.sh` + +### Why Some Tests Aren't in CI + +Per industry best practices, CI tests should complete in < 1 hour (ideally < 
10 minutes): +- ✅ **Quick tests** (< 5 min) - Run on every PR +- ❌ **Soak tests** (2-8 hours) - Run locally before releases only +- ❌ **Long-running tests** (> 30 min) - Too slow for CI feedback loop + +Soak tests are migrated to Go for maintainability but run **locally only**. See "Soak Tests" section below. + +## Soak Tests (Long-Running Stability Tests) + +Soak tests run for 2-8 hours to validate long-term stability under sustained load. These tests are **NOT run in CI** per industry best practices (effective CI requires tests to complete in < 1 hour). + +### Purpose + +Soak tests validate: +- Long-term replication stability +- Memory leak detection over time +- Compaction effectiveness across multiple cycles +- Checkpoint behavior under sustained load +- Recovery from transient issues +- Storage growth patterns + +### When to Run Soak Tests + +- ✅ Before major releases +- ✅ After significant replication changes +- ✅ To reproduce stability issues +- ✅ For performance benchmarking +- ❌ NOT on every commit (too slow for CI) + +### Running Soak Tests Locally + +**File-based comprehensive test (2 hours):** +```bash +go test -v -tags="integration,soak" -timeout=3h -run=TestComprehensiveSoak ./tests/integration/ +``` + +**MinIO S3 test (2 hours, requires Docker):** +```bash +# Ensure Docker is running +go test -v -tags="integration,soak,docker" -timeout=3h -run=TestMinIOSoak ./tests/integration/ +``` + +**Overnight S3 test (8 hours, requires AWS):** +```bash +export AWS_ACCESS_KEY_ID=your_key +export AWS_SECRET_ACCESS_KEY=your_secret +export S3_BUCKET=your-test-bucket +export AWS_REGION=us-east-1 + +go test -v -tags="integration,soak,aws" -timeout=10h -run=TestOvernightS3Soak ./tests/integration/ +``` + +**Run all soak tests:** +```bash +go test -v -tags="integration,soak,docker,aws" -timeout=15h ./tests/integration/ +``` + +### Adjust Duration for Testing + +Tests respect the `-test.short` flag to run abbreviated versions: + +```bash +# Run the comprehensive test in abbreviated short mode (roughly 2 minutes) instead of the full 2 hours +go test -v -tags="integration,soak" -timeout=1h -run=TestComprehensiveSoak ./tests/integration/ -test.short +``` + +### Soak Test Build Tags + +Soak tests use multiple build tags to control execution: + +- `integration` - Required for all integration tests +- `soak` - Marks long-running stability tests (2-8 hours) +- `docker` - Requires Docker (MinIO test) +- `aws` - Requires AWS credentials (S3 tests) + +### Monitoring Soak Tests + +All soak tests log progress every 60 seconds: + +```bash +# Watch test progress in real-time +go test -v -tags="integration,soak" -run=TestComprehensiveSoak ./tests/integration/ 2>&1 | tee soak-test.log +``` + +Metrics reported during execution: +- Database size and WAL size +- Row count +- Replica statistics (snapshots, LTX segments) +- Operation counts (checkpoints, compactions, syncs) +- Error counts +- Write rate + +### Soak Test Summary + +| Test | Duration | Requirements | What It Tests | +|------|----------|--------------|---------------| +| TestComprehensiveSoak | 2h | None | File-based replication with aggressive compaction | +| TestMinIOSoak | 2h | Docker | S3-compatible storage via MinIO container | +| TestOvernightS3Soak | 8h | AWS credentials | Real S3 replication, overnight stability | + +## Benefits Over Bash + +1. **Type Safety** - Compile-time error checking +2. **Better Debugging** - Use standard Go debugging tools +3. **Code Reuse** - Shared helpers and fixtures +4. **Parallel Execution** - Tests can run concurrently +5.
**CI Integration** - Run automatically on PRs +6. **Test Coverage** - Measure code coverage +7. **Consistent Patterns** - Standard Go testing conventions +8. **Better Error Messages** - Structured, clear reporting +9. **Platform Independent** - Works on Linux, macOS, Windows +10. **IDE Integration** - Full editor support + +## Contributing + +When adding new integration tests: + +1. Use appropriate build tags (`//go:build integration` or `//go:build integration && long`) +2. Call `RequireBinaries(t)` to check prerequisites +3. Use `SetupTestDB(t, name)` for test setup +4. Call `defer db.Cleanup()` for automatic cleanup +5. Log test progress with descriptive messages +6. Use `GetTestDuration(t, default)` for configurable durations +7. Add test to CI workflow if appropriate +8. Update this README with new test documentation + +## Related Documentation + +- [cmd/litestream-test README](../../cmd/litestream-test/README.md) - Testing harness CLI +- [scripts/README.md](../../scripts/README.md) - Legacy bash test scripts +- [GitHub Issue #798](https://github.com/benbjohnson/litestream/issues/798) - Migration tracking diff --git a/tests/integration/boundary_test.go b/tests/integration/boundary_test.go new file mode 100644 index 000000000..fe3d73d7d --- /dev/null +++ b/tests/integration/boundary_test.go @@ -0,0 +1,200 @@ +//go:build integration + +package integration + +import ( + "fmt" + "path/filepath" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +func Test1GBBoundary(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + t.Log("Testing: SQLite 1GB lock page boundary handling") + t.Log("This tests database growth beyond 1GB with 4KB pages (lock page at #262145)") + + db := SetupTestDB(t, "1gb-boundary") + defer db.Cleanup() + + t.Log("[1] Creating database with 4KB page size...") + if err := db.CreateWithPageSize(4096); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + t.Log("✓ Database created with 4KB pages") + + t.Log("[2] Populating to 1.5GB to cross lock page boundary...") + if err := db.PopulateWithOptions("1.5GB", 4096, 1024); err != nil { + t.Fatalf("Failed to populate database: %v", err) + } + + dbSize, err := db.GetDatabaseSize() + if err != nil { + t.Fatalf("Failed to get database size: %v", err) + } + + sizeGB := float64(dbSize) / (1024 * 1024 * 1024) + t.Logf("✓ Database populated: %.2f GB", sizeGB) + + if sizeGB < 1.0 { + t.Fatalf("Database did not reach 1GB threshold: %.2f GB", sizeGB) + } + + t.Log("[3] Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(30 * time.Second) + + t.Log("[4] Checking replication across lock page boundary...") + fileCount, err := db.GetReplicaFileCount() + if err != nil { + t.Fatalf("Failed to check replica: %v", err) + } + + if fileCount == 0 { + t.Fatal("No LTX files created!") + } + + t.Logf("✓ Replication started: %d LTX files", fileCount) + + t.Log("[5] Checking for lock page errors...") + errors, err := db.CheckForErrors() + if err != nil { + t.Fatalf("Failed to check errors: %v", err) + } + + lockPageErrors := 0 + for _, errMsg := range errors { + if containsAny(errMsg, []string{"lock page", "page 262145", "locking page"}) { + lockPageErrors++ + t.Logf("Lock page error: %s", errMsg) + } + } + + if lockPageErrors > 0 { + t.Fatalf("Found %d lock page errors!", lockPageErrors) + } + + t.Log("✓ No lock page errors detected") + + db.StopLitestream() + 
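+	// Give the stopped Litestream process a moment to shut down before restoring from the replica.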
time.Sleep(2 * time.Second) + + t.Log("[6] Testing restore of large database...") + restoredPath := filepath.Join(db.TempDir, "1gb-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + t.Log("[7] Validating restored database integrity...") + if err := db.QuickValidate(restoredPath); err != nil { + t.Fatalf("Validation failed: %v", err) + } + + restoredDB := &TestDB{Path: restoredPath, t: t} + restoredSize, _ := restoredDB.GetDatabaseSize() + restoredSizeGB := float64(restoredSize) / (1024 * 1024 * 1024) + + t.Logf("✓ Restored database size: %.2f GB", restoredSizeGB) + + if restoredSizeGB < 0.9 { + t.Fatalf("Restored database too small: %.2f GB (expected ~%.2f GB)", restoredSizeGB, sizeGB) + } + + t.Log("TEST PASSED: 1GB lock page boundary handled correctly") +} + +func TestLockPageWithDifferentPageSizes(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + t.Log("Testing: Lock page handling with different SQLite page sizes") + + pageSizes := []struct { + size int + lockPageNum int + targetSizeMB int + }{ + {4096, 262145, 1200}, + {8192, 131073, 1200}, + } + + for _, ps := range pageSizes { + t.Run(fmt.Sprintf("PageSize%d", ps.size), func(t *testing.T) { + db := SetupTestDB(t, fmt.Sprintf("lockpage-%d", ps.size)) + defer db.Cleanup() + + t.Logf("[1] Creating database with %d byte page size (lock page at #%d)...", ps.size, ps.lockPageNum) + if err := db.CreateWithPageSize(ps.size); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + t.Logf("[2] Populating to %dMB...", ps.targetSizeMB) + if err := db.PopulateWithOptions(fmt.Sprintf("%dMB", ps.targetSizeMB), ps.size, 1024); err != nil { + t.Fatalf("Failed to populate database: %v", err) + } + + dbSize, _ := db.GetDatabaseSize() + t.Logf("✓ Database: %.2f MB", float64(dbSize)/(1024*1024)) + + t.Log("[3] Starting replication...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(20 * time.Second) + + fileCount, _ := db.GetReplicaFileCount() + t.Logf("✓ LTX files: %d", fileCount) + + db.StopLitestream() + + t.Log("[4] Testing restore...") + restoredPath := filepath.Join(db.TempDir, fmt.Sprintf("lockpage-%d-restored.db", ps.size)) + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Test passed for page size", ps.size) + }) + } + + t.Log("TEST PASSED: All page sizes handled correctly") +} + +func containsAny(s string, substrs []string) bool { + for _, substr := range substrs { + if contains(s, substr) { + return true + } + } + return false +} + +func contains(s, substr string) bool { + return len(s) >= len(substr) && (s == substr || len(s) > len(substr) && (s[:len(substr)] == substr || s[len(s)-len(substr):] == substr || anySubstring(s, substr))) +} + +func anySubstring(s, substr string) bool { + for i := 0; i <= len(s)-len(substr); i++ { + if s[i:i+len(substr)] == substr { + return true + } + } + return false +} diff --git a/tests/integration/comprehensive_soak_test.go b/tests/integration/comprehensive_soak_test.go new file mode 100644 index 000000000..dd076f33f --- /dev/null +++ b/tests/integration/comprehensive_soak_test.go @@ -0,0 +1,266 @@ +//go:build integration && soak + +package integration + +import ( + "context" + "fmt" + "path/filepath" + "testing" + "time" +) + +// TestComprehensiveSoak runs a comprehensive soak test with aggressive settings +// 
to validate all Litestream features: replication, snapshots, compaction, checkpoints. +// +// Default duration: 2 hours +// Can be shortened with: go test -test.short (runs for roughly 2 minutes) +// +// This test exercises: +// - Continuous replication +// - Snapshot generation (every 10m) +// - Compaction (30s/1m/5m/15m/30m intervals) +// - Checkpoint operations +// - Database restoration +func TestComprehensiveSoak(t *testing.T) { + RequireBinaries(t) + + // Determine test duration + duration := GetTestDuration(t, 2*time.Hour) + shortMode := testing.Short() + if shortMode { + duration = 2 * time.Minute + } + + targetSize := "50MB" + writeRate := 500 + if shortMode { + targetSize = "5MB" + writeRate = 100 + } + + t.Logf("================================================") + t.Logf("Litestream Comprehensive Soak Test") + t.Logf("================================================") + t.Logf("Duration: %v", duration) + t.Logf("Start time: %s", time.Now().Format(time.RFC3339)) + t.Log("") + t.Log("This test uses aggressive settings to validate:") + t.Log(" - Continuous replication") + t.Log(" - Snapshot generation (every 10m)") + t.Log(" - Compaction (30s/1m/5m intervals)") + t.Log(" - Checkpoint operations") + t.Log(" - Database restoration") + t.Log("") + + startTime := time.Now() + + // Setup test database + db := SetupTestDB(t, "comprehensive-soak") + defer db.Cleanup() + + // Create database + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + // Populate database + t.Logf("Populating database (%s initial data)...", targetSize) + if err := db.Populate(targetSize); err != nil { + t.Fatalf("Failed to populate database: %v", err) + } + t.Log("✓ Database populated") + t.Log("") + + // Create aggressive configuration for testing + t.Log("Creating aggressive test configuration...") + replicaURL := fmt.Sprintf("file://%s", filepath.ToSlash(db.ReplicaPath)) + configPath := CreateSoakConfig(db.Path, replicaURL, nil, shortMode) + db.ConfigPath = configPath + t.Logf("✓ Configuration created: %s", configPath) + t.Log("") + + // Start Litestream + t.Log("Starting Litestream replication...") + if err := db.StartLitestreamWithConfig(configPath); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + t.Logf("✓ Litestream running (PID: %d)", db.LitestreamPID) + t.Log("") + + // Start load generator with heavy sustained load + t.Log("Starting load generator (heavy sustained load)...") + t.Logf(" Write rate: %d writes/second", writeRate) + t.Logf(" Pattern: wave (simulates varying load)") + t.Logf(" Payload size: 4KB") + t.Logf(" Workers: 8") + t.Log("") + + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + + // Setup signal handler for graceful interruption + testInfo := &TestInfo{ + StartTime: startTime, + Duration: duration, + DB: db, + cancel: cancel, + } + setupSignalHandler(t, cancel, testInfo) + + // Run load generation in background + loadDone := make(chan error, 1) + go func() { + loadDone <- db.GenerateLoad(ctx, writeRate, duration, "wave") + }() + + // Monitor every 60 seconds + t.Log("Running comprehensive test...") + t.Log("Monitor will report every 60 seconds") + t.Log("Press Ctrl+C twice within 5 seconds to stop early") + t.Log("================================================") + t.Log("") + + refreshStats := func() { + testInfo.RowCount, _ = db.GetRowCount("load_test") + if testInfo.RowCount == 0 { + testInfo.RowCount, _ = db.GetRowCount("test_table_0") + } + if testInfo.RowCount == 0 { + testInfo.RowCount,
_ = db.GetRowCount("test_data") + } + testInfo.FileCount, _ = db.GetReplicaFileCount() + } + + logMetrics := func() { + LogSoakMetrics(t, db, "comprehensive") + if db.LitestreamCmd != nil && db.LitestreamCmd.ProcessState != nil { + t.Error("✗ Litestream stopped unexpectedly!") + if testInfo.cancel != nil { + testInfo.cancel() + } + } + } + + MonitorSoakTest(t, db, ctx, testInfo, refreshStats, logMetrics) + + // Wait for load generation to complete + if err := <-loadDone; err != nil { + t.Logf("Load generation completed: %v", err) + } + + if err := db.WaitForSnapshots(30 * time.Second); err != nil { + t.Fatalf("Failed waiting for snapshot: %v", err) + } + + t.Log("") + t.Log("================================================") + t.Log("Final Test Results") + t.Log("================================================") + t.Log("") + + // Stop Litestream + t.Log("Stopping Litestream...") + if err := db.StopLitestream(); err != nil { + t.Logf("Warning: Failed to stop Litestream cleanly: %v", err) + } + + // Final statistics + t.Log("Database Statistics:") + if dbSize, err := db.GetDatabaseSize(); err == nil { + t.Logf(" Final size: %.2f MB", float64(dbSize)/(1024*1024)) + } + + // Count rows using different table name possibilities + var rowCount int + var err error + if rowCount, err = db.GetRowCount("load_test"); err != nil { + if rowCount, err = db.GetRowCount("test_table_0"); err != nil { + if rowCount, err = db.GetRowCount("test_data"); err != nil { + t.Logf(" Warning: Could not get row count: %v", err) + } + } + } + if err == nil { + t.Logf(" Total rows: %d", rowCount) + } + t.Log("") + + // Replica statistics + t.Log("Replication Statistics:") + if fileCount, err := db.GetReplicaFileCount(); err == nil { + t.Logf(" LTX segments: %d", fileCount) + } + + // Check for errors + errors, _ := db.CheckForErrors() + criticalErrors := 0 + for _, errLine := range errors { + // Filter out known non-critical errors + if !containsAny(errLine, []string{"page size not initialized"}) { + criticalErrors++ + } + } + t.Logf(" Critical errors: %d", criticalErrors) + t.Log("") + + // Test restoration + t.Log("Testing restoration...") + restoredPath := filepath.Join(db.TempDir, "restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restoration failed: %v", err) + } + t.Log("✓ Restoration successful!") + + // Validate + t.Log("") + t.Log("Validating restored database integrity...") + restoredDB := &TestDB{Path: restoredPath, t: t} + if err := restoredDB.IntegrityCheck(); err != nil { + t.Fatalf("Integrity check failed: %v", err) + } + t.Log("✓ Integrity check passed!") + + // Analyze test results + analysis := AnalyzeSoakTest(t, db, duration) + PrintSoakTestAnalysis(t, analysis) + + // Test Summary + t.Log("================================================") + t.Log("Test Summary") + t.Log("================================================") + + testPassed := true + issues := []string{} + + if criticalErrors > 0 { + testPassed = false + issues = append(issues, fmt.Sprintf("Critical errors detected: %d", criticalErrors)) + } + + if analysis.FinalFileCount == 0 { + testPassed = false + issues = append(issues, "No files created (replication not working)") + } + + if testPassed { + t.Log("✓ TEST PASSED!") + t.Log("") + t.Log("The configuration is ready for production use.") + } else { + t.Log("⚠ TEST COMPLETED WITH ISSUES:") + for _, issue := range issues { + t.Logf(" - %s", issue) + } + t.Log("") + t.Log("Review the logs for details:") + logPath, _ := db.GetLitestreamLog() + t.Logf(" %s", 
logPath) + t.Fail() + } + + t.Log("") + t.Logf("Test duration: %v", time.Since(startTime).Round(time.Second)) + t.Logf("Results available in: %s", db.TempDir) + t.Log("================================================") +} diff --git a/tests/integration/concurrent_test.go b/tests/integration/concurrent_test.go new file mode 100644 index 000000000..cccb98d14 --- /dev/null +++ b/tests/integration/concurrent_test.go @@ -0,0 +1,482 @@ +//go:build integration + +package integration + +import ( + "context" + "database/sql" + "fmt" + "os" + "path/filepath" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +func TestRapidCheckpoints(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + t.Log("Testing: Litestream under rapid checkpoint pressure") + + db := SetupTestDB(t, "rapid-checkpoints") + defer db.Cleanup() + + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + t.Log("[1] Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(3 * time.Second) + + t.Log("[2] Generating rapid writes with frequent checkpoints...") + sqlDB, err := sql.Open("sqlite3", db.Path) + if err != nil { + t.Fatalf("Failed to open database: %v", err) + } + defer sqlDB.Close() + + if _, err := sqlDB.Exec(` + CREATE TABLE checkpoint_test ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + data BLOB, + timestamp INTEGER + ) + `); err != nil { + t.Fatalf("Failed to create table: %v", err) + } + + data := make([]byte, 4096) + checkpointCount := 0 + + for i := 0; i < 1000; i++ { + if _, err := sqlDB.Exec( + "INSERT INTO checkpoint_test (data, timestamp) VALUES (?, ?)", + data, + time.Now().Unix(), + ); err != nil { + t.Fatalf("Failed to insert row %d: %v", i, err) + } + + if i%100 == 0 { + if _, err := sqlDB.Exec("PRAGMA wal_checkpoint(TRUNCATE)"); err != nil { + t.Logf("Checkpoint %d failed: %v", checkpointCount, err) + } else { + checkpointCount++ + t.Logf("Checkpoint %d completed at row %d", checkpointCount, i) + } + } + } + + t.Logf("✓ Generated 1000 writes with %d checkpoints", checkpointCount) + + time.Sleep(5 * time.Second) + + db.StopLitestream() + time.Sleep(2 * time.Second) + + t.Log("[3] Checking for errors...") + errors, err := db.CheckForErrors() + if err != nil { + t.Fatalf("Failed to check errors: %v", err) + } + + if len(errors) > 5 { + t.Fatalf("Too many errors (%d), showing first 5:\n%v", len(errors), errors[:5]) + } else if len(errors) > 0 { + t.Logf("Found %d errors (acceptable for checkpoint stress)", len(errors)) + } + + t.Log("[4] Verifying replica...") + fileCount, err := db.GetReplicaFileCount() + if err != nil { + t.Fatalf("Failed to check replica: %v", err) + } + + if fileCount == 0 { + t.Fatal("No replica files created!") + } + + t.Logf("✓ Replica created with %d files", fileCount) + + t.Log("[5] Testing restore...") + restoredPath := filepath.Join(db.TempDir, "checkpoint-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + origCount, err := db.GetRowCount("checkpoint_test") + if err != nil { + t.Fatalf("Failed to get original row count: %v", err) + } + + restoredDB := &TestDB{Path: restoredPath, t: t} + restCount, err := restoredDB.GetRowCount("checkpoint_test") + if err != nil { + t.Fatalf("Failed to get restored row count: %v", err) + } + + if origCount != restCount { + t.Fatalf("Count mismatch: original=%d, 
restored=%d", origCount, restCount) + } + + t.Logf("✓ Data integrity verified: %d rows", origCount) + t.Log("TEST PASSED: Handled rapid checkpoints successfully") +} + +func TestWALGrowth(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + duration := GetTestDuration(t, 2*time.Minute) + t.Logf("Testing: Large WAL file handling (duration: %v)", duration) + + db := SetupTestDB(t, "wal-growth") + defer db.Cleanup() + + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + t.Log("[1] Creating test table...") + sqlDB, err := sql.Open("sqlite3", db.Path) + if err != nil { + t.Fatalf("Failed to open database: %v", err) + } + defer sqlDB.Close() + + if _, err := sqlDB.Exec(` + CREATE TABLE wal_test ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + data BLOB + ) + `); err != nil { + t.Fatalf("Failed to create table: %v", err) + } + + t.Log("✓ Table created") + + t.Log("[2] Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(3 * time.Second) + + t.Log("[3] Generating sustained write load...") + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + + config := DefaultLoadConfig() + config.WriteRate = 400 + config.Duration = duration + config.Pattern = LoadPatternWave + config.PayloadSize = 10 * 1024 + config.Workers = 4 + + if err := db.GenerateLoad(ctx, config.WriteRate, config.Duration, string(config.Pattern)); err != nil && ctx.Err() == nil { + t.Fatalf("Load generation failed: %v", err) + } + + t.Log("✓ Load generation complete") + + time.Sleep(5 * time.Second) + + t.Log("[4] Checking WAL size...") + walPath := db.Path + "-wal" + walSize, err := getFileSize(walPath) + if err != nil { + t.Logf("WAL file not found (may have been checkpointed): %v", err) + } else { + t.Logf("WAL size: %.2f MB", float64(walSize)/(1024*1024)) + } + + dbSize, err := db.GetDatabaseSize() + if err != nil { + t.Fatalf("Failed to get database size: %v", err) + } + + t.Logf("Total database size: %.2f MB", float64(dbSize)/(1024*1024)) + + db.StopLitestream() + time.Sleep(2 * time.Second) + + t.Log("[5] Checking for errors...") + errors, err := db.CheckForErrors() + if err != nil { + t.Fatalf("Failed to check errors: %v", err) + } + + if len(errors) > 10 { + t.Fatalf("Too many errors (%d), showing first 5:\n%v", len(errors), errors[:5]) + } + + t.Logf("✓ Found %d errors (acceptable)", len(errors)) + + t.Log("[6] Testing restore...") + restoredPath := filepath.Join(db.TempDir, "wal-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + origCount, err := db.GetRowCount("wal_test") + if err != nil { + t.Fatalf("Failed to get original row count: %v", err) + } + + restoredDB := &TestDB{Path: restoredPath, t: t} + restCount, err := restoredDB.GetRowCount("wal_test") + if err != nil { + t.Fatalf("Failed to get restored row count: %v", err) + } + + if origCount != restCount { + t.Fatalf("Count mismatch: original=%d, restored=%d", origCount, restCount) + } + + t.Logf("✓ Data integrity verified: %d rows", origCount) + t.Log("TEST PASSED: Handled large WAL successfully") +} + +func TestConcurrentOperations(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + duration := GetTestDuration(t, 3*time.Minute) + t.Logf("Testing: Multiple databases replicating 
concurrently (duration: %v)", duration) + + dbCount := 3 + dbs := make([]*TestDB, dbCount) + + for i := 0; i < dbCount; i++ { + dbs[i] = SetupTestDB(t, fmt.Sprintf("concurrent-%d", i)) + defer dbs[i].Cleanup() + } + + t.Log("[1] Creating databases...") + for i, db := range dbs { + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database %d: %v", i, err) + } + + if err := CreateTestTable(t, db.Path); err != nil { + t.Fatalf("Failed to create table for database %d: %v", i, err) + } + } + + t.Logf("✓ Created %d databases", dbCount) + + t.Log("[2] Starting Litestream for all databases...") + for i, db := range dbs { + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream for database %d: %v", i, err) + } + time.Sleep(1 * time.Second) + } + + t.Logf("✓ All Litestream instances running") + + t.Log("[3] Generating concurrent load...") + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + + done := make(chan error, dbCount) + + for i, db := range dbs { + go func(idx int, database *TestDB) { + config := DefaultLoadConfig() + config.WriteRate = 50 + config.Duration = duration + config.Pattern = LoadPatternConstant + config.Workers = 2 + + err := database.GenerateLoad(ctx, config.WriteRate, config.Duration, string(config.Pattern)) + done <- err + }(i, db) + } + + for i := 0; i < dbCount; i++ { + if err := <-done; err != nil && ctx.Err() == nil { + t.Logf("Load generation %d had error: %v", i, err) + } + } + + t.Log("✓ Concurrent load complete") + + time.Sleep(5 * time.Second) + + t.Log("[4] Stopping all Litestream instances...") + for _, db := range dbs { + db.StopLitestream() + } + + time.Sleep(2 * time.Second) + + t.Log("[5] Verifying all replicas...") + for i, db := range dbs { + fileCount, err := db.GetReplicaFileCount() + if err != nil { + t.Fatalf("Failed to check replica %d: %v", i, err) + } + + if fileCount == 0 { + t.Fatalf("Database %d has no replica files!", i) + } + + t.Logf("✓ Database %d: %d replica files", i, fileCount) + } + + t.Log("[6] Testing restore for all databases...") + for i, db := range dbs { + restoredPath := filepath.Join(db.TempDir, fmt.Sprintf("concurrent-restored-%d.db", i)) + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed for database %d: %v", i, err) + } + + origCount, _ := db.GetRowCount("test_data") + restoredDB := &TestDB{Path: restoredPath, t: t} + restCount, _ := restoredDB.GetRowCount("test_data") + + if origCount != restCount { + t.Fatalf("Database %d count mismatch: original=%d, restored=%d", i, origCount, restCount) + } + + t.Logf("✓ Database %d verified: %d rows", i, origCount) + } + + t.Log("TEST PASSED: Concurrent replication works correctly") +} + +func TestBusyTimeout(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + t.Log("Testing: Database busy timeout handling") + + db := SetupTestDB(t, "busy-timeout") + defer db.Cleanup() + + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + t.Log("[1] Creating test data...") + if err := CreateTestTable(t, db.Path); err != nil { + t.Fatalf("Failed to create table: %v", err) + } + + if err := InsertTestData(t, db.Path, 100); err != nil { + t.Fatalf("Failed to insert test data: %v", err) + } + + t.Log("✓ Created table with 100 rows") + + t.Log("[2] Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(3 * 
time.Second) + + t.Log("[3] Simulating concurrent access with long transactions...") + sqlDB, err := sql.Open("sqlite3", db.Path+"?_busy_timeout=5000") + if err != nil { + t.Fatalf("Failed to open database: %v", err) + } + defer sqlDB.Close() + + tx, err := sqlDB.Begin() + if err != nil { + t.Fatalf("Failed to begin transaction: %v", err) + } + + for i := 0; i < 500; i++ { + if _, err := tx.Exec( + "INSERT INTO test_data (data, created_at) VALUES (?, ?)", + fmt.Sprintf("busy test %d", i), + time.Now().Unix(), + ); err != nil { + t.Fatalf("Failed to insert in transaction: %v", err) + } + + if i%100 == 0 { + time.Sleep(500 * time.Millisecond) + } + } + + if err := tx.Commit(); err != nil { + t.Fatalf("Failed to commit transaction: %v", err) + } + + t.Log("✓ Long transaction completed") + + time.Sleep(5 * time.Second) + + db.StopLitestream() + time.Sleep(2 * time.Second) + + t.Log("[4] Checking for errors...") + errors, err := db.CheckForErrors() + if err != nil { + t.Fatalf("Failed to check errors: %v", err) + } + + if len(errors) > 0 { + t.Logf("Found %d errors (may include busy timeout messages)", len(errors)) + } + + t.Log("[5] Testing restore...") + restoredPath := filepath.Join(db.TempDir, "busy-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + origCount, err := db.GetRowCount("test_data") + if err != nil { + t.Fatalf("Failed to get original row count: %v", err) + } + + restoredDB := &TestDB{Path: restoredPath, t: t} + restCount, err := restoredDB.GetRowCount("test_data") + if err != nil { + t.Fatalf("Failed to get restored row count: %v", err) + } + + if origCount != restCount { + t.Fatalf("Count mismatch: original=%d, restored=%d", origCount, restCount) + } + + t.Logf("✓ Data integrity verified: %d rows", origCount) + t.Log("TEST PASSED: Busy timeout handled correctly") +} + +func getFileSize(path string) (int64, error) { + info, err := os.Stat(path) + if err != nil { + return 0, err + } + return info.Size(), nil +} diff --git a/tests/integration/fixtures.go b/tests/integration/fixtures.go new file mode 100644 index 000000000..4a553e067 --- /dev/null +++ b/tests/integration/fixtures.go @@ -0,0 +1,298 @@ +//go:build integration + +package integration + +import ( + "crypto/rand" + "database/sql" + "fmt" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +type LoadPattern string + +const ( + LoadPatternConstant LoadPattern = "constant" + LoadPatternBurst LoadPattern = "burst" + LoadPatternRandom LoadPattern = "random" + LoadPatternWave LoadPattern = "wave" +) + +type LoadConfig struct { + WriteRate int + Duration time.Duration + Pattern LoadPattern + PayloadSize int + ReadRatio float64 + Workers int +} + +func DefaultLoadConfig() *LoadConfig { + return &LoadConfig{ + WriteRate: 100, + Duration: 1 * time.Minute, + Pattern: LoadPatternConstant, + PayloadSize: 1024, + ReadRatio: 0.2, + Workers: 1, + } +} + +type PopulateConfig struct { + TargetSize string + RowSize int + BatchSize int + TableCount int + IndexRatio float64 + PageSize int +} + +func DefaultPopulateConfig() *PopulateConfig { + return &PopulateConfig{ + TargetSize: "100MB", + RowSize: 1024, + BatchSize: 1000, + TableCount: 1, + IndexRatio: 0.2, + PageSize: 4096, + } +} + +func CreateComplexTestSchema(db *sql.DB) error { + schemas := []string{ + `CREATE TABLE IF NOT EXISTS users ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + username TEXT NOT NULL UNIQUE, + email TEXT NOT NULL, + created_at INTEGER NOT NULL + )`, + `CREATE TABLE IF NOT 
EXISTS posts ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + user_id INTEGER NOT NULL, + title TEXT NOT NULL, + content TEXT, + created_at INTEGER NOT NULL, + FOREIGN KEY (user_id) REFERENCES users(id) + )`, + `CREATE TABLE IF NOT EXISTS comments ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + post_id INTEGER NOT NULL, + user_id INTEGER NOT NULL, + content TEXT NOT NULL, + created_at INTEGER NOT NULL, + FOREIGN KEY (post_id) REFERENCES posts(id), + FOREIGN KEY (user_id) REFERENCES users(id) + )`, + `CREATE INDEX IF NOT EXISTS idx_posts_user_id ON posts(user_id)`, + `CREATE INDEX IF NOT EXISTS idx_posts_created_at ON posts(created_at)`, + `CREATE INDEX IF NOT EXISTS idx_comments_post_id ON comments(post_id)`, + `CREATE INDEX IF NOT EXISTS idx_comments_created_at ON comments(created_at)`, + } + + for _, schema := range schemas { + if _, err := db.Exec(schema); err != nil { + return fmt.Errorf("execute schema: %w", err) + } + } + + return nil +} + +func PopulateComplexTestData(db *sql.DB, userCount, postsPerUser, commentsPerPost int) error { + tx, err := db.Begin() + if err != nil { + return fmt.Errorf("begin transaction: %w", err) + } + defer tx.Rollback() + + userStmt, err := tx.Prepare("INSERT INTO users (username, email, created_at) VALUES (?, ?, ?)") + if err != nil { + return fmt.Errorf("prepare user statement: %w", err) + } + defer userStmt.Close() + + postStmt, err := tx.Prepare("INSERT INTO posts (user_id, title, content, created_at) VALUES (?, ?, ?, ?)") + if err != nil { + return fmt.Errorf("prepare post statement: %w", err) + } + defer postStmt.Close() + + commentStmt, err := tx.Prepare("INSERT INTO comments (post_id, user_id, content, created_at) VALUES (?, ?, ?, ?)") + if err != nil { + return fmt.Errorf("prepare comment statement: %w", err) + } + defer commentStmt.Close() + + now := time.Now().Unix() + + for u := 1; u <= userCount; u++ { + userResult, err := userStmt.Exec( + fmt.Sprintf("user%d", u), + fmt.Sprintf("user%d@test.com", u), + now, + ) + if err != nil { + return fmt.Errorf("insert user: %w", err) + } + + userID, err := userResult.LastInsertId() + if err != nil { + return fmt.Errorf("get user id: %w", err) + } + + for p := 1; p <= postsPerUser; p++ { + postResult, err := postStmt.Exec( + userID, + fmt.Sprintf("Post %d from user %d", p, u), + generateRandomContent(100), + now, + ) + if err != nil { + return fmt.Errorf("insert post: %w", err) + } + + postID, err := postResult.LastInsertId() + if err != nil { + return fmt.Errorf("get post id: %w", err) + } + + for c := 1; c <= commentsPerPost; c++ { + commentUserID := (u + c) % userCount + if commentUserID == 0 { + commentUserID = userCount + } + + _, err := commentStmt.Exec( + postID, + commentUserID, + generateRandomContent(50), + now, + ) + if err != nil { + return fmt.Errorf("insert comment: %w", err) + } + } + } + } + + return tx.Commit() +} + +func generateRandomContent(length int) string { + const charset = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 " + b := make([]byte, length) + rand.Read(b) + + for i := range b { + b[i] = charset[int(b[i])%len(charset)] + } + + return string(b) +} + +type TestScenario struct { + Name string + Description string + Setup func(*sql.DB) error + Validate func(*sql.DB, *sql.DB) error +} + +func LargeWALScenario() *TestScenario { + return &TestScenario{ + Name: "Large WAL", + Description: "Generate large WAL file to test handling", + Setup: func(db *sql.DB) error { + if _, err := db.Exec(` + CREATE TABLE test_wal ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + data BLOB + ) 
+ `); err != nil { + return err + } + + data := make([]byte, 10*1024) + rand.Read(data) + + for i := 0; i < 10000; i++ { + if _, err := db.Exec("INSERT INTO test_wal (data) VALUES (?)", data); err != nil { + return err + } + } + + return nil + }, + Validate: func(source, restored *sql.DB) error { + var sourceCount, restoredCount int + + if err := source.QueryRow("SELECT COUNT(*) FROM test_wal").Scan(&sourceCount); err != nil { + return fmt.Errorf("query source: %w", err) + } + + if err := restored.QueryRow("SELECT COUNT(*) FROM test_wal").Scan(&restoredCount); err != nil { + return fmt.Errorf("query restored: %w", err) + } + + if sourceCount != restoredCount { + return fmt.Errorf("count mismatch: source=%d, restored=%d", sourceCount, restoredCount) + } + + return nil + }, + } +} + +func RapidCheckpointsScenario() *TestScenario { + return &TestScenario{ + Name: "Rapid Checkpoints", + Description: "Test rapid checkpoint operations", + Setup: func(db *sql.DB) error { + if _, err := db.Exec(` + CREATE TABLE test_checkpoints ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + data TEXT, + timestamp INTEGER + ) + `); err != nil { + return err + } + + for i := 0; i < 1000; i++ { + if _, err := db.Exec( + "INSERT INTO test_checkpoints (data, timestamp) VALUES (?, ?)", + fmt.Sprintf("data %d", i), + time.Now().Unix(), + ); err != nil { + return err + } + + if i%100 == 0 { + if _, err := db.Exec("PRAGMA wal_checkpoint(TRUNCATE)"); err != nil { + return err + } + } + } + + return nil + }, + Validate: func(source, restored *sql.DB) error { + var sourceCount, restoredCount int + + if err := source.QueryRow("SELECT COUNT(*) FROM test_checkpoints").Scan(&sourceCount); err != nil { + return fmt.Errorf("query source: %w", err) + } + + if err := restored.QueryRow("SELECT COUNT(*) FROM test_checkpoints").Scan(&restoredCount); err != nil { + return fmt.Errorf("query restored: %w", err) + } + + if sourceCount != restoredCount { + return fmt.Errorf("count mismatch: source=%d, restored=%d", sourceCount, restoredCount) + } + + return nil + }, + } +} diff --git a/tests/integration/helpers.go b/tests/integration/helpers.go new file mode 100644 index 000000000..e402eba4d --- /dev/null +++ b/tests/integration/helpers.go @@ -0,0 +1,554 @@ +//go:build integration + +package integration + +import ( + "bytes" + "context" + "database/sql" + "fmt" + "io" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" + + "github.com/benbjohnson/litestream" +) + +type TestDB struct { + Path string + ReplicaPath string + ReplicaURL string + ReplicaEnv []string + ConfigPath string + TempDir string + LitestreamCmd *exec.Cmd + LitestreamPID int + t *testing.T +} + +// getBinaryPath returns the cross-platform path to a binary. +// On Windows, it adds the .exe extension. 
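+// As an illustration (not part of the original helper's docs), getBinaryPath("litestream") resolves to
+// "../../bin/litestream" relative to this package (tests/integration), i.e. a binary built from the
+// repository root with "go build -o bin/litestream ./cmd/litestream".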
+func getBinaryPath(name string) string { + binPath := filepath.Join("..", "..", "bin", name) + if runtime.GOOS == "windows" { + binPath += ".exe" + } + return binPath +} + +func streamCommandOutput() bool { + v := strings.ToLower(strings.TrimSpace(os.Getenv("SOAK_DEBUG"))) + switch v { + case "", "0", "false", "off", "no": + return false + default: + return true + } +} + +func configureCmdIO(cmd *exec.Cmd) (bool, *bytes.Buffer, *bytes.Buffer) { + stream := streamCommandOutput() + stdoutBuf := &bytes.Buffer{} + stderrBuf := &bytes.Buffer{} + if stream { + cmd.Stdout = io.MultiWriter(os.Stdout, stdoutBuf) + cmd.Stderr = io.MultiWriter(os.Stderr, stderrBuf) + } else { + cmd.Stdout = stdoutBuf + cmd.Stderr = stderrBuf + } + return stream, stdoutBuf, stderrBuf +} + +func combinedOutput(stdoutBuf, stderrBuf *bytes.Buffer) string { + var sb strings.Builder + if stdoutBuf != nil && stdoutBuf.Len() > 0 { + sb.Write(stdoutBuf.Bytes()) + } + if stderrBuf != nil && stderrBuf.Len() > 0 { + sb.Write(stderrBuf.Bytes()) + } + return strings.TrimSpace(sb.String()) +} + +func SetupTestDB(t *testing.T, name string) *TestDB { + t.Helper() + + var tempDir string + if os.Getenv("SOAK_KEEP_TEMP") != "" { + dir, err := os.MkdirTemp("", fmt.Sprintf("litestream-%s-", name)) + if err != nil { + t.Fatalf("create temp dir: %v", err) + } + tempDir = dir + t.Cleanup(func() { + t.Logf("SOAK_KEEP_TEMP set, preserving test artifacts at: %s", tempDir) + }) + } else { + tempDir = t.TempDir() + } + dbPath := filepath.Join(tempDir, fmt.Sprintf("%s.db", name)) + replicaPath := filepath.Join(tempDir, "replica") + + return &TestDB{ + Path: dbPath, + ReplicaPath: replicaPath, + ReplicaURL: fmt.Sprintf("file://%s", filepath.ToSlash(replicaPath)), + TempDir: tempDir, + t: t, + } +} + +func (db *TestDB) Create() error { + sqlDB, err := sql.Open("sqlite3", db.Path) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer sqlDB.Close() + + if _, err := sqlDB.Exec("PRAGMA journal_mode=WAL"); err != nil { + return fmt.Errorf("set WAL mode: %w", err) + } + + return nil +} + +func (db *TestDB) CreateWithPageSize(pageSize int) error { + sqlDB, err := sql.Open("sqlite3", db.Path) + if err != nil { + return fmt.Errorf("open database: %w", err) + } + defer sqlDB.Close() + + if _, err := sqlDB.Exec(fmt.Sprintf("PRAGMA page_size = %d", pageSize)); err != nil { + return fmt.Errorf("set page size: %w", err) + } + + if _, err := sqlDB.Exec("PRAGMA journal_mode=WAL"); err != nil { + return fmt.Errorf("set WAL mode: %w", err) + } + + return nil +} + +func (db *TestDB) Populate(targetSize string) error { + cmd := exec.Command(getBinaryPath("litestream-test"), "populate", + "-db", db.Path, + "-target-size", targetSize, + ) + + _, stdoutBuf, stderrBuf := configureCmdIO(cmd) + + db.t.Logf("Populating database to %s...", targetSize) + + if err := cmd.Run(); err != nil { + if output := combinedOutput(stdoutBuf, stderrBuf); output != "" { + return fmt.Errorf("populate failed: %w\nOutput: %s", err, output) + } + return fmt.Errorf("populate failed: %w", err) + } + return nil +} + +func (db *TestDB) PopulateWithOptions(targetSize string, pageSize int, rowSize int) error { + cmd := exec.Command(getBinaryPath("litestream-test"), "populate", + "-db", db.Path, + "-target-size", targetSize, + "-page-size", fmt.Sprintf("%d", pageSize), + "-row-size", fmt.Sprintf("%d", rowSize), + ) + + _, stdoutBuf, stderrBuf := configureCmdIO(cmd) + + db.t.Logf("Populating database to %s (page size: %d, row size: %d)...", targetSize, pageSize, rowSize) + + if 
err := cmd.Run(); err != nil { + if output := combinedOutput(stdoutBuf, stderrBuf); output != "" { + return fmt.Errorf("populate failed: %w\nOutput: %s", err, output) + } + return fmt.Errorf("populate failed: %w", err) + } + return nil +} + +func (db *TestDB) GenerateLoad(ctx context.Context, writeRate int, duration time.Duration, pattern string) error { + cmd := exec.CommandContext(ctx, getBinaryPath("litestream-test"), "load", + "-db", db.Path, + "-write-rate", fmt.Sprintf("%d", writeRate), + "-duration", duration.String(), + "-pattern", pattern, + ) + + _, stdoutBuf, stderrBuf := configureCmdIO(cmd) + + db.t.Logf("Starting load generation: %d writes/sec for %v (%s pattern)", writeRate, duration, pattern) + + if err := cmd.Run(); err != nil { + if output := combinedOutput(stdoutBuf, stderrBuf); output != "" { + return fmt.Errorf("load generation failed: %w\nOutput: %s", err, output) + } + return fmt.Errorf("load generation failed: %w", err) + } + return nil +} + +func (db *TestDB) StartLitestream() error { + logPath := filepath.Join(db.TempDir, "litestream.log") + logFile, err := os.Create(logPath) + if err != nil { + return fmt.Errorf("create log file: %w", err) + } + + replicaURL := fmt.Sprintf("file://%s", filepath.ToSlash(db.ReplicaPath)) + cmd := exec.Command(getBinaryPath("litestream"), "replicate", + db.Path, + replicaURL, + ) + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + logFile.Close() + return fmt.Errorf("start litestream: %w", err) + } + + db.LitestreamCmd = cmd + db.LitestreamPID = cmd.Process.Pid + + time.Sleep(2 * time.Second) + + if cmd.ProcessState != nil && cmd.ProcessState.Exited() { + logFile.Close() + return fmt.Errorf("litestream exited immediately") + } + + return nil +} + +func (db *TestDB) StartLitestreamWithConfig(configPath string) error { + logPath := filepath.Join(db.TempDir, "litestream.log") + logFile, err := os.Create(logPath) + if err != nil { + return fmt.Errorf("create log file: %w", err) + } + + db.ConfigPath = configPath + cmd := exec.Command(getBinaryPath("litestream"), "replicate", + "-config", configPath, + ) + cmd.Stdout = logFile + cmd.Stderr = logFile + + if err := cmd.Start(); err != nil { + logFile.Close() + return fmt.Errorf("start litestream: %w", err) + } + + db.LitestreamCmd = cmd + db.LitestreamPID = cmd.Process.Pid + + time.Sleep(2 * time.Second) + + return nil +} + +func (db *TestDB) StopLitestream() error { + if db.LitestreamCmd == nil || db.LitestreamCmd.Process == nil { + return nil + } + + if err := db.LitestreamCmd.Process.Kill(); err != nil { + return fmt.Errorf("kill litestream: %w", err) + } + + db.LitestreamCmd.Wait() + time.Sleep(1 * time.Second) + + return nil +} + +func (db *TestDB) Restore(outputPath string) error { + replicaURL := db.ReplicaURL + if replicaURL == "" { + replicaURL = fmt.Sprintf("file://%s", filepath.ToSlash(db.ReplicaPath)) + } + var cmd *exec.Cmd + if db.ConfigPath != "" && (strings.HasPrefix(replicaURL, "s3://") || strings.HasPrefix(replicaURL, "abs://") || strings.HasPrefix(replicaURL, "nats://")) { + cmd = exec.Command(getBinaryPath("litestream"), "restore", + "-config", db.ConfigPath, + "-o", outputPath, + db.Path, + ) + } else { + cmd = exec.Command(getBinaryPath("litestream"), "restore", + "-o", outputPath, + replicaURL, + ) + } + cmd.Env = append(os.Environ(), db.ReplicaEnv...) 
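+ // ReplicaEnv carries any replica credentials (for example AWS keys for s3:// URLs) into the restore
+ // command without mutating the test process environment. For a plain file replica the command built
+ // above is roughly: litestream restore -o <outputPath> file:///path/to/replica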
+ output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("restore failed: %w\nOutput: %s", err, string(output)) + } + return nil +} + +func (db *TestDB) Validate(restoredPath string) error { + replicaURL := db.ReplicaURL + if replicaURL == "" { + replicaURL = fmt.Sprintf("file://%s", filepath.ToSlash(db.ReplicaPath)) + } + cmd := exec.Command(getBinaryPath("litestream-test"), "validate", + "-source-db", db.Path, + "-replica-url", replicaURL, + "-restored-db", restoredPath, + "-check-type", "full", + ) + cmd.Env = append(os.Environ(), db.ReplicaEnv...) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("validation failed: %w\nOutput: %s", err, string(output)) + } + return nil +} + +func (db *TestDB) QuickValidate(restoredPath string) error { + replicaURL := db.ReplicaURL + if replicaURL == "" { + replicaURL = fmt.Sprintf("file://%s", filepath.ToSlash(db.ReplicaPath)) + } + cmd := exec.Command(getBinaryPath("litestream-test"), "validate", + "-source-db", db.Path, + "-replica-url", replicaURL, + "-restored-db", restoredPath, + "-check-type", "quick", + ) + cmd.Env = append(os.Environ(), db.ReplicaEnv...) + output, err := cmd.CombinedOutput() + if err != nil { + return fmt.Errorf("validation failed: %w\nOutput: %s", err, string(output)) + } + return nil +} + +func (db *TestDB) GetRowCount(table string) (int, error) { + sqlDB, err := sql.Open("sqlite3", db.Path) + if err != nil { + return 0, fmt.Errorf("open database: %w", err) + } + defer sqlDB.Close() + + var count int + query := fmt.Sprintf("SELECT COUNT(*) FROM %s", table) + if err := sqlDB.QueryRow(query).Scan(&count); err != nil { + return 0, fmt.Errorf("query count: %w", err) + } + + return count, nil +} + +func (db *TestDB) GetDatabaseSize() (int64, error) { + info, err := os.Stat(db.Path) + if err != nil { + return 0, err + } + + size := info.Size() + + walPath := db.Path + "-wal" + if walInfo, err := os.Stat(walPath); err == nil { + size += walInfo.Size() + } + + return size, nil +} + +func (db *TestDB) GetReplicaFileCount() (int, error) { + ltxPath := filepath.Join(db.ReplicaPath, "ltx", "0") + files, err := filepath.Glob(filepath.Join(ltxPath, "*.ltx")) + if err != nil { + return 0, err + } + return len(files), nil +} + +func (db *TestDB) GetLitestreamLog() (string, error) { + logPath := filepath.Join(db.TempDir, "litestream.log") + content, err := os.ReadFile(logPath) + if err != nil { + return "", err + } + return string(content), nil +} + +func (db *TestDB) CheckForErrors() ([]string, error) { + log, err := db.GetLitestreamLog() + if err != nil { + return nil, err + } + + var errors []string + lines := strings.Split(log, "\n") + for _, line := range lines { + if strings.Contains(strings.ToUpper(line), "ERROR") { + errors = append(errors, line) + } + } + + return errors, nil +} + +func (db *TestDB) Cleanup() { + db.StopLitestream() +} + +// WaitForSnapshots waits for snapshots & WAL segments to appear on file replicas. 
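+// Snapshots are expected under <replica>/ltx/<SnapshotLevel>/*.ltx and WAL segments under
+// <replica>/ltx/0/*.ltx; the loop below polls both directories every 500ms until the timeout elapses.
+// Non-file replica URLs (e.g. s3://) are skipped, since their contents are not on local disk.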
+func (db *TestDB) WaitForSnapshots(timeout time.Duration) error { + if !strings.HasPrefix(db.ReplicaURL, "file://") { + return nil + } + + snapshotDir := filepath.Join(db.ReplicaPath, "ltx", fmt.Sprintf("%d", litestream.SnapshotLevel)) + walDir := filepath.Join(db.ReplicaPath, "ltx", "0") + + deadline := time.Now().Add(timeout) + for { + snapshotCount := countLTXFiles(snapshotDir) + walCount := countLTXFiles(walDir) + + if snapshotCount > 0 && walCount > 0 { + return nil + } + + if time.Now().After(deadline) { + return fmt.Errorf("timeout waiting for replica data: snapshots=%d wal=%d", snapshotCount, walCount) + } + + time.Sleep(500 * time.Millisecond) + } +} + +func countLTXFiles(dir string) int { + matches, err := filepath.Glob(filepath.Join(dir, "*.ltx")) + if err != nil { + return 0 + } + return len(matches) +} + +func GetTestDuration(t *testing.T, defaultDuration time.Duration) time.Duration { + t.Helper() + + if testing.Short() { + return defaultDuration / 10 + } + + return defaultDuration +} + +func RequireBinaries(t *testing.T) { + t.Helper() + + litestreamBin := getBinaryPath("litestream") + if _, err := os.Stat(litestreamBin); err != nil { + t.Skip("litestream binary not found, run: go build -o bin/litestream ./cmd/litestream") + } + + litestreamTestBin := getBinaryPath("litestream-test") + if _, err := os.Stat(litestreamTestBin); err != nil { + t.Skip("litestream-test binary not found, run: go build -o bin/litestream-test ./cmd/litestream-test") + } +} + +func CreateTestTable(t *testing.T, dbPath string) error { + t.Helper() + + sqlDB, err := sql.Open("sqlite3", dbPath) + if err != nil { + return err + } + defer sqlDB.Close() + + _, err = sqlDB.Exec(` + CREATE TABLE IF NOT EXISTS test_data ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + data TEXT, + created_at INTEGER + ) + `) + return err +} + +func InsertTestData(t *testing.T, dbPath string, count int) error { + t.Helper() + + sqlDB, err := sql.Open("sqlite3", dbPath) + if err != nil { + return err + } + defer sqlDB.Close() + + tx, err := sqlDB.Begin() + if err != nil { + return err + } + defer tx.Rollback() + + stmt, err := tx.Prepare("INSERT INTO test_data (data, created_at) VALUES (?, ?)") + if err != nil { + return err + } + defer stmt.Close() + + for i := 0; i < count; i++ { + if _, err := stmt.Exec(fmt.Sprintf("test data %d", i), time.Now().Unix()); err != nil { + return err + } + } + + return tx.Commit() +} + +// IntegrityCheck runs PRAGMA integrity_check on the database. 
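+// Only the first result row is scanned: a healthy database yields a single "ok" row, while a corrupt
+// one yields one row per detected problem, so any non-"ok" first row is reported as a failure.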
+func (db *TestDB) IntegrityCheck() error { + sqlDB, err := sql.Open("sqlite3", db.Path) + if err != nil { + return err + } + defer sqlDB.Close() + + var result string + if err := sqlDB.QueryRow("PRAGMA integrity_check").Scan(&result); err != nil { + return err + } + if result != "ok" { + return fmt.Errorf("integrity check failed: %s", result) + } + return nil +} + +// PrintTestSummary prints a summary of the test results +func (db *TestDB) PrintTestSummary(t *testing.T, testName string, startTime time.Time) { + t.Helper() + + duration := time.Since(startTime) + dbSize, _ := db.GetDatabaseSize() + fileCount, _ := db.GetReplicaFileCount() + errors, _ := db.CheckForErrors() + + t.Log("\n" + strings.Repeat("=", 80)) + t.Logf("TEST SUMMARY: %s", testName) + t.Log(strings.Repeat("=", 80)) + t.Logf("Duration: %v", duration.Round(time.Second)) + t.Logf("Database Size: %.2f MB", float64(dbSize)/(1024*1024)) + t.Logf("Replica Files: %d LTX files", fileCount) + t.Logf("Litestream Errors: %d", len(errors)) + t.Log(strings.Repeat("=", 80)) +} diff --git a/tests/integration/minio_soak_test.go b/tests/integration/minio_soak_test.go new file mode 100644 index 000000000..4b0b81195 --- /dev/null +++ b/tests/integration/minio_soak_test.go @@ -0,0 +1,368 @@ +//go:build integration && soak && docker + +package integration + +import ( + "context" + "database/sql" + "fmt" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +// TestMinIOSoak runs a soak test against local MinIO S3-compatible server using Docker. +// +// Default duration: 2 hours +// Can be shortened with: go test -test.short (runs for 30 minutes) +// +// Requirements: +// - Docker must be running +// - docker command must be in PATH +// +// This test validates: +// - S3-compatible replication to MinIO +// - Docker container lifecycle management +// - Heavy sustained load (500 writes/sec) +// - Restoration from S3-compatible storage +func TestMinIOSoak(t *testing.T) { + RequireBinaries(t) + RequireDocker(t) + + // Determine test duration + duration := GetTestDuration(t, 2*time.Hour) + shortMode := testing.Short() + if shortMode { + duration = 2 * time.Minute + } + + targetSize := "50MB" + writeRate := 500 + if shortMode { + targetSize = "5MB" + writeRate = 100 + } + + t.Logf("================================================") + t.Logf("Litestream MinIO S3 Soak Test") + t.Logf("================================================") + t.Logf("Duration: %v", duration) + t.Logf("Start time: %s", time.Now().Format(time.RFC3339)) + t.Log("") + + startTime := time.Now() + + // Start MinIO container + t.Log("Starting MinIO container...") + containerID, endpoint, dataVolume := StartMinIOContainer(t) + defer StopMinIOContainer(t, containerID, dataVolume) + t.Logf("✓ MinIO running at: %s", endpoint) + t.Log("") + + // Create MinIO bucket + bucket := "litestream-test" + CreateMinIOBucket(t, containerID, bucket) + t.Log("") + + // Setup test database + db := SetupTestDB(t, "minio-soak") + defer db.Cleanup() + + // Create database + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + // Populate with initial data + t.Logf("Populating database (%s initial data)...", targetSize) + if err := db.Populate(targetSize); err != nil { + t.Fatalf("Failed to populate database: %v", err) + } + t.Log("✓ Database populated") + t.Log("") + + // Create S3 configuration for MinIO + s3Path := fmt.Sprintf("litestream-test-%d", time.Now().Unix()) + s3URL := fmt.Sprintf("s3://%s/%s", bucket, 
s3Path) + db.ReplicaURL = s3URL + t.Log("Creating Litestream configuration for MinIO S3...") + s3Config := &S3Config{ + Endpoint: endpoint, + AccessKey: "minioadmin", + SecretKey: "minioadmin", + Region: "us-east-1", + ForcePathStyle: true, + SkipVerify: true, + } + configPath := CreateSoakConfig(db.Path, s3URL, s3Config, shortMode) + db.ConfigPath = configPath + t.Logf("✓ Configuration created: %s", configPath) + t.Logf(" S3 URL: %s", s3URL) + t.Log("") + + // Start Litestream + t.Log("Starting Litestream with MinIO backend...") + if err := db.StartLitestreamWithConfig(configPath); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + t.Logf("✓ Litestream running (PID: %d)", db.LitestreamPID) + t.Log("") + + // Start load generator + t.Log("Starting load generator (heavy sustained load)...") + t.Logf(" Write rate: %d writes/second", writeRate) + t.Logf(" Pattern: wave (simulates varying load)") + t.Logf(" Payload size: 4KB") + t.Logf(" Workers: 8") + t.Log("") + + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + + // Setup signal handler for graceful interruption + testInfo := &TestInfo{ + StartTime: startTime, + Duration: duration, + DB: db, + cancel: cancel, + } + setupSignalHandler(t, cancel, testInfo) + + // Run load generation in background + loadDone := make(chan error, 1) + go func() { + loadDone <- db.GenerateLoad(ctx, writeRate, duration, "wave") + }() + + // Monitor every 60 seconds with MinIO-specific metrics + t.Log("Running MinIO S3 test...") + t.Log("Monitor will report every 60 seconds") + t.Log("Press Ctrl+C twice within 5 seconds to stop early") + t.Log("================================================") + t.Log("") + + refreshStats := func() { + testInfo.RowCount, _ = db.GetRowCount("load_test") + if testInfo.RowCount == 0 { + testInfo.RowCount, _ = db.GetRowCount("test_table_0") + } + if testInfo.RowCount == 0 { + testInfo.RowCount, _ = db.GetRowCount("test_data") + } + testInfo.FileCount = CountMinIOObjects(t, containerID, bucket) + } + + logMetrics := func() { + logMinIOMetrics(t, db, containerID, bucket) + if db.LitestreamCmd != nil && db.LitestreamCmd.ProcessState != nil { + t.Error("✗ Litestream stopped unexpectedly!") + if testInfo.cancel != nil { + testInfo.cancel() + } + } + } + + MonitorSoakTest(t, db, ctx, testInfo, refreshStats, logMetrics) + + // Wait for load generation to complete + if err := <-loadDone; err != nil { + t.Logf("Load generation completed: %v", err) + } + + if err := db.WaitForSnapshots(30 * time.Second); err != nil { + t.Fatalf("Failed waiting for snapshot: %v", err) + } + + t.Log("") + t.Log("================================================") + t.Log("Final Test Results") + t.Log("================================================") + t.Log("") + + // Stop Litestream + t.Log("Stopping Litestream...") + if err := db.StopLitestream(); err != nil { + t.Logf("Warning: Failed to stop Litestream cleanly: %v", err) + } + + // Final statistics + t.Log("Database Statistics:") + if dbSize, err := db.GetDatabaseSize(); err == nil { + t.Logf(" Final size: %.2f MB", float64(dbSize)/(1024*1024)) + } + + // Count rows + var rowCount int + var err error + if rowCount, err = db.GetRowCount("load_test"); err != nil { + if rowCount, err = db.GetRowCount("test_table_0"); err != nil { + if rowCount, err = db.GetRowCount("test_data"); err != nil { + t.Logf(" Warning: Could not get row count: %v", err) + } + } + } + if err == nil { + t.Logf(" Total rows: %d", rowCount) + } + t.Log("") + + // MinIO statistics + 
t.Log("MinIO S3 Statistics:") + finalObjects := CountMinIOObjects(t, containerID, bucket) + t.Logf(" Total objects in MinIO: %d", finalObjects) + t.Log("") + + // Check for errors + errors, _ := db.CheckForErrors() + criticalErrors := 0 + for _, errLine := range errors { + if !containsAny(errLine, []string{"page size not initialized"}) { + criticalErrors++ + } + } + t.Logf(" Critical errors: %d", criticalErrors) + t.Log("") + + // Test restoration from MinIO + t.Log("Testing restoration from MinIO S3...") + restoredPath := filepath.Join(db.TempDir, "restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restoration from MinIO failed: %v", err) + } + t.Log("✓ Restoration successful!") + + // Compare row counts + var restoredCount int + if restoredCount, err = getRowCountFromPath(restoredPath, "load_test"); err != nil { + if restoredCount, err = getRowCountFromPath(restoredPath, "test_table_0"); err != nil { + if restoredCount, err = getRowCountFromPath(restoredPath, "test_data"); err != nil { + t.Logf(" Warning: Could not get restored row count: %v", err) + } + } + } + if err == nil && rowCount > 0 { + if rowCount == restoredCount { + t.Logf("✓ Row counts match! (%d rows)", restoredCount) + } else { + t.Logf("⚠ Row count mismatch! Original: %d, Restored: %d", rowCount, restoredCount) + } + } + + // Validate integrity + t.Log("") + t.Log("Validating restored database integrity...") + restoredDB := &TestDB{Path: restoredPath, t: t} + if err := restoredDB.IntegrityCheck(); err != nil { + t.Fatalf("Integrity check failed: %v", err) + } + t.Log("✓ Integrity check passed!") + + // Analyze test results + analysis := AnalyzeSoakTest(t, db, duration) + PrintSoakTestAnalysis(t, analysis) + + // Test Summary + t.Log("================================================") + t.Log("Test Summary") + t.Log("================================================") + + testPassed := true + issues := []string{} + + if criticalErrors > 0 { + testPassed = false + issues = append(issues, fmt.Sprintf("Critical errors detected: %d", criticalErrors)) + } + + if finalObjects == 0 { + testPassed = false + issues = append(issues, "No objects stored in MinIO") + } + + if testPassed { + t.Log("✓ TEST PASSED!") + t.Log("") + t.Logf("Successfully replicated to MinIO (%d objects)", finalObjects) + t.Log("The configuration is ready for production use.") + } else { + t.Log("⚠ TEST COMPLETED WITH ISSUES:") + for _, issue := range issues { + t.Logf(" - %s", issue) + } + t.Log("") + t.Log("Review the logs for details:") + logPath, _ := db.GetLitestreamLog() + t.Logf(" %s", logPath) + t.Fail() + } + + t.Log("") + t.Logf("Test duration: %v", time.Since(startTime).Round(time.Second)) + t.Logf("Results available in: %s", db.TempDir) + t.Log("================================================") +} + +// logMinIOMetrics logs MinIO-specific metrics +func logMinIOMetrics(t *testing.T, db *TestDB, containerID, bucket string) { + t.Helper() + + // Basic database metrics + LogSoakMetrics(t, db, "minio") + + // MinIO-specific metrics + t.Log("") + t.Log(" MinIO S3 Statistics:") + + objectCount := CountMinIOObjects(t, containerID, bucket) + t.Logf(" Total objects: %d", objectCount) + + // Count LTX files specifically + ltxCount := countMinIOLTXFiles(t, containerID, bucket) + t.Logf(" LTX segments: %d", ltxCount) +} + +// countMinIOLTXFiles counts LTX files in MinIO bucket +func countMinIOLTXFiles(t *testing.T, containerID, bucket string) int { + t.Helper() + + cmd := exec.Command("docker", "run", "--rm", + "--link", 
containerID+":minio", + "-e", "MC_HOST_minio=http://minioadmin:minioadmin@minio:9000", + "minio/mc", "ls", "minio/"+bucket+"/", "--recursive") + + output, err := cmd.CombinedOutput() + if err != nil { + return 0 + } + + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + ltxCount := 0 + for _, line := range lines { + if strings.Contains(line, ".ltx") { + ltxCount++ + } + } + + return ltxCount +} + +// getRowCountFromPath gets row count from a database file path +func getRowCountFromPath(dbPath, table string) (int, error) { + db, err := sql.Open("sqlite3", dbPath) + if err != nil { + return 0, err + } + defer db.Close() + + var count int + query := fmt.Sprintf("SELECT COUNT(*) FROM %s", table) + if err := db.QueryRow(query).Scan(&count); err != nil { + return 0, err + } + + return count, nil +} diff --git a/tests/integration/overnight_s3_soak_test.go b/tests/integration/overnight_s3_soak_test.go new file mode 100644 index 000000000..bdd3bd156 --- /dev/null +++ b/tests/integration/overnight_s3_soak_test.go @@ -0,0 +1,353 @@ +//go:build integration && soak && aws + +package integration + +import ( + "context" + "database/sql" + "fmt" + "path/filepath" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +// TestOvernightS3Soak runs an 8-hour overnight soak test against real AWS S3. +// +// Default duration: 8 hours +// Can be shortened with: go test -test.short (runs for 1 hour) +// +// Requirements: +// - AWS_ACCESS_KEY_ID environment variable +// - AWS_SECRET_ACCESS_KEY environment variable +// - S3_BUCKET environment variable +// - AWS_REGION environment variable (optional, defaults to us-east-1) +// - AWS CLI must be installed +// +// This test validates: +// - Long-term S3 replication stability +// - Network resilience over 8 hours +// - Real S3 API performance +// - Restoration from cloud storage +func TestOvernightS3Soak(t *testing.T) { + RequireBinaries(t) + + // Check AWS credentials and get configuration + bucket, region := CheckAWSCredentials(t) + + // Determine test duration + var duration time.Duration + if testing.Short() { + duration = 10 * time.Minute + } else { + duration = 8 * time.Hour + } + + shortMode := testing.Short() + + t.Logf("================================================") + t.Logf("Litestream Overnight S3 Soak Test") + t.Logf("================================================") + t.Logf("Duration: %v", duration) + t.Logf("S3 Bucket: %s", bucket) + t.Logf("AWS Region: %s", region) + t.Logf("Start time: %s", time.Now().Format(time.RFC3339)) + t.Log("") + + startTime := time.Now() + + // Test S3 connectivity + t.Log("Testing S3 connectivity...") + TestS3Connectivity(t, bucket) + t.Log("") + + // Setup test database + db := SetupTestDB(t, "overnight-s3-soak") + defer db.Cleanup() + + // Create database + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + // Create S3 configuration + s3Path := fmt.Sprintf("litestream-overnight-%d", time.Now().Unix()) + s3URL := fmt.Sprintf("s3://%s/%s", bucket, s3Path) + db.ReplicaURL = s3URL + t.Log("Creating Litestream configuration for S3...") + s3Config := &S3Config{ + Region: region, + } + configPath := CreateSoakConfig(db.Path, s3URL, s3Config, shortMode) + db.ConfigPath = configPath + t.Logf("✓ Configuration created: %s", configPath) + t.Logf(" S3 URL: %s", s3URL) + t.Log("") + + // Start Litestream initially (before population) + t.Log("Starting Litestream...") + if err := db.StartLitestreamWithConfig(configPath); err != nil { + t.Fatalf("Failed to start 
Litestream: %v", err) + } + t.Logf("✓ Litestream started (PID: %d)", db.LitestreamPID) + t.Log("") + + // Stop Litestream to populate database + t.Log("Stopping Litestream temporarily for initial population...") + if err := db.StopLitestream(); err != nil { + t.Fatalf("Failed to stop Litestream: %v", err) + } + + // Populate with 100MB of initial data + t.Log("Populating database (100MB initial data)...") + if err := db.Populate("100MB"); err != nil { + t.Fatalf("Failed to populate database: %v", err) + } + t.Log("✓ Database populated") + t.Log("") + + // Restart Litestream after population + t.Log("Restarting Litestream after population...") + if err := db.StartLitestreamWithConfig(configPath); err != nil { + t.Fatalf("Failed to restart Litestream: %v", err) + } + t.Logf("✓ Litestream restarted (PID: %d)", db.LitestreamPID) + t.Log("") + + // Start load generator for overnight test + t.Log("Starting load generator for overnight S3 test...") + t.Log("Configuration:") + t.Logf(" Duration: %v", duration) + t.Logf(" Write rate: 100 writes/second (higher for S3 testing)") + t.Logf(" Pattern: wave (simulates varying load)") + t.Logf(" Workers: 8") + t.Log("") + + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + + // Setup signal handler for graceful interruption + testInfo := &TestInfo{ + StartTime: startTime, + Duration: duration, + DB: db, + cancel: cancel, + } + setupSignalHandler(t, cancel, testInfo) + + // Run load generation in background + loadDone := make(chan error, 1) + go func() { + loadDone <- db.GenerateLoad(ctx, 100, duration, "wave") + }() + + // Monitor every 60 seconds with S3-specific metrics + t.Log("Overnight S3 test is running!") + t.Log("Monitor will report every 60 seconds") + t.Log("Press Ctrl+C twice within 5 seconds to stop early") + t.Log("================================================") + t.Log("") + t.Logf("The test will run for %v. 
Monitor progress below.", duration) + t.Log("") + + refreshStats := func() { + testInfo.RowCount, _ = db.GetRowCount("load_test") + if testInfo.RowCount == 0 { + testInfo.RowCount, _ = db.GetRowCount("test_table_0") + } + if testInfo.RowCount == 0 { + testInfo.RowCount, _ = db.GetRowCount("test_data") + } + testInfo.FileCount = CountS3Objects(t, s3URL) + } + + logMetrics := func() { + logS3Metrics(t, db, s3URL) + if db.LitestreamCmd != nil && db.LitestreamCmd.ProcessState != nil { + t.Error("✗ Litestream stopped unexpectedly!") + if testInfo.cancel != nil { + testInfo.cancel() + } + } + } + + MonitorSoakTest(t, db, ctx, testInfo, refreshStats, logMetrics) + + // Wait for load generation to complete + if err := <-loadDone; err != nil { + t.Logf("Load generation completed: %v", err) + } + + t.Log("") + t.Log("Load generation completed.") + + // Final statistics + t.Log("") + t.Log("================================================") + t.Log("Final Statistics") + t.Log("================================================") + t.Log("") + + // Stop Litestream + t.Log("Stopping Litestream...") + if err := db.StopLitestream(); err != nil { + t.Logf("Warning: Failed to stop Litestream cleanly: %v", err) + } + + // Database statistics + t.Log("Database Statistics:") + if dbSize, err := db.GetDatabaseSize(); err == nil { + t.Logf(" Final size: %.2f MB", float64(dbSize)/(1024*1024)) + } + + // Count rows + var rowCount int + var err error + if rowCount, err = db.GetRowCount("load_test"); err != nil { + if rowCount, err = db.GetRowCount("test_table_0"); err != nil { + if rowCount, err = db.GetRowCount("test_data"); err != nil { + t.Logf(" Warning: Could not get row count: %v", err) + } + } + } + if err == nil { + t.Logf(" Total rows: %d", rowCount) + } + t.Log("") + + // S3 statistics + t.Log("S3 Statistics:") + finalObjects := CountS3Objects(t, s3URL) + t.Logf(" Total objects: %d", finalObjects) + + if s3Size := GetS3StorageSize(t, s3URL); s3Size > 0 { + t.Logf(" Total S3 storage: %.2f MB", float64(s3Size)/(1024*1024)) + } + t.Log("") + + // Check for errors + errors, _ := db.CheckForErrors() + criticalErrors := 0 + for _, errLine := range errors { + if !containsAny(errLine, []string{"page size not initialized"}) { + criticalErrors++ + } + } + t.Logf(" Critical errors: %d", criticalErrors) + t.Log("") + + // Test restoration from S3 + t.Log("Testing restoration from S3...") + restoredPath := filepath.Join(db.TempDir, "restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restoration from S3 failed: %v", err) + } + t.Log("✓ Restoration successful!") + + // Compare row counts + var restoredCount int + if restoredCount, err = getRowCountFromPath(restoredPath, "load_test"); err != nil { + if restoredCount, err = getRowCountFromPath(restoredPath, "test_table_0"); err != nil { + if restoredCount, err = getRowCountFromPath(restoredPath, "test_data"); err != nil { + t.Logf(" Warning: Could not get restored row count: %v", err) + } + } + } + if err == nil && rowCount > 0 { + if rowCount == restoredCount { + t.Logf("✓ Row counts match! (%d rows)", restoredCount) + } else { + t.Logf("⚠ Row count mismatch! 
Original: %d, Restored: %d", rowCount, restoredCount) + } + } + + // Validate + t.Log("") + t.Log("Validating restored database...") + if err := db.Validate(restoredPath); err != nil { + t.Fatalf("Validation failed: %v", err) + } + t.Log("✓ Validation passed!") + + // Analyze test results + analysis := AnalyzeSoakTest(t, db, duration) + PrintSoakTestAnalysis(t, analysis) + + // Test Summary + t.Log("================================================") + t.Log("Test Summary") + t.Log("================================================") + + testPassed := true + issues := []string{} + + if criticalErrors > 0 { + testPassed = false + issues = append(issues, fmt.Sprintf("Critical errors detected: %d", criticalErrors)) + } + + if finalObjects == 0 { + testPassed = false + issues = append(issues, "No objects stored in S3") + } + + if testPassed { + t.Log("✓ TEST PASSED!") + t.Log("") + t.Logf("Successfully replicated to AWS S3 (%d objects)", finalObjects) + t.Log("The configuration is ready for production use.") + } else { + t.Log("⚠ TEST COMPLETED WITH ISSUES:") + for _, issue := range issues { + t.Logf(" - %s", issue) + } + t.Log("") + t.Log("Review the logs for details:") + logPath, _ := db.GetLitestreamLog() + t.Logf(" %s", logPath) + t.Fail() + } + + t.Log("") + t.Logf("Test duration: %v", time.Since(startTime).Round(time.Second)) + t.Logf("Results available in: %s", db.TempDir) + t.Logf("S3 replica data in: %s", s3URL) + t.Log("================================================") +} + +// logS3Metrics logs S3-specific metrics +func logS3Metrics(t *testing.T, db *TestDB, s3URL string) { + t.Helper() + + // Basic database metrics + LogSoakMetrics(t, db, "overnight-s3") + + // S3-specific metrics + t.Log("") + t.Log(" S3 Statistics:") + + objectCount := CountS3Objects(t, s3URL) + t.Logf(" Total objects: %d", objectCount) + + if s3Size := GetS3StorageSize(t, s3URL); s3Size > 0 { + t.Logf(" Total storage: %.2f MB", float64(s3Size)/(1024*1024)) + } +} + +// getRowCountFromPath gets row count from a database file path +func getRowCountFromPath(dbPath, table string) (int, error) { + db, err := sql.Open("sqlite3", dbPath) + if err != nil { + return 0, err + } + defer db.Close() + + var count int + query := fmt.Sprintf("SELECT COUNT(*) FROM %s", table) + if err := db.QueryRow(query).Scan(&count); err != nil { + return 0, err + } + + return count, nil +} diff --git a/tests/integration/overnight_test.go b/tests/integration/overnight_test.go new file mode 100644 index 000000000..fbc576afc --- /dev/null +++ b/tests/integration/overnight_test.go @@ -0,0 +1,215 @@ +//go:build integration && long + +package integration + +import ( + "context" + "path/filepath" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +func TestOvernightFile(t *testing.T) { + if testing.Short() { + t.Skip("skipping long integration test in short mode") + } + + RequireBinaries(t) + + startTime := time.Now() + duration := GetTestDuration(t, 8*time.Hour) + t.Logf("Testing: Overnight file-based replication (duration: %v)", duration) + t.Log("Default: 8 hours, configurable via test duration") + + db := SetupTestDB(t, "overnight-file") + defer db.Cleanup() + defer db.PrintTestSummary(t, "Overnight File Replication", startTime) + + t.Log("[1] Creating and populating database...") + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + if err := db.Populate("100MB"); err != nil { + t.Fatalf("Failed to populate database: %v", err) + } + + t.Log("✓ Database populated to 100MB") + + t.Log("[2] 
Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(10 * time.Second) + + t.Log("[3] Generating sustained load...") + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + + config := DefaultLoadConfig() + config.WriteRate = 50 + config.Duration = duration + config.Pattern = LoadPatternWave + config.PayloadSize = 2 * 1024 + config.Workers = 4 + + ticker := time.NewTicker(1 * time.Minute) + defer ticker.Stop() + + go func() { + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + fileCount, _ := db.GetReplicaFileCount() + dbSize, _ := db.GetDatabaseSize() + t.Logf("[Progress] Files: %d, DB Size: %.2f MB, Elapsed: %v", + fileCount, float64(dbSize)/(1024*1024), time.Since(time.Now().Add(-duration))) + } + } + }() + + if err := db.GenerateLoad(ctx, config.WriteRate, config.Duration, string(config.Pattern)); err != nil && ctx.Err() == nil { + t.Fatalf("Load generation failed: %v", err) + } + + t.Log("✓ Load generation complete") + + time.Sleep(1 * time.Minute) + + t.Log("[4] Final statistics...") + fileCount, err := db.GetReplicaFileCount() + if err != nil { + t.Fatalf("Failed to check replica: %v", err) + } + + dbSize, err := db.GetDatabaseSize() + if err != nil { + t.Fatalf("Failed to get database size: %v", err) + } + + t.Logf("Final LTX files: %d", fileCount) + t.Logf("Final DB size: %.2f MB", float64(dbSize)/(1024*1024)) + + t.Log("[5] Checking for errors...") + errors, err := db.CheckForErrors() + if err != nil { + t.Fatalf("Failed to check errors: %v", err) + } + + if len(errors) > 20 { + t.Fatalf("Too many errors (%d), test may be unstable", len(errors)) + } else if len(errors) > 0 { + t.Logf("Found %d errors (acceptable for long test)", len(errors)) + } else { + t.Log("✓ No errors detected") + } + + db.StopLitestream() + time.Sleep(2 * time.Second) + + t.Log("[6] Testing final restore...") + restoredPath := filepath.Join(db.TempDir, "overnight-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + t.Log("[7] Full validation...") + if err := db.Validate(restoredPath); err != nil { + t.Fatalf("Validation failed: %v", err) + } + + t.Log("✓ Validation passed") + t.Log("TEST PASSED: Overnight file replication successful") +} + +func TestOvernightComprehensive(t *testing.T) { + if testing.Short() { + t.Skip("skipping long integration test in short mode") + } + + RequireBinaries(t) + + startTime := time.Now() + duration := GetTestDuration(t, 8*time.Hour) + t.Logf("Testing: Comprehensive overnight test (duration: %v)", duration) + + db := SetupTestDB(t, "overnight-comprehensive") + defer db.Cleanup() + defer db.PrintTestSummary(t, "Overnight Comprehensive Test", startTime) + + t.Log("[1] Creating large database...") + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + if err := db.Populate("500MB"); err != nil { + t.Fatalf("Failed to populate database: %v", err) + } + + t.Log("✓ Database populated to 500MB") + + t.Log("[2] Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(10 * time.Second) + + t.Log("[3] Generating mixed workload...") + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + + config := DefaultLoadConfig() + config.WriteRate = 100 + config.Duration = duration + config.Pattern = LoadPatternWave + 
config.PayloadSize = 4 * 1024 + config.ReadRatio = 0.3 + config.Workers = 8 + + ticker := time.NewTicker(5 * time.Minute) + defer ticker.Stop() + + go func() { + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + fileCount, _ := db.GetReplicaFileCount() + dbSize, _ := db.GetDatabaseSize() + t.Logf("[Progress] Files: %d, DB Size: %.2f MB", fileCount, float64(dbSize)/(1024*1024)) + } + } + }() + + if err := db.GenerateLoad(ctx, config.WriteRate, config.Duration, string(config.Pattern)); err != nil && ctx.Err() == nil { + t.Fatalf("Load generation failed: %v", err) + } + + t.Log("✓ Load generation complete") + + time.Sleep(2 * time.Minute) + + db.StopLitestream() + + t.Log("[4] Final validation...") + restoredPath := filepath.Join(db.TempDir, "comprehensive-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + if err := db.Validate(restoredPath); err != nil { + t.Fatalf("Validation failed: %v", err) + } + + t.Log("✓ Comprehensive test passed") + t.Log("TEST PASSED: Overnight comprehensive test successful") +} diff --git a/tests/integration/quick_test.go b/tests/integration/quick_test.go new file mode 100644 index 000000000..a6b7ea61d --- /dev/null +++ b/tests/integration/quick_test.go @@ -0,0 +1,121 @@ +//go:build integration + +package integration + +import ( + "context" + "path/filepath" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +func TestQuickValidation(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + startTime := time.Now() + duration := GetTestDuration(t, 30*time.Minute) + t.Logf("Testing: Quick validation test (duration: %v)", duration) + t.Log("Default: 30 minutes, configurable via test duration") + + db := SetupTestDB(t, "quick-validation") + defer db.Cleanup() + defer db.PrintTestSummary(t, "Quick Validation Test", startTime) + + t.Log("[1] Creating and populating database...") + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + if err := db.Populate("10MB"); err != nil { + t.Fatalf("Failed to populate database: %v", err) + } + + t.Log("✓ Database populated to 10MB") + + t.Log("[2] Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(5 * time.Second) + + t.Log("[3] Generating wave pattern load...") + ctx, cancel := context.WithTimeout(context.Background(), duration) + defer cancel() + + config := DefaultLoadConfig() + config.WriteRate = 100 + config.Duration = duration + config.Pattern = LoadPatternWave + config.PayloadSize = 4 * 1024 + config.Workers = 4 + + if err := db.GenerateLoad(ctx, config.WriteRate, config.Duration, string(config.Pattern)); err != nil && ctx.Err() == nil { + t.Fatalf("Load generation failed: %v", err) + } + + t.Log("✓ Load generation complete") + + time.Sleep(10 * time.Second) + + t.Log("[4] Checking replica status...") + fileCount, err := db.GetReplicaFileCount() + if err != nil { + t.Fatalf("Failed to check replica: %v", err) + } + + if fileCount == 0 { + t.Fatal("No LTX segments created!") + } + + t.Logf("✓ LTX segments created: %d files", fileCount) + + dbSize, err := db.GetDatabaseSize() + if err != nil { + t.Fatalf("Failed to get database size: %v", err) + } + + t.Logf("Database size: %.2f MB", float64(dbSize)/(1024*1024)) + + t.Log("[5] Checking for errors...") + errors, err := db.CheckForErrors() + if err != nil { + t.Fatalf("Failed to check errors: %v", err) 
+ } + + if len(errors) > 10 { + t.Fatalf("Too many critical errors (%d), showing first 5:\n%v", len(errors), errors[:5]) + } else if len(errors) > 0 { + t.Logf("Found %d errors (showing first 3):", len(errors)) + for i := 0; i < min(len(errors), 3); i++ { + t.Logf(" %s", errors[i]) + } + } else { + t.Log("✓ No errors detected") + } + + db.StopLitestream() + time.Sleep(2 * time.Second) + + t.Log("[6] Testing restore...") + restoredPath := filepath.Join(db.TempDir, "quick-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + t.Log("[7] Validating restoration...") + if err := db.QuickValidate(restoredPath); err != nil { + t.Fatalf("Validation failed: %v", err) + } + + t.Log("✓ Validation passed") + t.Log("TEST PASSED: Quick validation successful") +} diff --git a/tests/integration/scenario_test.go b/tests/integration/scenario_test.go new file mode 100644 index 000000000..42edf7450 --- /dev/null +++ b/tests/integration/scenario_test.go @@ -0,0 +1,342 @@ +//go:build integration + +package integration + +import ( + "database/sql" + "fmt" + "os" + "path/filepath" + "testing" + "time" + + _ "github.com/mattn/go-sqlite3" +) + +func TestFreshStart(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + t.Log("Testing: Starting replication with a fresh (empty) database") + t.Log("This tests if Litestream works correctly when it creates the database from scratch") + + db := SetupTestDB(t, "fresh-start") + defer db.Cleanup() + + t.Log("[1] Starting Litestream with non-existent database...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(2 * time.Second) + + t.Log("[2] Creating database while Litestream is running...") + sqlDB, err := sql.Open("sqlite3", db.Path) + if err != nil { + t.Fatalf("Failed to open database: %v", err) + } + + if _, err := sqlDB.Exec("PRAGMA journal_mode=WAL"); err != nil { + t.Fatalf("Failed to set WAL mode: %v", err) + } + + if _, err := sqlDB.Exec("CREATE TABLE test (id INTEGER PRIMARY KEY, data TEXT)"); err != nil { + t.Fatalf("Failed to create table: %v", err) + } + + if _, err := sqlDB.Exec("INSERT INTO test (data) VALUES ('initial data')"); err != nil { + t.Fatalf("Failed to insert initial data: %v", err) + } + sqlDB.Close() + + time.Sleep(3 * time.Second) + + t.Log("[3] Checking if Litestream detected the database...") + log, err := db.GetLitestreamLog() + if err != nil { + t.Fatalf("Failed to read log: %v", err) + } + + t.Logf("Litestream log snippet:\n%s", log[:min(len(log), 500)]) + + t.Log("[4] Adding data to test replication...") + sqlDB, err = sql.Open("sqlite3", db.Path) + if err != nil { + t.Fatalf("Failed to open database: %v", err) + } + + for i := 1; i <= 100; i++ { + if _, err := sqlDB.Exec("INSERT INTO test (data) VALUES (?)", fmt.Sprintf("row %d", i)); err != nil { + t.Fatalf("Failed to insert row %d: %v", i, err) + } + } + sqlDB.Close() + + time.Sleep(5 * time.Second) + + t.Log("[5] Checking for errors...") + errors, err := db.CheckForErrors() + if err != nil { + t.Fatalf("Failed to check errors: %v", err) + } + + if len(errors) > 1 { + t.Logf("Found %d errors (showing first 3):", len(errors)) + for i := 0; i < min(len(errors), 3); i++ { + t.Logf(" %s", errors[i]) + } + } else { + t.Log("✓ No significant errors") + } + + t.Log("[6] Checking replica files...") + fileCount, err := db.GetReplicaFileCount() + if err != nil { + 
t.Fatalf("Failed to get replica file count: %v", err) + } + + if fileCount == 0 { + t.Fatal("✗ No replica files created!") + } + + t.Logf("✓ Replica created with %d LTX files", fileCount) + + db.StopLitestream() + time.Sleep(2 * time.Second) + + t.Log("[7] Testing restore...") + restoredPath := filepath.Join(db.TempDir, "fresh-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("✗ Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + origCount, err := db.GetRowCount("test") + if err != nil { + t.Fatalf("Failed to get original row count: %v", err) + } + + restoredDB := &TestDB{Path: restoredPath, t: t} + restCount, err := restoredDB.GetRowCount("test") + if err != nil { + t.Fatalf("Failed to get restored row count: %v", err) + } + + if origCount != restCount { + t.Fatalf("✗ Data mismatch: Original=%d, Restored=%d", origCount, restCount) + } + + t.Logf("✓ Data integrity verified: %d rows", origCount) + t.Log("TEST PASSED: Fresh start works correctly") +} + +func TestDatabaseIntegrity(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + t.Log("Testing: Complex data patterns and integrity after restore") + + db := SetupTestDB(t, "integrity-test") + defer db.Cleanup() + + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + t.Log("[1] Creating complex schema...") + sqlDB, err := sql.Open("sqlite3", db.Path) + if err != nil { + t.Fatalf("Failed to open database: %v", err) + } + defer sqlDB.Close() + + if err := CreateComplexTestSchema(sqlDB); err != nil { + t.Fatalf("Failed to create schema: %v", err) + } + + t.Log("✓ Schema created") + + t.Log("[2] Populating with test data...") + if err := PopulateComplexTestData(sqlDB, 10, 5, 3); err != nil { + t.Fatalf("Failed to populate data: %v", err) + } + + t.Log("✓ Data populated (10 users, 50 posts, 150 comments)") + + t.Log("[3] Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(10 * time.Second) + + db.StopLitestream() + time.Sleep(2 * time.Second) + + t.Log("[4] Checking integrity of original database...") + var integrityResult string + if err := sqlDB.QueryRow("PRAGMA integrity_check").Scan(&integrityResult); err != nil { + t.Fatalf("Integrity check failed: %v", err) + } + + if integrityResult != "ok" { + t.Fatalf("Source database integrity check failed: %s", integrityResult) + } + + t.Log("✓ Source database integrity OK") + + t.Log("[5] Restoring database...") + restoredPath := filepath.Join(db.TempDir, "integrity-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + t.Log("[6] Checking integrity of restored database...") + restoredDB, err := sql.Open("sqlite3", restoredPath) + if err != nil { + t.Fatalf("Failed to open restored database: %v", err) + } + defer restoredDB.Close() + + if err := restoredDB.QueryRow("PRAGMA integrity_check").Scan(&integrityResult); err != nil { + t.Fatalf("Restored integrity check failed: %v", err) + } + + if integrityResult != "ok" { + t.Fatalf("Restored database integrity check failed: %s", integrityResult) + } + + t.Log("✓ Restored database integrity OK") + + t.Log("[7] Validating data consistency...") + tables := []string{"users", "posts", "comments"} + for _, table := range tables { + var sourceCount, restoredCount int + + if err := sqlDB.QueryRow(fmt.Sprintf("SELECT COUNT(*) FROM %s", 
table)).Scan(&sourceCount); err != nil { + t.Fatalf("Failed to count source %s: %v", table, err) + } + + if err := restoredDB.QueryRow(fmt.Sprintf("SELECT COUNT(*) FROM %s", table)).Scan(&restoredCount); err != nil { + t.Fatalf("Failed to count restored %s: %v", table, err) + } + + if sourceCount != restoredCount { + t.Fatalf("Count mismatch for %s: source=%d, restored=%d", table, sourceCount, restoredCount) + } + + t.Logf("✓ Table %s: %d rows match", table, sourceCount) + } + + t.Log("TEST PASSED: Database integrity maintained through replication") +} + +func TestDatabaseDeletion(t *testing.T) { + if testing.Short() { + t.Skip("skipping integration test in short mode") + } + + RequireBinaries(t) + + t.Log("Testing: Database deletion during active replication") + + db := SetupTestDB(t, "deletion-test") + defer db.Cleanup() + + if err := db.Create(); err != nil { + t.Fatalf("Failed to create database: %v", err) + } + + t.Log("[1] Creating test table and data...") + if err := CreateTestTable(t, db.Path); err != nil { + t.Fatalf("Failed to create table: %v", err) + } + + if err := InsertTestData(t, db.Path, 100); err != nil { + t.Fatalf("Failed to insert test data: %v", err) + } + + t.Log("✓ Created table with 100 rows") + + t.Log("[2] Starting Litestream...") + if err := db.StartLitestream(); err != nil { + t.Fatalf("Failed to start Litestream: %v", err) + } + + time.Sleep(5 * time.Second) + + fileCount, _ := db.GetReplicaFileCount() + t.Logf("✓ Replication started (%d files)", fileCount) + + t.Log("[3] Deleting database files...") + os.Remove(db.Path) + os.Remove(db.Path + "-wal") + os.Remove(db.Path + "-shm") + + time.Sleep(3 * time.Second) + + t.Log("✓ Database deleted") + + t.Log("[4] Checking Litestream behavior...") + errors, err := db.CheckForErrors() + if err != nil { + t.Fatalf("Failed to check errors: %v", err) + } + + t.Logf("Litestream reported %d error messages (expected after database deletion)", len(errors)) + + db.StopLitestream() + + t.Log("[5] Verifying replica is still intact...") + finalFileCount, err := db.GetReplicaFileCount() + if err != nil { + t.Fatalf("Failed to check replica: %v", err) + } + + if finalFileCount == 0 { + t.Fatalf("Replica appears to be empty or missing") + } + + t.Logf("✓ Replica exists with %d files (was %d - compaction may have reduced count)", finalFileCount, fileCount) + + t.Log("[6] Testing restore from replica...") + restoredPath := filepath.Join(db.TempDir, "deletion-restored.db") + if err := db.Restore(restoredPath); err != nil { + t.Fatalf("Restore failed: %v", err) + } + + t.Log("✓ Restore successful") + + restoredDB := &TestDB{Path: restoredPath, t: t} + restCount, err := restoredDB.GetRowCount("test_data") + if err != nil { + t.Fatalf("Failed to get restored row count: %v", err) + } + + if restCount != 100 { + t.Fatalf("Expected 100 rows, got %d", restCount) + } + + t.Logf("✓ Restored database has correct data: %d rows", restCount) + t.Log("TEST PASSED: Replica survives source database deletion") +} + +// TestReplicaFailover was removed because Litestream no longer supports +// multiple replicas on a single database (see cmd/litestream/main.go). +// The bash script test-replica-failover.sh was also non-functional. 
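+
+// min is a small local helper for the tests above; on Go 1.21 or newer the built-in min could be used
+// instead.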
+ +func min(a, b int) int { + if a < b { + return a + } + return b +} diff --git a/tests/integration/soak_helpers.go b/tests/integration/soak_helpers.go new file mode 100644 index 000000000..ad6e625a0 --- /dev/null +++ b/tests/integration/soak_helpers.go @@ -0,0 +1,1150 @@ +//go:build integration && soak + +package integration + +import ( + "bufio" + "context" + "database/sql" + "fmt" + "os" + "os/exec" + "os/signal" + "path/filepath" + "strconv" + "strings" + "syscall" + "testing" + "time" +) + +// S3Config holds S3-specific configuration +type S3Config struct { + Endpoint string + AccessKey string + SecretKey string + Region string + ForcePathStyle bool + SkipVerify bool + SSE string + SSEKMSKeyID string +} + +// TestInfo holds test state for signal handler and monitoring +type TestInfo struct { + StartTime time.Time + Duration time.Duration + RowCount int + FileCount int + DB *TestDB + cancel context.CancelFunc +} + +// ErrorStats holds error categorization and counts +type ErrorStats struct { + TotalCount int + CriticalCount int + BenignCount int + RecentErrors []string + ErrorsByType map[string]int +} + +func isInteractive() bool { + if fi, err := os.Stdin.Stat(); err == nil { + return fi.Mode()&os.ModeCharDevice != 0 + } + return false +} + +func promptYesNo(t *testing.T, prompt string, defaultYes bool) bool { + t.Helper() + + switch strings.ToLower(strings.TrimSpace(os.Getenv("SOAK_AUTO_PURGE"))) { + case "y", "yes", "true", "1", "on": + t.Logf("%s yes (SOAK_AUTO_PURGE)", prompt) + return true + case "n", "no", "false", "0", "off": + t.Logf("%s no (SOAK_AUTO_PURGE)", prompt) + return false + } + + if !isInteractive() { + if defaultYes { + t.Logf("%s yes (non-interactive default)", prompt) + return true + } + t.Logf("%s no (non-interactive default)", prompt) + return false + } + + defPrompt := "[y/N]" + if defaultYes { + defPrompt = "[Y/n]" + } + + fmt.Printf("%s %s ", prompt, defPrompt) + reader := bufio.NewReader(os.Stdin) + text, err := reader.ReadString('\n') + if err != nil { + t.Logf("Failed to read response: %v (defaulting to no)", err) + return false + } + + switch strings.ToLower(strings.TrimSpace(text)) { + case "", "y", "yes": + if defaultYes || text != "" { + return true + } + return false + case "n", "no": + return false + default: + return defaultYes + } +} + +func promptYesNoDefaultNo(t *testing.T, prompt string) bool { + return promptYesNo(t, prompt, false) +} + +func promptYesNoDefaultYes(t *testing.T, prompt string) bool { + return promptYesNo(t, prompt, true) +} + +// RequireDocker checks if Docker is available +func RequireDocker(t *testing.T) { + t.Helper() + + cmd := exec.Command("docker", "version") + if err := cmd.Run(); err != nil { + t.Skip("Docker is not available, skipping test") + } +} + +// StartMinIOContainer starts a MinIO container and returns the container ID and endpoint +func StartMinIOContainer(t *testing.T) (containerID string, endpoint string, volumeName string) { + t.Helper() + + containerName := fmt.Sprintf("litestream-test-minio-%d", time.Now().Unix()) + volumeName = fmt.Sprintf("litestream-test-minio-data-%d", time.Now().Unix()) + minioPort := "9100" + consolePort := "9101" + + // Clean up any existing container + exec.Command("docker", "stop", containerName).Run() + exec.Command("docker", "rm", containerName).Run() + + // Remove any lingering volume with the same name, then create fresh volume. 
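+ // Creating an explicitly named volume (rather than an anonymous one) lets StopMinIOContainer delete
+ // it afterwards, so repeated soak runs do not leak Docker volumes on the host.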
+ exec.Command("docker", "volume", "rm", volumeName).Run() + if out, err := exec.Command("docker", "volume", "create", volumeName).CombinedOutput(); err != nil { + t.Fatalf("Failed to create MinIO volume: %v\nOutput: %s", err, string(out)) + } + + // Start MinIO container + cmd := exec.Command("docker", "run", "-d", + "--name", containerName, + "-p", minioPort+":9000", + "-p", consolePort+":9001", + "-v", volumeName+":/data", + "-e", "MINIO_ROOT_USER=minioadmin", + "-e", "MINIO_ROOT_PASSWORD=minioadmin", + "minio/minio", "server", "/data", "--console-address", ":9001") + + output, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("Failed to start MinIO container: %v\nOutput: %s", err, string(output)) + } + + containerID = strings.TrimSpace(string(output)) + endpoint = fmt.Sprintf("http://localhost:%s", minioPort) + + // Wait for MinIO to be ready + time.Sleep(5 * time.Second) + + // Verify container is running + cmd = exec.Command("docker", "ps", "-q", "-f", "name="+containerName) + output, err = cmd.CombinedOutput() + if err != nil || len(strings.TrimSpace(string(output))) == 0 { + t.Fatalf("MinIO container failed to start properly") + } + + t.Logf("MinIO container started: %s (endpoint: %s)", containerID[:12], endpoint) + + return containerID, endpoint, volumeName +} + +// StopMinIOContainer stops and removes a MinIO container +func StopMinIOContainer(t *testing.T, containerID string, volumeName string) { + t.Helper() + + if containerID == "" { + return + } + + t.Logf("Stopping MinIO container: %s", containerID[:12]) + + exec.Command("docker", "stop", containerID).Run() + exec.Command("docker", "rm", containerID).Run() + + if volumeName != "" { + exec.Command("docker", "volume", "rm", volumeName).Run() + } +} + +// CreateMinIOBucket creates a bucket in MinIO +func CreateMinIOBucket(t *testing.T, containerID, bucket string) { + t.Helper() + + if minioBucketExists(containerID, bucket) { + if promptYesNoDefaultYes(t, fmt.Sprintf("Bucket '%s' already exists. Purge existing objects before running soak test?", bucket)) { + t.Logf("Purging MinIO bucket '%s'...", bucket) + if err := clearMinIOBucket(containerID, bucket); err != nil { + t.Fatalf("Failed to purge MinIO bucket: %v", err) + } + } else { + t.Logf("Skipping purge of bucket '%s'. 
Residual data may cause replication errors.", bucket) + } + } + + // Use mc (MinIO Client) via docker to create bucket + cmd := exec.Command("docker", "run", "--rm", + "--link", containerID+":minio", + "-e", "MC_HOST_minio=http://minioadmin:minioadmin@minio:9000", + "minio/mc", "mb", "minio/"+bucket) + + _, stdoutBuf, stderrBuf := configureCmdIO(cmd) + if err := cmd.Run(); err != nil { + output := combinedOutput(stdoutBuf, stderrBuf) + if !strings.Contains(output, "already exists") { + t.Fatalf("Create bucket failed: %v Output: %s", err, output) + } + } + + if err := waitForMinIOBucket(containerID, bucket, 60*time.Second); err != nil { + t.Fatalf("Bucket %s not ready: %v", bucket, err) + } + + if err := clearMinIOBucket(containerID, bucket); err != nil { + t.Fatalf("Failed to purge MinIO bucket: %v", err) + } + + t.Logf("MinIO bucket '%s' ready", bucket) +} + +func minioBucketExists(containerID, bucket string) bool { + cmd := exec.Command("docker", "run", "--rm", + "--link", containerID+":minio", + "-e", "MC_HOST_minio=http://minioadmin:minioadmin@minio:9000", + "minio/mc", "ls", "minio/"+bucket+"/") + _, _, _ = configureCmdIO(cmd) + if err := cmd.Run(); err != nil { + return false + } + return true +} + +func clearMinIOBucket(containerID, bucket string) error { + cmd := exec.Command("docker", "run", "--rm", + "--link", containerID+":minio", + "-e", "MC_HOST_minio=http://minioadmin:minioadmin@minio:9000", + "minio/mc", "rm", "--recursive", "--force", "minio/"+bucket) + _, stdoutBuf, stderrBuf := configureCmdIO(cmd) + if err := cmd.Run(); err != nil { + output := combinedOutput(stdoutBuf, stderrBuf) + if output != "" { + return fmt.Errorf("%w: %s", err, output) + } + return err + } + return nil +} + +func waitForMinIOBucket(containerID, bucket string, timeout time.Duration) error { + deadline := time.Now().Add(timeout) + for { + if minioBucketExists(containerID, bucket) { + return nil + } + if time.Now().After(deadline) { + break + } + time.Sleep(500 * time.Millisecond) + } + return fmt.Errorf("bucket %s not available", bucket) +} + +// CountMinIOObjects counts objects in a MinIO bucket +func CountMinIOObjects(t *testing.T, containerID, bucket string) int { + t.Helper() + + cmd := exec.Command("docker", "run", "--rm", + "--link", containerID+":minio", + "-e", "MC_HOST_minio=http://minioadmin:minioadmin@minio:9000", + "minio/mc", "ls", "minio/"+bucket+"/", "--recursive") + + _, stdoutBuf, stderrBuf := configureCmdIO(cmd) + if err := cmd.Run(); err != nil { + return 0 + } + + output := combinedOutput(stdoutBuf, stderrBuf) + lines := strings.Split(strings.TrimSpace(output), "\n") + if len(lines) == 1 && lines[0] == "" { + return 0 + } + + return len(lines) +} + +// CheckAWSCredentials checks if AWS credentials are set and returns bucket and region +func CheckAWSCredentials(t *testing.T) (bucket, region string) { + t.Helper() + + accessKey := os.Getenv("AWS_ACCESS_KEY_ID") + secretKey := os.Getenv("AWS_SECRET_ACCESS_KEY") + bucket = os.Getenv("S3_BUCKET") + region = os.Getenv("AWS_REGION") + + if accessKey == "" || secretKey == "" || bucket == "" { + t.Skip("AWS credentials not set. 
Set AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and S3_BUCKET") + } + + if region == "" { + region = "us-east-1" + } + + t.Logf("Using AWS S3: bucket=%s, region=%s", bucket, region) + + return bucket, region +} + +// TestS3Connectivity tests if we can access the S3 bucket +func TestS3Connectivity(t *testing.T, bucket string) { + t.Helper() + + cmd := exec.Command("aws", "s3", "ls", "s3://"+bucket+"/") + if err := cmd.Run(); err != nil { + t.Fatalf("Failed to access S3 bucket '%s': %v\nEnsure AWS CLI is installed and credentials are valid", bucket, err) + } + + t.Logf("✓ S3 bucket '%s' is accessible", bucket) +} + +// CountS3Objects counts objects in an S3 path +func CountS3Objects(t *testing.T, s3URL string) int { + t.Helper() + + cmd := exec.Command("aws", "s3", "ls", s3URL+"/", "--recursive") + _, stdoutBuf, stderrBuf := configureCmdIO(cmd) + if err := cmd.Run(); err != nil { + return 0 + } + + output := combinedOutput(stdoutBuf, stderrBuf) + lines := strings.Split(strings.TrimSpace(output), "\n") + if len(lines) == 1 && lines[0] == "" { + return 0 + } + + return len(lines) +} + +// GetS3StorageSize gets the total storage size of an S3 path +func GetS3StorageSize(t *testing.T, s3URL string) int64 { + t.Helper() + + cmd := exec.Command("aws", "s3", "ls", s3URL+"/", "--recursive", "--summarize") + _, stdoutBuf, stderrBuf := configureCmdIO(cmd) + if err := cmd.Run(); err != nil { + return 0 + } + + output := combinedOutput(stdoutBuf, stderrBuf) + lines := strings.Split(output, "\n") + for _, line := range lines { + if strings.Contains(line, "Total Size:") { + var size int64 + fmt.Sscanf(line, "Total Size: %d", &size) + return size + } + } + + return 0 +} + +// CreateSoakConfig creates a litestream configuration file for soak tests +func CreateSoakConfig(dbPath, replicaURL string, s3Config *S3Config, shortMode bool) string { + tempDir := filepath.Dir(dbPath) + configPath := filepath.Join(tempDir, "litestream.yml") + + var config strings.Builder + + snapshotInterval := "10m" + snapshotRetention := "1h" + retentionCheckInterval := "5m" + levelIntervals := []string{"30s", "1m", "5m", "15m", "30m"} + + if shortMode { + snapshotInterval = "30s" + snapshotRetention = "10m" + retentionCheckInterval = "2m" + levelIntervals = []string{"15s", "30s", "1m"} + } + + // Add S3 credentials if provided + if s3Config != nil && s3Config.AccessKey != "" { + config.WriteString(fmt.Sprintf("access-key-id: %s\n", s3Config.AccessKey)) + config.WriteString(fmt.Sprintf("secret-access-key: %s\n", s3Config.SecretKey)) + config.WriteString("\n") + } + + // Aggressive snapshot settings for testing + config.WriteString("snapshot:\n") + config.WriteString(fmt.Sprintf(" interval: %s\n", snapshotInterval)) + config.WriteString(fmt.Sprintf(" retention: %s\n", snapshotRetention)) + config.WriteString("\n") + + // Aggressive compaction levels + config.WriteString("levels:\n") + for _, interval := range levelIntervals { + config.WriteString(fmt.Sprintf(" - interval: %s\n", interval)) + } + config.WriteString("\n") + + // Database configuration + config.WriteString("dbs:\n") + config.WriteString(fmt.Sprintf(" - path: %s\n", filepath.ToSlash(dbPath))) + config.WriteString(" checkpoint-interval: 1m\n") + config.WriteString(" min-checkpoint-page-count: 100\n") + config.WriteString(" max-checkpoint-page-count: 5000\n") + config.WriteString("\n") + config.WriteString(" replicas:\n") + config.WriteString(fmt.Sprintf(" - url: %s\n", replicaURL)) + + // Add S3-specific settings if provided + if s3Config != nil { + if s3Config.Endpoint != 
"" { + config.WriteString(fmt.Sprintf(" endpoint: %s\n", s3Config.Endpoint)) + } + if s3Config.Region != "" { + config.WriteString(fmt.Sprintf(" region: %s\n", s3Config.Region)) + } + if s3Config.ForcePathStyle { + config.WriteString(" force-path-style: true\n") + } + if s3Config.SkipVerify { + config.WriteString(" skip-verify: true\n") + } + if s3Config.SSE != "" { + config.WriteString(fmt.Sprintf(" sse: %s\n", s3Config.SSE)) + } + if s3Config.SSEKMSKeyID != "" { + config.WriteString(fmt.Sprintf(" sse-kms-key-id: %s\n", s3Config.SSEKMSKeyID)) + } + config.WriteString(fmt.Sprintf(" retention-check-interval: %s\n", retentionCheckInterval)) + } + + if err := os.WriteFile(configPath, []byte(config.String()), 0644); err != nil { + panic(fmt.Sprintf("Failed to create config file: %v", err)) + } + + return configPath +} + +// setupSignalHandler sets up SIGINT/SIGTERM handler with confirmation +func setupSignalHandler(t *testing.T, cancel context.CancelFunc, testInfo *TestInfo) { + t.Helper() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + go func() { + firstInterrupt := true + + for sig := range sigChan { + if firstInterrupt { + firstInterrupt = false + + t.Logf("") + t.Logf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + t.Logf("⚠ Interrupt signal received (%v)", sig) + t.Logf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + t.Logf("") + + elapsed := time.Since(testInfo.StartTime) + remaining := testInfo.Duration - elapsed + pct := float64(elapsed) / float64(testInfo.Duration) * 100 + + t.Logf("Test Progress:") + t.Logf(" Elapsed: %v (%.0f%% complete)", elapsed.Round(time.Second), pct) + t.Logf(" Remaining: %v", remaining.Round(time.Second)) + t.Logf(" Data collected: %d rows, %d replica files", testInfo.RowCount, testInfo.FileCount) + t.Logf("") + t.Logf("Press Ctrl+C again within 5 seconds to confirm shutdown.") + t.Logf("Otherwise, test will continue...") + t.Logf("") + + // Wait 5 seconds for second interrupt + timeout := time.NewTimer(5 * time.Second) + select { + case <-sigChan: + // Second interrupt - confirmed shutdown + timeout.Stop() + t.Logf("Shutdown confirmed. Initiating graceful cleanup...") + cancel() // Cancel context to stop test + performGracefulShutdown(t, testInfo) + return + + case <-timeout.C: + // Timeout - continue test + t.Logf("No confirmation received. Continuing test...") + t.Logf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + t.Logf("") + firstInterrupt = true + } + } else { + // Second interrupt received + t.Logf("Shutdown confirmed. 
Initiating graceful cleanup...") + cancel() + performGracefulShutdown(t, testInfo) + return + } + } + }() + + t.Cleanup(func() { + signal.Stop(sigChan) + close(sigChan) + }) +} + +// performGracefulShutdown performs cleanup on early termination +func performGracefulShutdown(t *testing.T, testInfo *TestInfo) { + t.Helper() + + if testInfo.cancel != nil { + testInfo.cancel() + } + + t.Log("") + t.Log("================================================") + t.Log("Graceful Shutdown - Early Termination") + t.Log("================================================") + t.Log("") + + elapsed := time.Since(testInfo.StartTime) + + // Stop Litestream gracefully + t.Log("Stopping Litestream...") + if err := testInfo.DB.StopLitestream(); err != nil { + t.Logf("Warning: Error stopping Litestream: %v", err) + } else { + t.Log("✓ Litestream stopped") + } + + // Wait for pending operations + t.Log("Waiting for pending operations to complete...") + time.Sleep(2 * time.Second) + + // Show partial results + t.Log("") + t.Log("Partial Test Results:") + t.Logf(" Test duration: %v (%.0f%% of planned %v)", + elapsed.Round(time.Second), + float64(elapsed)/float64(testInfo.Duration)*100, + testInfo.Duration.Round(time.Minute)) + + if dbSize, err := testInfo.DB.GetDatabaseSize(); err == nil { + t.Logf(" Database size: %.2f MB", float64(dbSize)/(1024*1024)) + } + + if rowCount, err := testInfo.DB.GetRowCount("load_test"); err == nil { + t.Logf(" Rows inserted: %d", rowCount) + if elapsed.Seconds() > 0 { + rate := float64(rowCount) / elapsed.Seconds() + t.Logf(" Average write rate: %.1f rows/second", rate) + } + } + + if fileCount, err := testInfo.DB.GetReplicaFileCount(); err == nil { + t.Logf(" Replica LTX files: %d", fileCount) + } + + // Run abbreviated analysis + t.Log("") + t.Log("Analyzing partial test data...") + analysis := AnalyzeSoakTest(t, testInfo.DB, elapsed) + + t.Log("") + t.Log("What Was Validated (Partial):") + if analysis.SnapshotCount > 0 { + t.Logf(" ✓ Snapshots: %d generated", analysis.SnapshotCount) + } + if analysis.TotalCompactions > 0 { + t.Logf(" ✓ Compactions: %d completed", analysis.TotalCompactions) + } + if analysis.DatabaseRows > 0 { + t.Logf(" ✓ Data written: %d rows", analysis.DatabaseRows) + } + + // Check for errors + errors, _ := testInfo.DB.CheckForErrors() + criticalErrors := 0 + for _, errLine := range errors { + if !strings.Contains(errLine, "page size not initialized") { + criticalErrors++ + } + } + t.Logf(" Critical errors: %d", criticalErrors) + + // Show where data is preserved + t.Log("") + t.Log("Test artifacts preserved at:") + t.Logf(" %s", testInfo.DB.TempDir) + + if logPath, err := testInfo.DB.GetLitestreamLog(); err == nil { + t.Logf(" Log: %s", logPath) + } + + t.Log("") + t.Log("Test terminated early by user.") + t.Log("================================================") + + // Mark test as failed (early termination) + t.Fail() +} + +// getErrorStats categorizes and counts errors +func getErrorStats(db *TestDB) ErrorStats { + errors, _ := db.CheckForErrors() + stats := ErrorStats{ + TotalCount: len(errors), + ErrorsByType: make(map[string]int), + } + + for _, errLine := range errors { + switch { + case strings.Contains(errLine, "page size not initialized"): + stats.BenignCount++ + stats.ErrorsByType["page size not initialized"]++ + case strings.Contains(errLine, "connection refused"): + stats.BenignCount++ + stats.ErrorsByType["connection refused"]++ + case strings.Contains(errLine, "context canceled"): + stats.BenignCount++ + stats.ErrorsByType["context canceled"]++ + 
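// Anything not matched above is treated as critical and tallied by rough type below. +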
default: + stats.CriticalCount++ + if len(stats.RecentErrors) < 5 { + stats.RecentErrors = append(stats.RecentErrors, errLine) + } + + switch { + case strings.Contains(errLine, "timeout"): + stats.ErrorsByType["timeout"]++ + case strings.Contains(errLine, "compaction failed"): + stats.ErrorsByType["compaction failed"]++ + default: + stats.ErrorsByType["other"]++ + } + } + } + + return stats +} + +// printProgress displays progress bar with error status +func printProgress(t *testing.T, elapsed, total time.Duration, errorStats ErrorStats) { + t.Helper() + + if total <= 0 { + total = time.Second + } + + if elapsed < 0 { + elapsed = 0 + } + + pct := float64(elapsed) / float64(total) * 100 + if pct > 100 { + pct = 100 + } else if pct < 0 { + pct = 0 + } + + remaining := total - elapsed + if remaining < 0 { + remaining = 0 + } + + // Progress bar + barWidth := 40 + filled := 0 + if total.Seconds() > 0 { + ratio := elapsed.Seconds() / total.Seconds() + if ratio < 0 { + ratio = 0 + } else if ratio > 1 { + ratio = 1 + } + filled = int(float64(barWidth) * ratio) + } + if filled > barWidth { + filled = barWidth + } + if filled < 0 { + filled = 0 + } + bar := strings.Repeat("█", filled) + strings.Repeat("░", barWidth-filled) + + // Status indicator + status := "✓" + if errorStats.CriticalCount > 0 { + status = "⚠" + } + + t.Logf("%s Progress: [%s] %.0f%% | %v elapsed | %v remaining | Errors: %d/%d", + status, bar, pct, + elapsed.Round(time.Minute), remaining.Round(time.Minute), + errorStats.CriticalCount, errorStats.TotalCount) +} + +// printErrorDetails displays detailed error information +func printErrorDetails(t *testing.T, errorStats ErrorStats) { + t.Helper() + + t.Log("") + t.Log("⚠ Error Status:") + t.Logf(" Total: %d (%d critical, %d benign)", errorStats.TotalCount, errorStats.CriticalCount, errorStats.BenignCount) + + // Group critical errors by type + if errorStats.CriticalCount > 0 { + t.Log(" Critical errors:") + for errorType, count := range errorStats.ErrorsByType { + if errorType != "page size not initialized" && count > 0 { + t.Logf(" • %q (%d)", errorType, count) + } + } + + // Show recent errors + if len(errorStats.RecentErrors) > 0 { + t.Log("") + t.Log(" Recent errors:") + for _, errLine := range errorStats.RecentErrors { + // Extract just the error message + if idx := strings.Index(errLine, "error="); idx != -1 { + msg := errLine[idx+7:] + if len(msg) > 80 { + msg = msg[:80] + "..." 
+ } + t.Logf(" %s", msg) + } + } + } + } + + // Show benign errors if present + if errorStats.BenignCount > 0 { + t.Log("") + t.Logf(" Benign: %q (%d)", "page size not initialized", errorStats.BenignCount) + } +} + +// shouldAbortTest checks if test should auto-abort due to critical issues +func shouldAbortTest(errorStats ErrorStats, fileCount int, elapsed time.Duration) (bool, string) { + // Abort if critical error threshold exceeded after extended runtime + if elapsed > 10*time.Minute && errorStats.CriticalCount > 100 { + return true, fmt.Sprintf("Critical error threshold exceeded (%d errors)", errorStats.CriticalCount) + } + + // Abort if replication completely stopped (0 files after 10 minutes) + if elapsed > 10*time.Minute && fileCount == 0 { + return true, "Replication not working (0 files created after 10 minutes)" + } + + // Abort if error rate is increasing rapidly (>1 error/minute) + if errorStats.CriticalCount > 0 && elapsed > 30*time.Minute { + minutes := elapsed.Minutes() + if minutes > 0 { + errorRate := float64(errorStats.CriticalCount) / minutes + if errorRate > 2.0 { + return true, fmt.Sprintf("Error rate too high (%.1f errors/minute)", errorRate) + } + } + } + + return false, "" +} + +// MonitorSoakTest monitors a soak test, calling metricsFunc every 60 seconds +func MonitorSoakTest(t *testing.T, db *TestDB, ctx context.Context, info *TestInfo, refresh func(), logFunc func()) { + t.Helper() + + if info == nil { + info = &TestInfo{} + } + + ticker := time.NewTicker(60 * time.Second) + defer ticker.Stop() + + lastCritical := -1 + lastTotal := -1 + lastProgress := -1.0 + + for { + select { + case <-ctx.Done(): + if refresh != nil { + refresh() + } + if info != nil { + // Show final progress snapshot + errorStats := getErrorStats(db) + if lastProgress < 0 || lastProgress < 100 || errorStats.CriticalCount != lastCritical || errorStats.TotalCount != lastTotal { + printProgress(t, info.Duration, info.Duration, errorStats) + t.Logf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + t.Logf("[%s] Status Report", time.Now().Format("15:04:05")) + t.Logf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + if logFunc != nil { + logFunc() + } + if errorStats.CriticalCount > 0 { + printErrorDetails(t, errorStats) + } + } + } + t.Log("Monitoring stopped: test duration completed") + return + case <-ticker.C: + if refresh != nil { + refresh() + } + + elapsed := time.Since(info.StartTime) + if elapsed < 0 { + elapsed = 0 + } + + errorStats := getErrorStats(db) + + if shouldAbort, reason := shouldAbortTest(errorStats, info.FileCount, elapsed); shouldAbort { + t.Logf("") + t.Logf("⚠ AUTO-ABORTING TEST: %s", reason) + if info.cancel != nil { + info.cancel() + } + t.Fail() + return + } + + totalDuration := info.Duration + if totalDuration <= 0 { + totalDuration = time.Second + } + progress := elapsed.Seconds() / totalDuration.Seconds() * 100 + if progress < 0 { + progress = 0 + } else if progress > 100 { + progress = 100 + } + + shouldLog := false + if lastCritical == -1 && lastTotal == -1 { + shouldLog = true + } + + if !shouldLog && (errorStats.CriticalCount != lastCritical || errorStats.TotalCount != lastTotal) { + shouldLog = true + } + + if !shouldLog && (lastProgress < 0 || progress >= lastProgress+5 || progress >= 100) { + shouldLog = true + } + + if !shouldLog { + continue + } + + lastCritical = errorStats.CriticalCount + lastTotal = errorStats.TotalCount + lastProgress = progress + + printProgress(t, elapsed, info.Duration, errorStats) + t.Logf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + t.Logf("[%s] Status 
Report", time.Now().Format("15:04:05")) + t.Logf("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━") + + if logFunc != nil { + logFunc() + } + + if errorStats.CriticalCount > 0 { + printErrorDetails(t, errorStats) + } + + t.Log("") + } + } +} + +// LogSoakMetrics logs basic soak test metrics +func LogSoakMetrics(t *testing.T, db *TestDB, testName string) { + t.Helper() + + // Database size + if dbSize, err := db.GetDatabaseSize(); err == nil { + t.Logf(" Database size: %.2f MB", float64(dbSize)/(1024*1024)) + } + + // WAL size + walPath := db.Path + "-wal" + if info, err := os.Stat(walPath); err == nil { + t.Logf(" WAL size: %.2f MB", float64(info.Size())/(1024*1024)) + } + + // Row count + if count, err := db.GetRowCount("load_test"); err == nil { + t.Logf(" Rows: %d", count) + } else if count, err := db.GetRowCount("test_table_0"); err == nil { + t.Logf(" Rows: %d", count) + } + + // Replica stats + if fileCount, err := db.GetReplicaFileCount(); err == nil { + t.Logf(" Replica LTX files: %d", fileCount) + } + + // Error check - filter out known benign errors + if errors, err := db.CheckForErrors(); err == nil && len(errors) > 0 { + criticalErrors := []string{} + for _, errLine := range errors { + if !strings.Contains(errLine, "page size not initialized") { + criticalErrors = append(criticalErrors, errLine) + } + } + if len(criticalErrors) > 0 { + t.Logf(" ⚠ Critical errors detected: %d", len(criticalErrors)) + if len(criticalErrors) <= 2 { + for _, errLine := range criticalErrors { + t.Logf(" %s", errLine) + } + } + } + } +} + +// SoakTestAnalysis holds detailed soak test metrics +type SoakTestAnalysis struct { + CompactionsByLevel map[int]int + TotalCompactions int + SnapshotCount int + CheckpointCount int + TotalFilesCreated int + FinalFileCount int + MinTxID string + MaxTxID string + DatabaseRows int64 + MinRowID int64 + MaxRowID int64 + DatabaseSizeMB float64 + Duration time.Duration +} + +// AnalyzeSoakTest analyzes test results from logs and database +func AnalyzeSoakTest(t *testing.T, db *TestDB, duration time.Duration) *SoakTestAnalysis { + t.Helper() + + analysis := &SoakTestAnalysis{ + CompactionsByLevel: make(map[int]int), + Duration: duration, + } + + // Get database stats + if count, err := db.GetRowCount("load_test"); err == nil { + analysis.DatabaseRows = int64(count) + } + + if dbSize, err := db.GetDatabaseSize(); err == nil { + analysis.DatabaseSizeMB = float64(dbSize) / (1024 * 1024) + } + + // Get row ID range + sqlDB, err := sql.Open("sqlite3", db.Path) + if err == nil { + defer sqlDB.Close() + sqlDB.QueryRow("SELECT MIN(id), MAX(id) FROM load_test").Scan(&analysis.MinRowID, &analysis.MaxRowID) + } + + // Get final file count + if count, err := db.GetReplicaFileCount(); err == nil { + analysis.FinalFileCount = count + } + + // Parse litestream log + logPath, _ := db.GetLitestreamLog() + if logPath != "" { + parseLog(logPath, analysis) + } + + return analysis +} + +func parseLog(logPath string, analysis *SoakTestAnalysis) { + file, err := os.Open(logPath) + if err != nil { + return + } + defer file.Close() + + scanner := bufio.NewScanner(file) + var firstTxID, lastTxID string + + for scanner.Scan() { + line := scanner.Text() + + if strings.Contains(line, "compaction complete") { + analysis.TotalCompactions++ + + // Extract level + if idx := strings.Index(line, "level="); idx != -1 { + levelStr := line[idx+6:] + if spaceIdx := strings.Index(levelStr, " "); spaceIdx != -1 { + levelStr = levelStr[:spaceIdx] + } + if level, err := strconv.Atoi(levelStr); err == nil { + 
analysis.CompactionsByLevel[level]++ + } + } + + // Extract transaction IDs + if idx := strings.Index(line, "txid.min="); idx != -1 { + txMin := line[idx+9 : idx+25] + if firstTxID == "" { + firstTxID = txMin + } + } + if idx := strings.Index(line, "txid.max="); idx != -1 { + txMax := line[idx+9 : idx+25] + lastTxID = txMax + } + } + + if strings.Contains(line, "snapshot complete") { + analysis.SnapshotCount++ + } + + if strings.Contains(line, "checkpoint complete") { + analysis.CheckpointCount++ + } + } + + analysis.MinTxID = firstTxID + analysis.MaxTxID = lastTxID + + // Count all LTX files ever created (from txid range) + if analysis.MaxTxID != "" { + if maxID, err := strconv.ParseInt(analysis.MaxTxID, 16, 64); err == nil { + analysis.TotalFilesCreated = int(maxID) + } + } +} + +// PrintSoakTestAnalysis prints detailed analysis and plain English summary +func PrintSoakTestAnalysis(t *testing.T, analysis *SoakTestAnalysis) { + t.Helper() + + t.Log("") + t.Log("================================================") + t.Log("Detailed Test Metrics") + t.Log("================================================") + t.Log("") + + // Compaction breakdown + t.Log("Compaction Activity:") + t.Logf(" Total compactions: %d", analysis.TotalCompactions) + levels := []int{1, 2, 3, 4, 5} + for _, level := range levels { + if count := analysis.CompactionsByLevel[level]; count > 0 { + t.Logf(" Level %d: %d compactions", level, count) + } + } + t.Log("") + + // File operations + t.Log("File Operations:") + t.Logf(" Total LTX files created: %d", analysis.TotalFilesCreated) + if analysis.TotalFilesCreated > 0 { + t.Logf(" Final file count: %d (%.1f%% reduction)", + analysis.FinalFileCount, + 100.0*float64(analysis.TotalFilesCreated-analysis.FinalFileCount)/float64(analysis.TotalFilesCreated)) + } + t.Logf(" Snapshots generated: %d", analysis.SnapshotCount) + if analysis.CheckpointCount > 0 { + t.Logf(" Checkpoints: %d", analysis.CheckpointCount) + } + t.Log("") + + // Database activity + t.Log("Database Activity:") + t.Logf(" Total rows: %d", analysis.DatabaseRows) + t.Logf(" Row ID range: %d → %d", analysis.MinRowID, analysis.MaxRowID) + gapCount := (analysis.MaxRowID - analysis.MinRowID + 1) - analysis.DatabaseRows + if gapCount == 0 { + t.Log(" Row continuity: ✓ No gaps (perfect)") + } else { + t.Logf(" Row continuity: %d gaps detected", gapCount) + } + t.Logf(" Final database size: %.2f MB", analysis.DatabaseSizeMB) + if analysis.Duration.Seconds() > 0 { + avgRate := float64(analysis.DatabaseRows) / analysis.Duration.Seconds() + t.Logf(" Average write rate: %.1f rows/second", avgRate) + } + t.Log("") + + // Transaction range + if analysis.MinTxID != "" && analysis.MaxTxID != "" { + t.Log("Replication Range:") + t.Logf(" First transaction: %s", analysis.MinTxID) + t.Logf(" Last transaction: %s", analysis.MaxTxID) + t.Log("") + } + + // Plain English summary + t.Log("================================================") + t.Log("What This Test Validated") + t.Log("================================================") + t.Log("") + + t.Logf("✓ Long-term Stability") + t.Logf(" Litestream ran flawlessly for %v under sustained load", analysis.Duration.Round(time.Minute)) + t.Log("") + + t.Log("✓ Snapshot Generation") + t.Logf(" %d snapshots created successfully", analysis.SnapshotCount) + t.Log("") + + t.Log("✓ Compaction Efficiency") + if analysis.TotalFilesCreated > 0 { + reductionPct := 100.0 * float64(analysis.TotalFilesCreated-analysis.FinalFileCount) / float64(analysis.TotalFilesCreated) + t.Logf(" Reduced %d files 
to %d (%.0f%% reduction through compaction)", + analysis.TotalFilesCreated, analysis.FinalFileCount, reductionPct) + } + t.Log("") + + if analysis.DatabaseSizeMB > 1000 { + t.Log("✓ Large Database Handling") + t.Logf(" Successfully replicated %.1f GB database", analysis.DatabaseSizeMB/1024) + t.Log("") + } + + t.Log("✓ Restoration Capability") + t.Log(" Full restore from replica completed successfully") + t.Log("") + + t.Log("✓ Data Integrity") + t.Log(" SQLite integrity check confirmed no corruption") + if gapCount == 0 { + t.Log(" All rows present with perfect continuity") + } + t.Log("") +}
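+
+// Usage sketch (not part of the suite): the helpers above are gated behind the
+// "integration" and "soak" build tags, so a soak run is assumed to be invoked
+// roughly as follows; the -run pattern and timeout are placeholders, since the
+// actual soak test names are defined elsewhere in this package:
+//
+//	SOAK_AUTO_PURGE=yes go test -v -tags="integration,soak" -timeout=12h \
+//	    ./tests/integration/... -run="TestSoak"
+//
+// A soak test built on these helpers would typically wire them together along
+// these lines (names such as "soak-demo" and the 1h duration are illustrative only):
+//
+//	ctx, cancel := context.WithCancel(context.Background())
+//	db := SetupTestDB(t, "soak-demo")
+//	info := &TestInfo{StartTime: time.Now(), Duration: time.Hour, DB: db, cancel: cancel}
+//	setupSignalHandler(t, cancel, info)
+//	go MonitorSoakTest(t, db, ctx, info, nil, func() { LogSoakMetrics(t, db, "soak-demo") })
+//	// ... drive writes until the duration elapses or ctx is canceled ...
+//	PrintSoakTestAnalysis(t, AnalyzeSoakTest(t, db, time.Since(info.StartTime)))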