# CLI Scripting Patterns

Learn how to use SQLStream effectively in shell scripts, automation workflows, and CI/CD pipelines.
## Exit Codes

SQLStream follows standard Unix exit code conventions:

- `0`: Success
- `1`: General error (SQL syntax error, file not found, etc.)
- `2`: Invalid command-line arguments
### Using Exit Codes in Scripts

```bash
#!/bin/bash
if sqlstream query data.csv "SELECT * FROM data WHERE id = 1" > /dev/null 2>&1; then
    echo "✓ Query succeeded"
else
    echo "✗ Query failed with exit code $?"
    exit 1
fi
```
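To branch on the specific code rather than just success or failure, check `$?` immediately after the command. A minimal sketch, assuming the conventions listed above:

```bash
#!/bin/bash
sqlstream query data.csv "SELECT * FROM data" > /dev/null 2>&1
case $? in
    0) echo "✓ Query succeeded" ;;
    1) echo "✗ Query error (bad SQL or missing file)" ;;
    2) echo "✗ Invalid command-line arguments" ;;
    *) echo "✗ Unexpected exit code" ;;
esac
```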
### Error Handling Pattern

```bash
#!/bin/bash
set -e  # Exit on any error

# Query will fail if the file doesn't exist or the SQL is invalid
RESULT=$(sqlstream query data.csv "SELECT COUNT(*) FROM data" --format json 2>&1) || {
    echo "ERROR: Query failed"
    echo "$RESULT"
    exit 1
}
echo "Query succeeded: $RESULT"
```
## Output Capture & Processing

### Capture JSON Output

```bash
#!/bin/bash
# Capture JSON and process with jq
RESULT=$(sqlstream query employees.csv "SELECT * FROM employees WHERE salary > 80000" --format json)

# Count high earners
COUNT=$(echo "$RESULT" | jq 'length')
echo "High earners: $COUNT"

# Extract names
echo "$RESULT" | jq -r '.[] | .name'
```
### Capture CSV Output

```bash
#!/bin/bash
# Save to file
sqlstream query data.csv "SELECT * FROM data WHERE status = 'pending'" --format csv > pending_items.csv

# Count lines (excluding header)
LINE_COUNT=$(($(wc -l < pending_items.csv) - 1))
echo "Pending items: $LINE_COUNT"
```
### Extract a Scalar Value

```bash
#!/bin/bash
# Pull a single value out of the result set via JSON output and jq
TOTAL=$(sqlstream query sales.csv "SELECT SUM(amount) as total FROM sales" --format json | jq -r '.[0].total')
echo "Total sales: \$$TOTAL"
```
## Piping & Chaining

### Pipe to Unix Tools

```bash
# Pipe to grep
sqlstream query logs.csv "SELECT * FROM logs" --format csv | grep ERROR

# Pipe to awk
sqlstream query data.csv "SELECT * FROM data" --format csv | awk -F, '$3 > 100 {print $1, $3}'

# Pipe to sort
sqlstream query data.csv "SELECT name, amount FROM data" --format csv | sort -t, -k2 -nr

# Pipe to jq for complex JSON processing
sqlstream query data.csv "SELECT * FROM data" --format json | \
    jq '.[] | select(.status == "active") | {id, name, amount}'
```
### Chain Multiple Queries

```bash
#!/bin/bash
# Extract IDs from the first query
IDS=$(sqlstream query employees.csv "SELECT id FROM employees WHERE department = 'Engineering'" --format json | jq -r '.[] | .id')

# Build an IN clause for the second query; bail out if nothing matched,
# since "IN ()" is invalid SQL
ID_LIST=$(echo "$IDS" | tr '\n' ',' | sed 's/,$//')
[ -z "$ID_LIST" ] && { echo "No matching employees"; exit 0; }

sqlstream query orders.csv "SELECT * FROM orders WHERE employee_id IN ($ID_LIST)" --format json
```
### Multi-Stage Pipeline

```bash
#!/bin/bash
# Stage 1: Extract active users
sqlstream query users.csv "SELECT id, name FROM users WHERE status = 'active'" --format csv > active_users.csv

# Stage 2: Get their orders
sqlstream query "
    SELECT u.name, o.amount
    FROM 'active_users.csv' u
    JOIN 'orders.csv' o ON u.id = o.user_id
" --format csv > active_user_orders.csv

# Stage 3: Aggregate
sqlstream query "
    SELECT name, SUM(amount) as total
    FROM 'active_user_orders.csv'
    GROUP BY name
" --format csv > summary.csv
```
## Automation Patterns

### Cron Jobs

```bash
# crontab -e

# Run daily at 2 AM
0 2 * * * /home/user/scripts/daily_report.sh

# Run every hour
0 * * * * sqlstream query /data/logs.csv "SELECT * FROM logs WHERE level = 'ERROR' AND timestamp > datetime('now', '-1 hour')" --format json > /reports/hourly_errors_$(date +\%Y\%m\%d_\%H).json

# Run every Monday at 9 AM
0 9 * * 1 sqlstream query /data/sales.csv "SELECT * FROM sales WHERE date >= date('now', '-7 days')" --format csv > /reports/weekly_sales.csv
```
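If a job can run longer than its schedule interval, runs may overlap. On Linux, `flock` (from util-linux) serializes them; a sketch of the daily entry with a lock:

```bash
# Skip this run if the previous one still holds the lock
0 2 * * * flock -n /tmp/daily_report.lock /home/user/scripts/daily_report.sh
```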
### Daily Report Script

```bash
#!/bin/bash
# daily_report.sh
DATE=$(date +%Y-%m-%d)
REPORT_DIR="/reports/$DATE"
mkdir -p "$REPORT_DIR"

# Generate multiple reports
echo "Generating reports for $DATE..."

# Sales summary
sqlstream query /data/sales.csv "
    SELECT
        date,
        COUNT(*) as order_count,
        SUM(amount) as total_sales
    FROM sales
    WHERE date = '$DATE'
    GROUP BY date
" --format json > "$REPORT_DIR/sales_summary.json"

# Error log
sqlstream query /data/logs.csv "
    SELECT * FROM logs
    WHERE date = '$DATE' AND level = 'ERROR'
" --format csv > "$REPORT_DIR/errors.csv"

# User activity
sqlstream query /data/users.csv "
    SELECT status, COUNT(*) as count
    FROM users
    GROUP BY status
" --format json > "$REPORT_DIR/user_stats.json"

echo "✓ Reports generated in $REPORT_DIR"
```
### Monitoring Script

```bash
#!/bin/bash
# monitor_data_quality.sh
set -e
ISSUES=0

# Check for null values
NULL_COUNT=$(sqlstream query data.csv "SELECT COUNT(*) as count FROM data WHERE important_field IS NULL" --format json | jq -r '.[0].count')
if [ "$NULL_COUNT" -gt 0 ]; then
    echo "⚠ WARNING: Found $NULL_COUNT null values in important_field"
    ISSUES=$((ISSUES + 1))
    # Send alert
    curl -X POST "https://alerts.example.com/webhook" \
        -H "Content-Type: application/json" \
        -d "{\"message\": \"Data quality issue: $NULL_COUNT null values\"}"
fi

# Check for duplicates
DUP_COUNT=$(sqlstream query data.csv "SELECT id, COUNT(*) as count FROM data GROUP BY id HAVING count > 1" --format json | jq 'length')
if [ "$DUP_COUNT" -gt 0 ]; then
    echo "⚠ WARNING: Found $DUP_COUNT duplicate IDs"
    ISSUES=$((ISSUES + 1))
fi

# Only report success when no check flagged anything
if [ "$ISSUES" -eq 0 ]; then
    echo "✓ Data quality checks passed"
else
    exit 1
fi
```
## CI/CD Integration

### GitHub Actions

```yaml
# .github/workflows/data-validation.yml
name: Data Validation

on:
  push:
    paths:
      - 'data/**/*.csv'
  schedule:
    - cron: '0 0 * * *'  # Daily at midnight

jobs:
  validate:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.11'

      - name: Install SQLStream
        run: pip install "sqlstream[all]"

      - name: Validate data schema
        run: |
          sqlstream query data/users.csv "SELECT * FROM users LIMIT 1" --format json
          sqlstream query data/orders.csv "SELECT * FROM orders LIMIT 1" --format json

      - name: Check data quality
        run: |
          # No null emails
          COUNT=$(sqlstream query data/users.csv "SELECT COUNT(*) as count FROM users WHERE email IS NULL" --format json | jq -r '.[0].count')
          if [ "$COUNT" -gt 0 ]; then
            echo "ERROR: Found $COUNT users with null emails"
            exit 1
          fi

      - name: Generate test report
        run: |
          sqlstream query data/orders.csv "
            SELECT
              status,
              COUNT(*) as count,
              SUM(amount) as total
            FROM orders
            GROUP BY status
          " --format json > test-report.json

      - name: Upload report
        uses: actions/upload-artifact@v3
        with:
          name: test-report
          path: test-report.json
```
### GitLab CI

```yaml
# .gitlab-ci.yml
stages:
  - validate
  - report

validate_data:
  stage: validate
  image: python:3.11
  before_script:
    - pip install "sqlstream[all]"
  script:
    - sqlstream query data.csv "SELECT COUNT(*) FROM data" --format json
    - |
      # Check for errors
      ERROR_COUNT=$(sqlstream query logs.csv "SELECT COUNT(*) as count FROM logs WHERE level = 'ERROR'" --format json | jq -r '.[0].count')
      if [ "$ERROR_COUNT" -gt 100 ]; then
        echo "Too many errors: $ERROR_COUNT"
        exit 1
      fi

generate_report:
  stage: report
  image: python:3.11
  before_script:
    - pip install "sqlstream[all]"
  script:
    - sqlstream query data.csv "SELECT * FROM data" --format csv > report.csv
  artifacts:
    paths:
      - report.csv
    expire_in: 1 week
```
## Environment Variables

Use environment variables for configuration:

```bash
#!/bin/bash
# Configuration with defaults
DATA_DIR=${SQLSTREAM_DATA_DIR:-"/data"}
OUTPUT_DIR=${SQLSTREAM_OUTPUT_DIR:-"/output"}
BACKEND=${SQLSTREAM_BACKEND:-"duckdb"}

# Query with environment config
sqlstream query "$DATA_DIR/data.csv" "SELECT * FROM data" \
    --backend "$BACKEND" \
    --format csv > "$OUTPUT_DIR/results.csv"
```
## Error Handling Best Practices

### Validate Inputs

```bash
#!/bin/bash
# Check that the file exists
if [ ! -f "data.csv" ]; then
    echo "ERROR: data.csv not found"
    exit 1
fi

# Check that the file is not empty
if [ ! -s "data.csv" ]; then
    echo "ERROR: data.csv is empty"
    exit 1
fi

# Run query
sqlstream query data.csv "SELECT * FROM data"
```
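The same checks packaged as a reusable function; `require_csv` is a hypothetical helper name, not part of SQLStream:

```bash
# Validate that a file exists and is non-empty before querying it
require_csv() {
    [ -f "$1" ] || { echo "ERROR: $1 not found" >&2; return 1; }
    [ -s "$1" ] || { echo "ERROR: $1 is empty" >&2; return 1; }
}

require_csv data.csv && sqlstream query data.csv "SELECT * FROM data"
```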
### Capture and Log Errors

```bash
#!/bin/bash
set -o pipefail  # Propagate the query's failure through the pipe to tee

LOG_FILE="/var/log/sqlstream/queries.log"

# Run query and log
{
    echo "[$(date)] Starting query..."
    if OUTPUT=$(sqlstream query data.csv "SELECT * FROM data" --format json 2>&1); then
        echo "[$(date)] Query succeeded"
        echo "$OUTPUT"
    else
        echo "[$(date)] Query failed"
        echo "$OUTPUT"
        exit 1
    fi
} | tee -a "$LOG_FILE"
```
### Retry Logic

```bash
#!/bin/bash
MAX_RETRIES=3
RETRY_DELAY=5

for i in $(seq 1 $MAX_RETRIES); do
    # Keep stderr out of the output file so a failed attempt can't corrupt it
    if sqlstream query data.csv "SELECT * FROM data" --format json > output.json 2>/dev/null; then
        echo "✓ Query succeeded"
        exit 0
    else
        echo "✗ Attempt $i failed"
        if [ $i -lt $MAX_RETRIES ]; then
            echo "Retrying in $RETRY_DELAY seconds..."
            sleep $RETRY_DELAY
        fi
    fi
done

echo "ERROR: Query failed after $MAX_RETRIES attempts"
exit 1
```
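A variant with exponential backoff, doubling the delay between attempts; a sketch under the same assumptions:

```bash
#!/bin/bash
MAX_RETRIES=3
DELAY=5

for i in $(seq 1 $MAX_RETRIES); do
    sqlstream query data.csv "SELECT * FROM data" --format json > output.json 2>/dev/null && {
        echo "✓ Query succeeded"
        exit 0
    }
    echo "✗ Attempt $i failed"
    if [ $i -lt $MAX_RETRIES ]; then
        echo "Retrying in $DELAY seconds..."
        sleep "$DELAY"
        DELAY=$((DELAY * 2))  # 5s, then 10s, then 20s...
    fi
done

echo "ERROR: Query failed after $MAX_RETRIES attempts"
exit 1
```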
## Performance Optimization

### Use Appropriate Backend

```bash
# Small files (<100K rows): Python backend (default)
sqlstream query small.csv "SELECT * FROM small"

# Large files (>100K rows): Pandas backend
sqlstream query large.csv "SELECT * FROM large WHERE amount > 1000" --backend pandas

# Complex SQL (CTEs, window functions): DuckDB backend
sqlstream query data.csv "
    WITH ranked AS (
        SELECT *, ROW_NUMBER() OVER (PARTITION BY category ORDER BY amount DESC) as rank
        FROM data
    )
    SELECT * FROM ranked WHERE rank <= 10
" --backend duckdb
```
### Limit Output for Testing

```bash
# Test the query with LIMIT before a full run
sqlstream query huge_file.csv "SELECT * FROM huge_file WHERE condition LIMIT 10" --format json

# If it works, run the full query
sqlstream query huge_file.csv "SELECT * FROM huge_file WHERE condition" --backend pandas --format json > output.json
```
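When it isn't obvious which backend suits a workload, timing a representative query against each is a quick way to decide; a sketch using the shell's `time` keyword:

```bash
# Compare backends on the same query before committing to one
time sqlstream query large.csv "SELECT COUNT(*) FROM large" --backend pandas
time sqlstream query large.csv "SELECT COUNT(*) FROM large" --backend duckdb
```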
## Templates

### Query Template Script

```bash
#!/bin/bash
# query_template.sh - Reusable query script

usage() {
    echo "Usage: $0 <input_file> <output_file> [backend]"
    echo "Example: $0 data.csv output.json duckdb"
    exit 1
}

# Parse arguments (check the count explicitly; a "${1:?$(usage)}" expansion
# runs usage in a subshell, so its exit wouldn't stop the script)
[ $# -lt 2 ] && usage
INPUT_FILE=$1
OUTPUT_FILE=$2
BACKEND=${3:-"auto"}

# Validate input
[ ! -f "$INPUT_FILE" ] && { echo "ERROR: $INPUT_FILE not found"; exit 1; }

# Run query (table name defaults to the file's base name)
sqlstream query "$INPUT_FILE" "SELECT * FROM $(basename "$INPUT_FILE" .csv)" \
    --backend "$BACKEND" \
    --format json > "$OUTPUT_FILE" || {
    echo "ERROR: Query failed"
    exit 1
}

echo "✓ Results saved to $OUTPUT_FILE"
```
## Next Steps

- Query Command Reference - All CLI options
- Output Formats - Format details
- Examples - More script examples
- Performance Guide - Optimization tips