Python API Overview
Use SQLStream programmatically in your Python code.
Basic Usage
from sqlstream import query
# Execute query with explicit source
results = query("data.csv").sql("SELECT * FROM data WHERE age > 25")
# Execute query with inline source (extracted from SQL)
results = query().sql("SELECT * FROM 'data.csv' WHERE age > 25")
# Iterate (lazy)
for row in results:
print(row)
# Or convert to list (eager)
results_list = query().sql("SELECT * FROM 'data.csv'").to_list()
Backend Selection
SQLStream offers three execution backends with different performance characteristics and SQL support:
Auto Backend (Default)
from sqlstream import query
# Automatically selects: pandas > duckdb > python
results = query("data.csv").sql("SELECT * FROM data")
Backend priority: 1. Pandas (if installed) - Fast execution, basic SQL 2. DuckDB (if pandas not available, duckdb installed) - Full SQL, maximum performance 3. Python (fallback) - Educational, Volcano model
Explicit Backend
# Force DuckDB backend (full SQL support)
results = query("data.csv").sql("""
SELECT
department,
AVG(salary) as avg_salary,
ROW_NUMBER() OVER (ORDER BY AVG(salary) DESC) as rank
FROM data
GROUP BY department
""", backend="duckdb")
# Force Pandas backend (10-100x faster than Python)
results = query("data.csv").sql(
"SELECT * FROM data WHERE age > 25",
backend="pandas"
)
# Force Python backend (educational)
results = query("data.csv").sql(
"SELECT * FROM data WHERE age > 25",
backend="python"
)
When to use each backend:
- backend="duckdb" - Complex SQL (CTEs, window functions), production workloads, maximum performance
- backend="pandas" - Balance of speed and simplicity, basic queries
- backend="python" - Learning query execution, understanding Volcano model
- backend="auto" (default) - Let SQLStream choose the best available
Working with Results
Lazy Iteration (Memory-Efficient)
# Process rows one at a time (doesn't load all into memory)
results = query("large_file.csv").sql("SELECT * FROM large_file")
for row in results:
# Each row is a dictionary
print(f"{row['name']}: {row['value']}")
Eager Loading
# Load all results into a list
results_list = query("data.csv").sql("SELECT * FROM data").to_list()
# Now you can access by index, slice, etc.
first_row = results_list[0]
top_10 = results_list[:10]
Integration with Other Libraries
# Convert to pandas DataFrame
import pandas as pd
results = query("data.csv").sql("SELECT * FROM data")
df = pd.DataFrame(results.to_list())
# Convert to JSON
import json
results = query("data.csv").sql("SELECT * FROM data")
json_str = json.dumps(results.to_list(), indent=2)
# Write to CSV
import csv
results = query("data.csv").sql("SELECT * FROM data")
with open("output.csv", "w", newline="") as f:
if results.to_list():
writer = csv.DictWriter(f, fieldnames=results.to_list()[0].keys())
writer.writeheader()
writer.writerows(results.to_list())
Multiple File Queries
Inline File Paths (Recommended)
# JOIN multiple files
results = query().sql("""
SELECT
e.name,
e.salary,
d.department_name
FROM 'employees.csv' e
JOIN 'departments.csv' d ON e.dept_id = d.id
WHERE e.salary > 75000
""")
Mixed Sources
# Query from different sources
results = query().sql("""
SELECT
local.*,
remote.category
FROM 'data/local.csv' local
JOIN 's3://bucket/remote.parquet' remote
ON local.id = remote.id
""", backend="duckdb")
Advanced Usage
Query Explain Plan
# View query execution plan
results = query("data.csv").sql(
"SELECT * FROM data WHERE age > 25",
backend="python"
)
# Get explain plan
plan = results.explain()
print(plan)
Example output:
S3 and HTTP Sources
# Query S3 data
results = query("s3://my-bucket/data.parquet").sql("""
SELECT * FROM data
WHERE date > '2024-01-01'
LIMIT 100
""", backend="duckdb")
# Query HTTP URL
results = query("https://example.com/data.csv").sql("""
SELECT category, COUNT(*) as count
FROM data
GROUP BY category
""")
Complex Analytics
# Multi-step analytics with DuckDB
results = query().sql("""
WITH monthly_sales AS (
SELECT
DATE_TRUNC('month', sale_date) as month,
product_id,
SUM(amount) as total_sales
FROM 's3://bucket/sales.parquet'
GROUP BY 1, 2
),
ranked AS (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY month
ORDER BY total_sales DESC
) as rank
FROM monthly_sales
)
SELECT * FROM ranked WHERE rank <= 10
""", backend="duckdb")
for row in results:
print(f"{row['month']}: {row['product_id']} - ${row['total_sales']:,.2f}")
Error Handling
from sqlstream import query
try:
results = query("data.csv").sql(
"SELECT * FROM data WHERE invalid_column > 10"
)
for row in results:
print(row)
except FileNotFoundError:
print("File not found")
except ValueError as e:
print(f"Query error: {e}")
except Exception as e:
print(f"Unexpected error: {e}")
Best Practices
1. Use Lazy Iteration for Large Files
# ✅ Good: Memory-efficient
for row in query("huge.csv").sql("SELECT * FROM huge"):
process(row)
# ❌ Bad: Loads everything into memory
all_rows = query("huge.csv").sql("SELECT * FROM huge").to_list()
2. Choose the Right Backend
# ✅ Good: DuckDB for complex SQL
results = query().sql("""
WITH stats AS (SELECT AVG(value) as avg FROM 'data.csv')
SELECT * FROM 'data.csv' WHERE value > (SELECT avg FROM stats)
""", backend="duckdb")
# ❌ Bad: Python backend doesn't support CTEs
# results = query().sql("WITH ...", backend="python") # Will fail
3. Use Inline Paths for Multi-File Queries
# ✅ Good: Clear and simple
results = query().sql("""
SELECT * FROM 'file1.csv' a
JOIN 'file2.csv' b ON a.id = b.id
""")
# ❌ Less convenient: Multiple query objects
# q1 = query("file1.csv")
# q2 = query("file2.csv")
# ...
Next Steps
- Advanced Python Guides - In-depth Python API usage
- SQL Support - Supported SQL syntax
- Backend Comparison - Choose the right backend
- Examples - Real-world code examples