Skip to content

SQL Parser Reference

SQL parsing and AST generation.

AggregateFunction

AggregateFunction dataclass

Represents an aggregate function in SELECT clause

Examples:

COUNT(*), COUNT(id), SUM(amount), AVG(price), MIN(age), MAX(age)

Source code in sqlstream/sql/ast_nodes.py
@dataclass
class AggregateFunction:
    """
    Represents an aggregate function in SELECT clause

    Examples:
        COUNT(*), COUNT(id), SUM(amount), AVG(price), MIN(age), MAX(age)
    """

    function: str  # 'COUNT', 'SUM', 'AVG', 'MIN', 'MAX'
    column: str  # Column name, or '*' for COUNT(*)
    alias: str | None = None  # AS alias

    def __repr__(self) -> str:
        result = f"{self.function}({self.column})"
        if self.alias:
            result += f" AS {self.alias}"
        return result

Condition

Condition dataclass

A single WHERE condition: column operator value

Source code in sqlstream/sql/ast_nodes.py
@dataclass
class Condition:
    """A single WHERE condition: column operator value"""

    column: str
    operator: str  # '=', '>', '<', '>=', '<=', '!=', 'IN'
    value: Any

    def __repr__(self) -> str:
        return f"{self.column} {self.operator} {self.value}"

JoinClause

JoinClause dataclass

Represents a JOIN clause

Examples:

INNER JOIN orders ON customers.id = orders.customer_id LEFT JOIN products ON orders.product_id = products.id

Source code in sqlstream/sql/ast_nodes.py
@dataclass
class JoinClause:
    """
    Represents a JOIN clause

    Examples:
        INNER JOIN orders ON customers.id = orders.customer_id
        LEFT JOIN products ON orders.product_id = products.id
    """

    right_source: str  # Right table/file name
    join_type: str  # 'INNER', 'LEFT', 'RIGHT'
    on_left: str  # Left column in join condition
    on_right: str  # Right column in join condition

    def __repr__(self) -> str:
        return f"{self.join_type} JOIN {self.right_source} ON {self.on_left} = {self.on_right}"

OrderByColumn

OrderByColumn dataclass

Represents a column in ORDER BY clause

Examples:

name ASC, age DESC

Source code in sqlstream/sql/ast_nodes.py
@dataclass
class OrderByColumn:
    """
    Represents a column in ORDER BY clause

    Examples:
        name ASC, age DESC
    """

    column: str
    direction: str = "ASC"  # 'ASC' or 'DESC', default ASC

    def __repr__(self) -> str:
        return f"{self.column} {self.direction}"

SelectStatement

SelectStatement dataclass

Represents a complete SELECT statement

Examples:

SELECT * FROM data SELECT name, age FROM data WHERE age > 25 SELECT * FROM data WHERE age > 25 LIMIT 10 SELECT city, COUNT(*) FROM data GROUP BY city SELECT * FROM data ORDER BY age DESC LIMIT 10 SELECT * FROM customers INNER JOIN orders ON customers.id = orders.customer_id

Source code in sqlstream/sql/ast_nodes.py
@dataclass
class SelectStatement:
    """
    Represents a complete SELECT statement

    Examples:
        SELECT * FROM data
        SELECT name, age FROM data WHERE age > 25
        SELECT * FROM data WHERE age > 25 LIMIT 10
        SELECT city, COUNT(*) FROM data GROUP BY city
        SELECT * FROM data ORDER BY age DESC LIMIT 10
        SELECT * FROM customers INNER JOIN orders ON customers.id = orders.customer_id
    """

    columns: list[str]  # ['*'] for all columns, or specific column names
    source: str  # Table/file name (FROM clause)
    where: WhereClause | None = None
    group_by: list[str] | None = None
    order_by: list[OrderByColumn] | None = None
    limit: int | None = None
    aggregates: list[AggregateFunction] | None = None  # Aggregate functions in SELECT
    join: JoinClause | None = None  # JOIN clause

    def __repr__(self) -> str:
        parts = [f"SELECT {', '.join(self.columns)}"]
        parts.append(f"FROM {self.source}")
        if self.where:
            parts.append(f"WHERE {self.where}")
        if self.group_by:
            parts.append(f"GROUP BY {', '.join(self.group_by)}")
        if self.order_by:
            parts.append(f"ORDER BY {', '.join(str(col) for col in self.order_by)}")
        if self.join:
            parts.append(str(self.join))
        if self.limit:
            parts.append(f"LIMIT {self.limit}")
        return " ".join(parts)

WhereClause

WhereClause dataclass

WHERE clause containing multiple conditions

Source code in sqlstream/sql/ast_nodes.py
@dataclass
class WhereClause:
    """WHERE clause containing multiple conditions"""

    conditions: list[Condition]

    def __repr__(self) -> str:
        return " AND ".join(str(c) for c in self.conditions)

ParseError

ParseError

Bases: Exception

Raised when SQL parsing fails

Source code in sqlstream/sql/parser.py
class ParseError(Exception):
    """Raised when SQL parsing fails"""

    pass

SQLParser

SQLParser

Simple recursive descent parser for SQL

Grammar (simplified): SELECT_STMT := SELECT columns FROM source [WHERE conditions][LIMIT n] columns := * | column_name [, column_name] conditions := condition [AND condition] condition := column_name operator value operator := = | > | < | >= | <= | !=

Source code in sqlstream/sql/parser.py
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
class SQLParser:
    """
    Simple recursive descent parser for SQL

    Grammar (simplified):
        SELECT_STMT := SELECT columns FROM source [WHERE conditions] [LIMIT n]
        columns     := * | column_name [, column_name]*
        conditions  := condition [AND condition]*
        condition   := column_name operator value
        operator    := = | > | < | >= | <= | !=
    """

    def __init__(self, sql: str):
        self.sql = sql.strip()
        self.tokens = self._tokenize(sql)
        self.pos = 0

    def _tokenize(self, sql: str) -> list[str]:
        """
        Simple tokenization by splitting on whitespace and special characters

        This is intentionally simple. A production parser would use a proper lexer.
        """
        # Replace commas and parens with spaces around them
        sql = re.sub(r"([,()])", r" \1 ", sql)

        # Don't split dots - we'll handle table.column qualification in the parser
        # This allows file paths like "/path/to/file.csv" to remain intact

        # Split on whitespace
        tokens = sql.split()

        return tokens

    def current(self) -> str | None:
        """Get current token without advancing"""
        if self.pos < len(self.tokens):
            return self.tokens[self.pos]
        return None

    def peek(self, offset: int = 1) -> str | None:
        """Look ahead at token"""
        pos = self.pos + offset
        if pos < len(self.tokens):
            return self.tokens[pos]
        return None

    def consume(self, expected: str | None = None) -> str:
        """
        Consume and return current token, optionally checking it matches expected

        Args:
            expected: If provided, raises ParseError if current token doesn't match

        Returns:
            The consumed token

        Raises:
            ParseError: If expected token doesn't match or no more tokens
        """
        if self.pos >= len(self.tokens):
            raise ParseError(f"Unexpected end of query. Expected: {expected}")

        token = self.tokens[self.pos]

        if expected and token.upper() != expected.upper():
            raise ParseError(f"Expected '{expected}' but got '{token}' at position {self.pos}")

        self.pos += 1
        return token

    def parse(self) -> SelectStatement:
        """Parse SQL query into AST"""
        return self._parse_select()

    def _parse_select(self) -> SelectStatement:
        """Parse SELECT statement"""
        self.consume("SELECT")

        # Parse columns (may include aggregates)
        columns, aggregates = self._parse_columns()

        # Parse FROM
        self.consume("FROM")
        source = self._parse_table_name()

        # Skip optional table alias (e.g., FROM 'file.csv' AS t or FROM 'file.csv' t)
        if self.current() and self.current().upper() == "AS":
            self.consume("AS")
            self.consume()  # Skip alias name
        elif self.current() and self.current().upper() not in (
            "WHERE",
            "GROUP",
            "ORDER",
            "LIMIT",
            "JOIN",
            "INNER",
            "LEFT",
            "RIGHT",
        ):
            # If next token is not a keyword, it's likely an alias
            self.consume()  # Skip alias name

        # Optional JOIN clause
        join = None
        if self.current() and self.current().upper() in ("INNER", "LEFT", "RIGHT", "JOIN"):
            join = self._parse_join()

        # Optional WHERE clause
        where = None
        if self.current() and self.current().upper() == "WHERE":
            where = self._parse_where()

        # Optional GROUP BY clause
        group_by = None
        if self.current() and self.current().upper() == "GROUP":
            group_by = self._parse_group_by()

        # Optional ORDER BY clause
        order_by = None
        if self.current() and self.current().upper() == "ORDER":
            order_by = self._parse_order_by()

        # Optional LIMIT clause
        limit = None
        if self.current() and self.current().upper() == "LIMIT":
            limit = self._parse_limit()

        return SelectStatement(
            columns=columns,
            source=source,
            where=where,
            group_by=group_by,
            order_by=order_by,
            limit=limit,
            aggregates=aggregates,
            join=join,
        )

    def _parse_columns(self):
        """
        Parse column list, including aggregate functions

        Examples:
            *
            name, age
            COUNT(*), SUM(amount)
            city, COUNT(*) AS count

        Returns:
            Tuple of (columns, aggregates)
        """
        columns = []
        aggregates = []

        # Check for SELECT *
        if self.current() == "*":
            self.consume()
            return ["*"], None

        # Parse comma-separated columns/aggregates
        while True:
            # Check if this is an aggregate function
            if self._is_aggregate_function():
                agg = self._parse_aggregate()
                aggregates.append(agg)
                # Add placeholder column name for aggregate
                col_name = agg.alias if agg.alias else f"{agg.function.lower()}_{agg.column}"
                columns.append(col_name)
            else:
                # Regular column
                column = self.consume()
                columns.append(column)

            # Check for comma (more columns)
            if self.current() == ",":
                self.consume(",")
            else:
                break

        return columns, aggregates if aggregates else None

    def _is_aggregate_function(self) -> bool:
        """Check if current token is start of aggregate function"""
        current = self.current()
        if not current:
            return False
        func = current.upper()
        return func in ("COUNT", "SUM", "AVG", "MIN", "MAX") and self.peek() == "("

    def _parse_aggregate(self) -> AggregateFunction:
        """
        Parse aggregate function

        Examples:
            COUNT(*)
            COUNT(id)
            SUM(amount) AS total
        """
        # Parse function name
        function = self.consume().upper()

        # Parse opening paren
        self.consume("(")

        # Parse column (or *)
        column = self.consume()

        # Parse closing paren
        self.consume(")")

        # Optional AS alias
        alias = None
        if self.current() and self.current().upper() == "AS":
            self.consume("AS")
            alias = self.consume()

        return AggregateFunction(function=function, column=column, alias=alias)

    def _parse_where(self) -> WhereClause:
        """
        Parse WHERE clause

        Example: WHERE age > 25 AND city = 'NYC'
        """
        self.consume("WHERE")

        conditions = []

        # Parse first condition
        conditions.append(self._parse_condition())

        # Parse additional AND conditions
        while self.current() and self.current().upper() == "AND":
            self.consume("AND")
            conditions.append(self._parse_condition())

        return WhereClause(conditions=conditions)

    def _parse_condition(self) -> Condition:
        """
        Parse a single condition: column operator value

        Examples:
            age > 25
            name = 'Alice'
            city != 'NYC'
        """
        column = self.consume()
        operator = self.consume()

        # Parse value (could be number, string, or identifier)
        value_token = self.consume()
        value = self._parse_value(value_token)

        # Validate operator
        valid_operators = {"=", ">", "<", ">=", "<=", "!=", "<>"}
        if operator not in valid_operators:
            raise ParseError(f"Invalid operator: {operator}")

        # Normalize <> to !=
        if operator == "<>":
            operator = "!="

        return Condition(column=column, operator=operator, value=value)

    def _parse_value(self, token: str):
        """
        Parse a value token into appropriate Python type

        Examples:
            '123' -> 123 (int)
            '3.14' -> 3.14 (float)
            "'Alice'" -> 'Alice' (string, quotes removed)
            'Alice' -> 'Alice' (string)
        """
        # Remove quotes if present
        if (token.startswith("'") and token.endswith("'")) or (
            token.startswith('"') and token.endswith('"')
        ):
            return token[1:-1]

        # Try parsing as number
        try:
            # Try int first
            if "." not in token:
                return int(token)
            # Then float
            return float(token)
        except ValueError:
            # Return as string
            return token

    def _parse_table_name(self) -> str:
        """
        Parse a table name (file path), handling quoted and unquoted forms

        Examples:
            'data.csv' -> data.csv (quotes removed)
            "data.csv" -> data.csv (quotes removed)
            '/path/to/file.csv' -> /path/to/file.csv (quotes removed)
            data.csv -> data.csv (no quotes)
            data -> data (table alias without extension)

        Returns:
            Table name/file path with quotes removed if present
        """
        token = self.consume()

        # Remove quotes if present (for file paths with spaces or special chars)
        if (token.startswith("'") and token.endswith("'")) or (
            token.startswith('"') and token.endswith('"')
        ):
            return token[1:-1]

        return token

    def _parse_group_by(self) -> list[str]:
        """
        Parse GROUP BY clause

        Example: GROUP BY city, country
        """
        self.consume("GROUP")
        self.consume("BY")

        columns = []

        # Parse comma-separated column names
        while True:
            column = self.consume()
            columns.append(column)

            # Check for comma (more columns)
            if self.current() == ",":
                self.consume(",")
            else:
                break

        return columns

    def _parse_order_by(self) -> list[OrderByColumn]:
        """
        Parse ORDER BY clause

        Examples:
            ORDER BY name
            ORDER BY age DESC
            ORDER BY city ASC, age DESC
        """
        self.consume("ORDER")
        self.consume("BY")

        order_columns = []

        # Parse comma-separated column specifications
        while True:
            column = self.consume()

            # Check for optional ASC/DESC
            direction = "ASC"  # Default
            if self.current() and self.current().upper() in ("ASC", "DESC"):
                direction = self.consume().upper()

            order_columns.append(OrderByColumn(column=column, direction=direction))

            # Check for comma (more columns)
            if self.current() == ",":
                self.consume(",")
            else:
                break

        return order_columns

    def _parse_limit(self) -> int:
        """Parse LIMIT clause"""
        self.consume("LIMIT")
        limit_str = self.consume()

        try:
            limit = int(limit_str)
            if limit < 0:
                raise ParseError(f"LIMIT must be non-negative, got {limit}")
            return limit
        except ValueError as e:
            raise ParseError(f"LIMIT must be an integer, got '{limit_str}'") from e

    def _parse_join(self) -> JoinClause:
        """
        Parse JOIN clause

        Examples:
            JOIN orders ON customers.id = orders.customer_id
            INNER JOIN orders ON customers.id = orders.customer_id
            LEFT JOIN products ON orders.product_id = products.id
            RIGHT JOIN users ON orders.user_id = users.id
        """
        # Parse join type (INNER/LEFT/RIGHT or just JOIN)
        current = self.current().upper()

        if current in ("INNER", "LEFT", "RIGHT"):
            join_type = self.consume().upper()
            self.consume("JOIN")
        elif current == "JOIN":
            self.consume("JOIN")
            join_type = "INNER"  # Default to INNER JOIN
        else:
            raise ParseError(f"Expected JOIN keyword, got '{self.current()}'")

        # Parse right table name (handle quoted paths)
        right_source = self._parse_table_name()

        # Skip optional table alias (e.g., JOIN 'file.csv' AS t or JOIN 'file.csv' t)
        if self.current() and self.current().upper() == "AS":
            self.consume("AS")
            self.consume()  # Skip alias name
        elif self.current() and self.current().upper() not in (
            "ON",
            "WHERE",
            "GROUP",
            "ORDER",
            "LIMIT",
        ):
            # If next token is not a keyword, it's likely an alias
            self.consume()  # Skip alias name

        # Parse ON keyword
        self.consume("ON")

        # Parse join condition: left.column = right.column
        # Support both qualified (table.column) and unqualified (column) names
        left_col_token = self.consume()

        # Check if it's qualified (contains a dot for table.column)
        if "." in left_col_token:
            # Split on last dot to handle paths like /tmp/file.csv correctly
            # table.column -> extract column
            parts = left_col_token.rsplit(".", 1)
            left_col = parts[1] if len(parts) == 2 else left_col_token
        else:
            left_col = left_col_token

        # Parse equals operator
        if self.current() != "=":
            raise ParseError(f"Expected '=' in JOIN condition, got '{self.current()}'")
        self.consume("=")

        # Parse right column
        right_col_token = self.consume()

        # Check if it's qualified (contains a dot for table.column)
        if "." in right_col_token:
            # Split on last dot to extract column name
            parts = right_col_token.rsplit(".", 1)
            right_col = parts[1] if len(parts) == 2 else right_col_token
        else:
            right_col = right_col_token

        return JoinClause(
            right_source=right_source,
            join_type=join_type,
            on_left=left_col,
            on_right=right_col,
        )

current

current() -> str | None

Get current token without advancing

Source code in sqlstream/sql/parser.py
def current(self) -> str | None:
    """Get current token without advancing"""
    if self.pos < len(self.tokens):
        return self.tokens[self.pos]
    return None

peek

peek(offset: int = 1) -> str | None

Look ahead at token

Source code in sqlstream/sql/parser.py
def peek(self, offset: int = 1) -> str | None:
    """Look ahead at token"""
    pos = self.pos + offset
    if pos < len(self.tokens):
        return self.tokens[pos]
    return None

consume

consume(expected: str | None = None) -> str

Consume and return current token, optionally checking it matches expected

Parameters:

Name Type Description Default
expected str | None

If provided, raises ParseError if current token doesn't match

None

Returns:

Type Description
str

The consumed token

Raises:

Type Description
ParseError

If expected token doesn't match or no more tokens

Source code in sqlstream/sql/parser.py
def consume(self, expected: str | None = None) -> str:
    """
    Consume and return current token, optionally checking it matches expected

    Args:
        expected: If provided, raises ParseError if current token doesn't match

    Returns:
        The consumed token

    Raises:
        ParseError: If expected token doesn't match or no more tokens
    """
    if self.pos >= len(self.tokens):
        raise ParseError(f"Unexpected end of query. Expected: {expected}")

    token = self.tokens[self.pos]

    if expected and token.upper() != expected.upper():
        raise ParseError(f"Expected '{expected}' but got '{token}' at position {self.pos}")

    self.pos += 1
    return token

parse

parse() -> SelectStatement

Parse SQL query into AST

Source code in sqlstream/sql/parser.py
def parse(self) -> SelectStatement:
    """Parse SQL query into AST"""
    return self._parse_select()

parse

parse

parse(sql: str) -> SelectStatement

Convenience function to parse SQL query

Parameters:

Name Type Description Default
sql str

SQL query string

required

Returns:

Type Description
SelectStatement

Parsed SelectStatement AST

Raises:

Type Description
ParseError

If query is invalid

Examples:

>>> ast = parse("SELECT * FROM data")
>>> ast = parse("SELECT name, age FROM users WHERE age > 25 LIMIT 10")
Source code in sqlstream/sql/parser.py
def parse(sql: str) -> SelectStatement:
    """
    Convenience function to parse SQL query

    Args:
        sql: SQL query string

    Returns:
        Parsed SelectStatement AST

    Raises:
        ParseError: If query is invalid

    Examples:
        >>> ast = parse("SELECT * FROM data")
        >>> ast = parse("SELECT name, age FROM users WHERE age > 25 LIMIT 10")
    """
    parser = SQLParser(sql)
    return parser.parse()