fix: improve client-side regex statement parser by olavloite · Pull Request #1328 · googleapis/python-spanner · GitHub | Latest TMZ Celebrity News & Gossip | Watch TMZ Live
Skip to content

fix: improve client-side regex statement parser #1328

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions google/cloud/spanner_dbapi/batch_dml_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,9 +54,12 @@ def execute_statement(self, parsed_statement: ParsedStatement):
"""
from google.cloud.spanner_dbapi import ProgrammingError

# Note: Let the server handle it if the client-side parser did not
# recognize the type of statement.
if (
parsed_statement.statement_type != StatementType.UPDATE
and parsed_statement.statement_type != StatementType.INSERT
and parsed_statement.statement_type != StatementType.UNKNOWN
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This provides a fallback if the statement is not recognized by the client-side parser. Instead of failing directly, we allow it to be sent to Spanner and let the Spanner parser determine whether it is valid or not. This prevents potential new DML syntax in the future from being rejected here (e.g. if we were to support merge statements).

):
raise ProgrammingError("Only DML statements are allowed in batch DML mode.")
self._statements.append(parsed_statement.statement)
Expand Down
16 changes: 8 additions & 8 deletions google/cloud/spanner_dbapi/client_side_statement_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,18 @@
Statement,
)

RE_BEGIN = re.compile(r"^\s*(BEGIN|START)(TRANSACTION)?", re.IGNORECASE)
RE_COMMIT = re.compile(r"^\s*(COMMIT)(TRANSACTION)?", re.IGNORECASE)
RE_ROLLBACK = re.compile(r"^\s*(ROLLBACK)(TRANSACTION)?", re.IGNORECASE)
RE_BEGIN = re.compile(r"^\s*(BEGIN|START)(\s+TRANSACTION)?\s*$", re.IGNORECASE)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The lack of $ at the end of these regular expressions meant that statements like begin whatever string follows does not matter to be accepted as valid statements.

RE_COMMIT = re.compile(r"^\s*(COMMIT)(\s+TRANSACTION)?\s*$", re.IGNORECASE)
RE_ROLLBACK = re.compile(r"^\s*(ROLLBACK)(\s+TRANSACTION)?\s*$", re.IGNORECASE)
RE_SHOW_COMMIT_TIMESTAMP = re.compile(
r"^\s*(SHOW)\s+(VARIABLE)\s+(COMMIT_TIMESTAMP)", re.IGNORECASE
r"^\s*(SHOW)\s+(VARIABLE)\s+(COMMIT_TIMESTAMP)\s*$", re.IGNORECASE
)
RE_SHOW_READ_TIMESTAMP = re.compile(
r"^\s*(SHOW)\s+(VARIABLE)\s+(READ_TIMESTAMP)", re.IGNORECASE
r"^\s*(SHOW)\s+(VARIABLE)\s+(READ_TIMESTAMP)\s*$", re.IGNORECASE
)
RE_START_BATCH_DML = re.compile(r"^\s*(START)\s+(BATCH)\s+(DML)", re.IGNORECASE)
RE_RUN_BATCH = re.compile(r"^\s*(RUN)\s+(BATCH)", re.IGNORECASE)
RE_ABORT_BATCH = re.compile(r"^\s*(ABORT)\s+(BATCH)", re.IGNORECASE)
RE_START_BATCH_DML = re.compile(r"^\s*(START)\s+(BATCH)\s+(DML)\s*$", re.IGNORECASE)
RE_RUN_BATCH = re.compile(r"^\s*(RUN)\s+(BATCH)\s*$", re.IGNORECASE)
RE_ABORT_BATCH = re.compile(r"^\s*(ABORT)\s+(BATCH)\s*$", re.IGNORECASE)
RE_PARTITION_QUERY = re.compile(r"^\s*(PARTITION)\s+(.+)", re.IGNORECASE)
RE_RUN_PARTITION = re.compile(r"^\s*(RUN)\s+(PARTITION)\s+(.+)", re.IGNORECASE)
RE_RUN_PARTITIONED_QUERY = re.compile(
Expand Down
10 changes: 1 addition & 9 deletions google/cloud/spanner_dbapi/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,7 @@
from google.cloud import spanner_v1 as spanner
from google.cloud.spanner_dbapi import partition_helper
from google.cloud.spanner_dbapi.batch_dml_executor import BatchMode, BatchDmlExecutor
from google.cloud.spanner_dbapi.parse_utils import _get_statement_type
from google.cloud.spanner_dbapi.parsed_statement import (
StatementType,
AutocommitDmlMode,
)
from google.cloud.spanner_dbapi.parsed_statement import AutocommitDmlMode
from google.cloud.spanner_dbapi.partition_helper import PartitionId
from google.cloud.spanner_dbapi.parsed_statement import ParsedStatement, Statement
from google.cloud.spanner_dbapi.transaction_helper import TransactionRetryHelper
Expand Down Expand Up @@ -702,10 +698,6 @@ def set_autocommit_dml_mode(
self._autocommit_dml_mode = autocommit_dml_mode

def _partitioned_query_validation(self, partitioned_query, statement):
if _get_statement_type(Statement(partitioned_query)) is not StatementType.QUERY:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Skip validation here and let Spanner return an error if the statement is not valid for PartitionQuery.

raise ProgrammingError(
"Only queries can be partitioned. Invalid statement: " + statement.sql
)
if self.read_only is not True and self._client_transaction_started is True:
raise ProgrammingError(
"Partitioned query is not supported, because the connection is in a read/write transaction."
Expand Down
3 changes: 3 additions & 0 deletions google/cloud/spanner_dbapi/cursor.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,9 +404,12 @@ def executemany(self, operation, seq_of_params):
# For every operation, we've got to ensure that any prior DDL
# statements were run.
self.connection.run_prior_DDL_statements()
# Treat UNKNOWN statements as if they are DML and let the server
# determine what is wrong with it.
if self._parsed_statement.statement_type in (
StatementType.INSERT,
StatementType.UPDATE,
StatementType.UNKNOWN,
):
statements = []
for params in seq_of_params:
Expand Down
22 changes: 16 additions & 6 deletions google/cloud/spanner_dbapi/parse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,25 +155,30 @@
STMT_INSERT = "INSERT"

# Heuristic for identifying statements that don't need to be run as updates.
# TODO: This and the other regexes do not match statements that start with a hint.
RE_NON_UPDATE = re.compile(r"^\W*(SELECT|GRAPH|FROM)", re.IGNORECASE)

RE_WITH = re.compile(r"^\s*(WITH)", re.IGNORECASE)

# DDL statements follow
# https://cloud.google.com/spanner/docs/data-definition-language
RE_DDL = re.compile(
r"^\s*(CREATE|ALTER|DROP|GRANT|REVOKE|RENAME)", re.IGNORECASE | re.DOTALL
r"^\s*(CREATE|ALTER|DROP|GRANT|REVOKE|RENAME|ANALYZE)", re.IGNORECASE | re.DOTALL
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ANALYZE is a valid DDL statement in Spanner.

)

RE_IS_INSERT = re.compile(r"^\s*(INSERT)", re.IGNORECASE | re.DOTALL)
# TODO: These do not match statements that start with a hint.
RE_IS_INSERT = re.compile(r"^\s*(INSERT\s+)", re.IGNORECASE | re.DOTALL)
RE_IS_UPDATE = re.compile(r"^\s*(UPDATE\s+)", re.IGNORECASE | re.DOTALL)
RE_IS_DELETE = re.compile(r"^\s*(DELETE\s+)", re.IGNORECASE | re.DOTALL)

RE_INSERT = re.compile(
# Only match the `INSERT INTO <table_name> (columns...)
# otherwise the rest of the statement could be a complex
# operation.
r"^\s*INSERT INTO (?P<table_name>[^\s\(\)]+)\s*\((?P<columns>[^\(\)]+)\)",
r"^\s*INSERT(?:\s+INTO)?\s+(?P<table_name>[^\s\(\)]+)\s*\((?P<columns>[^\(\)]+)\)",
re.IGNORECASE | re.DOTALL,
)
"""Deprecated: Use the RE_IS_INSERT, RE_IS_UPDATE, and RE_IS_DELETE regexes"""
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a new deprecation, these regular expressions were already 'deprecated' in the sense that:

  1. They were no longer in use by the library
  2. They are not intended for public use


RE_VALUES_TILL_END = re.compile(r"VALUES\s*\(.+$", re.IGNORECASE | re.DOTALL)

Expand Down Expand Up @@ -259,8 +264,13 @@ def _get_statement_type(statement):
# statements and doesn't yet support WITH for DML statements.
return StatementType.QUERY

statement.sql = ensure_where_clause(query)
return StatementType.UPDATE
if RE_IS_UPDATE.match(query) or RE_IS_DELETE.match(query):
# TODO: Remove this? It makes more sense to have this in SQLAlchemy and
# Django than here.
statement.sql = ensure_where_clause(query)
return StatementType.UPDATE

return StatementType.UNKNOWN


def sql_pyformat_args_to_spanner(sql, params):
Expand Down Expand Up @@ -355,7 +365,7 @@ def get_param_types(params):
def ensure_where_clause(sql):
"""
Cloud Spanner requires a WHERE clause on UPDATE and DELETE statements.
Add a dummy WHERE clause if non detected.
Add a dummy WHERE clause if not detected.

:type sql: str
:param sql: SQL code to check.
Expand Down
1 change: 1 addition & 0 deletions google/cloud/spanner_dbapi/parsed_statement.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@


class StatementType(Enum):
UNKNOWN = 0
CLIENT_SIDE = 1
DDL = 2
QUERY = 3
Expand Down
22 changes: 21 additions & 1 deletion tests/unit/spanner_dbapi/test_parse_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,31 @@ def test_classify_stmt(self):
("REVOKE SELECT ON TABLE Singers TO ROLE parent", StatementType.DDL),
("GRANT ROLE parent TO ROLE child", StatementType.DDL),
("INSERT INTO table (col1) VALUES (1)", StatementType.INSERT),
("INSERT table (col1) VALUES (1)", StatementType.INSERT),
("INSERT OR UPDATE table (col1) VALUES (1)", StatementType.INSERT),
("INSERT OR IGNORE table (col1) VALUES (1)", StatementType.INSERT),
("UPDATE table SET col1 = 1 WHERE col1 = NULL", StatementType.UPDATE),
("delete from table WHERE col1 = 2", StatementType.UPDATE),
("delete from table WHERE col1 in (select 1)", StatementType.UPDATE),
("dlete from table where col1 = 2", StatementType.UNKNOWN),
("udpate table set col2=1 where col1 = 2", StatementType.UNKNOWN),
("begin foo", StatementType.UNKNOWN),
("begin transaction foo", StatementType.UNKNOWN),
("commit foo", StatementType.UNKNOWN),
("commit transaction foo", StatementType.UNKNOWN),
("rollback foo", StatementType.UNKNOWN),
("rollback transaction foo", StatementType.UNKNOWN),
("show variable", StatementType.UNKNOWN),
("show variable read_timestamp foo", StatementType.UNKNOWN),
("INSERTs INTO table (col1) VALUES (1)", StatementType.UNKNOWN),
("UPDATEs table SET col1 = 1 WHERE col1 = NULL", StatementType.UNKNOWN),
("DELETEs from table WHERE col1 = 2", StatementType.UNKNOWN),
)

for query, want_class in cases:
self.assertEqual(classify_statement(query).statement_type, want_class)
self.assertEqual(
classify_statement(query).statement_type, want_class, query
)

def test_partition_query_classify_stmt(self):
parsed_statement = classify_statement(
Expand Down
Loading

TMZ Celebrity News – Breaking Stories, Videos & Gossip

Looking for the latest TMZ celebrity news? You've come to the right place. From shocking Hollywood scandals to exclusive videos, TMZ delivers it all in real time.

Whether it’s a red carpet slip-up, a viral paparazzi moment, or a legal drama involving your favorite stars, TMZ news is always first to break the story. Stay in the loop with daily updates, insider tips, and jaw-dropping photos.

🎥 Watch TMZ Live

TMZ Live brings you daily celebrity news and interviews straight from the TMZ newsroom. Don’t miss a beat—watch now and see what’s trending in Hollywood.