plusplusjiajia commented on code in PR #7389: URL: https://github.com/apache/paimon/pull/7389#discussion_r2909492766
########## paimon-python/pypaimon/cli/where_parser.py: ########## @@ -0,0 +1,359 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" +SQL WHERE clause parser for Paimon CLI. + +Parses simple SQL-like WHERE expressions into Predicate objects. + +Supported operators: + =, !=, <>, <, <=, >, >=, + IS NULL, IS NOT NULL, + IN (...), NOT IN (...), + BETWEEN ... AND ..., + LIKE '...' + +Supported connectors: AND, OR (AND has higher precedence than OR). +Parenthesized grouping is supported. + +Examples: + "age > 18" + "name = 'Alice' AND age >= 20" + "status IN ('active', 'pending')" + "score BETWEEN 60 AND 100" + "name LIKE 'A%'" + "deleted_at IS NULL" + "age > 18 OR (name = 'Bob' AND status = 'active')" +""" + +import re +from typing import Any, Dict, List, Optional + +from pypaimon.common.predicate import Predicate +from pypaimon.common.predicate_builder import PredicateBuilder +from pypaimon.schema.data_types import AtomicType, DataField + + +def extract_fields_from_where(where_string: str, available_fields: set) -> set: + """Extract all field names referenced in a WHERE clause. + + Args: + where_string: The WHERE clause string. + available_fields: Set of valid field names from the table schema. 
+ + Returns: + A set of field names referenced in the WHERE clause. + """ + if not where_string or not where_string.strip(): + return set() + + tokens = _tokenize(where_string.strip()) + referenced_fields = set() + for token in tokens: + if token in available_fields: + referenced_fields.add(token) + return referenced_fields + + +def parse_where_clause(where_string: str, fields: List[DataField]) -> Optional[Predicate]: + """Parse a SQL-like WHERE clause string into a Predicate. + + Args: + where_string: The WHERE clause string (without the 'WHERE' keyword). + fields: The table schema fields for type resolution. + + Returns: + A Predicate object, or None if the string is empty. + + Raises: + ValueError: If the WHERE clause cannot be parsed. + """ + where_string = where_string.strip() + if not where_string: + return None + + field_type_map = _build_field_type_map(fields) + predicate_builder = PredicateBuilder(fields) + tokens = _tokenize(where_string) + predicate, remaining = _parse_or_expression(tokens, predicate_builder, field_type_map) + + if remaining: + raise ValueError( + f"Unexpected tokens after parsing: {' '.join(remaining)}" + ) + + return predicate + + +def _build_field_type_map(fields: List[DataField]) -> Dict[str, str]: + """Build a mapping from field name to its base type string.""" + result = {} + for field in fields: + if isinstance(field.type, AtomicType): + result[field.name] = field.type.type.upper() + else: + result[field.name] = str(field.type).upper() + return result + + +def _cast_literal(value_str: str, type_name: str) -> Any: + """Cast a literal string to the appropriate Python type based on the field type.""" + integer_types = {'TINYINT', 'SMALLINT', 'INT', 'INTEGER', 'BIGINT'} + float_types = {'FLOAT', 'DOUBLE'} + + base_type = type_name.split('(')[0].strip() + + if base_type in integer_types: + return int(value_str) + if base_type in float_types: + return float(value_str) + if base_type.startswith('DECIMAL') or base_type in ('DECIMAL', 
'NUMERIC', 'DEC'): + return float(value_str) + if base_type == 'BOOLEAN': + return value_str.lower() in ('true', '1', 'yes') + return value_str + + +_TOKEN_PATTERN = re.compile( + r""" + '(?:[^'\\]|\\.)*' # single-quoted string + | "(?:[^"\\]|\\.)*" # double-quoted string + | <= # <= + | >= # >= + | <> # <> + | != # != + | [=<>] # single-char operators + | [(),] # punctuation + | [^\s,()=<>!'"]+ # unquoted word / number + """, + re.VERBOSE, +) + + +def _tokenize(expression: str) -> List[str]: + """Tokenize a WHERE clause string.""" + return _TOKEN_PATTERN.findall(expression) + + +def _parse_or_expression( + tokens: List[str], + builder: PredicateBuilder, + type_map: Dict[str, str], +) -> (Predicate, List[str]): + """Parse an OR expression (lowest precedence).""" + left, tokens = _parse_and_expression(tokens, builder, type_map) + or_operands = [left] + + while tokens and tokens[0].upper() == 'OR': + tokens = tokens[1:] # consume 'OR' + right, tokens = _parse_and_expression(tokens, builder, type_map) + or_operands.append(right) + + if len(or_operands) == 1: + return or_operands[0], tokens + return PredicateBuilder.or_predicates(or_operands), tokens + + +def _parse_and_expression( + tokens: List[str], + builder: PredicateBuilder, + type_map: Dict[str, str], +) -> (Predicate, List[str]): + """Parse an AND expression.""" + left, tokens = _parse_primary(tokens, builder, type_map) + and_operands = [left] + + while tokens and tokens[0].upper() == 'AND': + # Distinguish 'AND' as connector vs. 'AND' in 'BETWEEN ... AND ...' + # BETWEEN's AND is consumed inside _parse_primary, so here it's always a connector. 
+ tokens = tokens[1:] # consume 'AND' + right, tokens = _parse_primary(tokens, builder, type_map) + and_operands.append(right) + + if len(and_operands) == 1: + return and_operands[0], tokens + return PredicateBuilder.and_predicates(and_operands), tokens + + +def _parse_primary( + tokens: List[str], + builder: PredicateBuilder, + type_map: Dict[str, str], +) -> (Predicate, List[str]): + """Parse a primary expression: a single condition or a parenthesized group.""" + if not tokens: + raise ValueError("Unexpected end of WHERE clause") + + # Parenthesized group + if tokens[0] == '(': + tokens = tokens[1:] # consume '(' + predicate, tokens = _parse_or_expression(tokens, builder, type_map) + if not tokens or tokens[0] != ')': + raise ValueError("Missing closing parenthesis ')'") + tokens = tokens[1:] # consume ')' + return predicate, tokens + + # Must be a condition starting with a field name + field_name = tokens[0] + tokens = tokens[1:] + + if not tokens: + raise ValueError(f"Unexpected end after field name '{field_name}'") + + field_type = type_map.get(field_name, 'STRING') Review Comment: Suggestion: reject fields of non-atomic type with a clear error. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
