xiangfu0 commented on code in PR #18261: URL: https://github.com/apache/pinot/pull/18261#discussion_r3330948880
########## pinot-connectors/pinot-spark-4-connector/src/main/scala/org/apache/pinot/connector/spark/v4/datasource/query/FilterPushDown.scala: ########## @@ -0,0 +1,127 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.connector.spark.v4.datasource.query + +import java.sql.{Date, Timestamp} + +import org.apache.spark.sql.sources._ + +/** + * Helper methods to find valid filters, and convert spark filters to SQL where clause. + */ +private[pinot] object FilterPushDown { + + /** + * Create SQL 'where clause' from Spark filters. + * + * @param filters Supported spark filters + * @return where clause, or None if filters does not exists + */ + def compileFiltersToSqlWhereClause(filters: Array[Filter]): Option[String] = { + if (filters.isEmpty) { + None + } else { + Option(filters.flatMap(compileFilter).map(filter => s"($filter)").mkString(" AND ")) + } + } + + /** + * Accept only filters that supported in SQL. + * + * @param filters Spark filters that contains valid and/or invalid filters + * @return Supported and unsupported filters + */ + def acceptFilters(filters: Array[Filter]): (Array[Filter], Array[Filter]) = { + filters.partition(isFilterSupported) + } + + private def isFilterSupported(filter: Filter): Boolean = filter match { + case _: EqualTo => true + case _: EqualNullSafe => true + case _: In => true + case _: LessThan => true + case _: LessThanOrEqual => true + case _: GreaterThan => true + case _: GreaterThanOrEqual => true + case _: IsNull => true + case _: IsNotNull => true + case _: StringStartsWith => true + case _: StringEndsWith => true + case _: StringContains => true + case _: Not => true Review Comment: Addressed. `isFilterSupported` is now recursive — `And` / `Or` / `Not` are accepted only when every child is supported ([FilterPushDown.scala:100–104](https://github.com/apache/pinot/blob/claude/musing-neumann-2a8e62/pinot-connectors/pinot-spark-4-connector/src/main/scala/org/apache/pinot/connector/spark/v4/datasource/query/FilterPushDown.scala#L100-L104)). Regression tests pin both directions: `Compound filter with unsupported child should be rejected by acceptFilters` and `Null-leaf rejection propagates through enclosing And/Or/Not compounds`. ########## pinot-connectors/pinot-spark-4-connector/src/main/scala/org/apache/pinot/connector/spark/v4/datasource/query/FilterPushDown.scala: ########## @@ -0,0 +1,127 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.connector.spark.v4.datasource.query + +import java.sql.{Date, Timestamp} + +import org.apache.spark.sql.sources._ + +/** + * Helper methods to find valid filters, and convert spark filters to SQL where clause. + */ +private[pinot] object FilterPushDown { + + /** + * Create SQL 'where clause' from Spark filters. + * + * @param filters Supported spark filters + * @return where clause, or None if filters does not exists + */ + def compileFiltersToSqlWhereClause(filters: Array[Filter]): Option[String] = { + if (filters.isEmpty) { + None + } else { + Option(filters.flatMap(compileFilter).map(filter => s"($filter)").mkString(" AND ")) + } + } + + /** + * Accept only filters that supported in SQL. + * + * @param filters Spark filters that contains valid and/or invalid filters + * @return Supported and unsupported filters + */ + def acceptFilters(filters: Array[Filter]): (Array[Filter], Array[Filter]) = { + filters.partition(isFilterSupported) + } + + private def isFilterSupported(filter: Filter): Boolean = filter match { + case _: EqualTo => true + case _: EqualNullSafe => true + case _: In => true + case _: LessThan => true + case _: LessThanOrEqual => true + case _: GreaterThan => true + case _: GreaterThanOrEqual => true + case _: IsNull => true + case _: IsNotNull => true + case _: StringStartsWith => true + case _: StringEndsWith => true + case _: StringContains => true + case _: Not => true + case _: Or => true + case _: And => true + case _ => false + } + + private def escapeSql(value: String): String = + if (value == null) null else value.replace("'", "''") + + private def compileValue(value: Any): Any = value match { + case stringValue: String => s"'${escapeSql(stringValue)}'" + case timestampValue: Timestamp => "'" + timestampValue + "'" + case dateValue: Date => "'" + dateValue + "'" + case arrayValue: Array[Any] => arrayValue.map(compileValue).mkString(", ") + case _ => value + } + + private def escapeAttr(attr: String): String = { + if (attr.contains("\"")) attr else s""""$attr"""" + } + + private def compileFilter(filter: Filter): Option[String] = { + val whereCondition = filter match { + case EqualTo(attr, value) => s"${escapeAttr(attr)} = ${compileValue(value)}" + case EqualNullSafe(attr, value) => + s"NOT (${escapeAttr(attr)} != ${compileValue(value)} OR ${escapeAttr(attr)} IS NULL OR " + + s"${compileValue(value)} IS NULL) OR " + + s"(${escapeAttr(attr)} IS NULL AND ${compileValue(value)} IS NULL)" + case LessThan(attr, value) => s"${escapeAttr(attr)} < ${compileValue(value)}" + case GreaterThan(attr, value) => s"${escapeAttr(attr)} > ${compileValue(value)}" + case LessThanOrEqual(attr, value) => s"${escapeAttr(attr)} <= ${compileValue(value)}" + case GreaterThanOrEqual(attr, value) => s"${escapeAttr(attr)} >= ${compileValue(value)}" + case IsNull(attr) => s"${escapeAttr(attr)} IS NULL" + case IsNotNull(attr) => s"${escapeAttr(attr)} IS NOT NULL" + case StringStartsWith(attr, value) => s"${escapeAttr(attr)} LIKE '$value%'" Review Comment: Addressed. `StringStartsWith` / `StringEndsWith` / `StringContains` now route through `escapeLikeLiteral` (escapes `%`, `_`, `\`, `'`) and emit `LIKE … ESCAPE '\'` ([FilterPushDown.scala:188–192](https://github.com/apache/pinot/blob/claude/musing-neumann-2a8e62/pinot-connectors/pinot-spark-4-connector/src/main/scala/org/apache/pinot/connector/spark/v4/datasource/query/FilterPushDown.scala#L188-L192)). Values containing a literal backslash are rejected at `isFilterSupported` because Pinot's `RegexpPatternConverterUtils#likeToRegexpLike` does not round-trip `\\` correctly. A cross-module regression test runs the emitted SQL through `likeToRegexpLike` and verifies the resulting regex matches the original literal via `Matcher#find`. ########## pinot-connectors/pinot-spark-4-connector/src/main/scala/org/apache/pinot/connector/spark/v4/datasource/PinotWriteBuilder.scala: ########## @@ -0,0 +1,44 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.pinot.connector.spark.v4.datasource + +import org.apache.spark.sql.connector.expressions.filter.Predicate +import org.apache.spark.sql.connector.write.{LogicalWriteInfo, SupportsOverwriteV2, Write, WriteBuilder} + +/** + * Spark 4 write builder. Uses SupportsOverwriteV2 (Predicate-based) — the V1 SupportsOverwrite + * (Filter-based) is deprecated in Spark 4 and slated for removal in a future release. + * Predicates are not currently consulted; writes are unconditional. The parameter is retained + * to record the caller's intent for potential future use. + */ +class PinotWriteBuilder( + predicates: Array[Predicate], + logicalWriteInfo: LogicalWriteInfo, + ) + extends WriteBuilder with SupportsOverwriteV2 { + + override def build(): Write = { + // TODO: utilize predicates + new PinotWrite(logicalWriteInfo) Review Comment: Addressed via explicit reject. `overwrite(predicates)`, `truncate()`, and `canOverwrite(predicates)` all reject — `overwrite` and `truncate` throw `UnsupportedOperationException` with messages pointing at the supported alternatives, and `canOverwrite` returns `false` so Spark's V2Writes analyzer surfaces the rejection earlier ([PinotWriteBuilder.scala:60–80](https://github.com/apache/pinot/blob/claude/musing-neumann-2a8e62/pinot-connectors/pinot-spark-4-connector/src/main/scala/org/apache/pinot/connector/spark/v4/datasource/PinotWriteBuilder.scala#L60-L80)). `truncate()` is load-bearing because Spark 4's V2Writes lowers `df.write.mode("overwrite")` to `truncate()` rather than `overwrite([AlwaysTrue])`. Regression tests pin all three rejection paths. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
