This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new 1c730ec feat: Support InSet expression in Comet (#59)
1c730ec is described below
commit 1c730ec34b04f8b26319525d99aea9bb7e6d877f
Author: Liang-Chi Hsieh <[email protected]>
AuthorDate: Tue Feb 20 16:33:44 2024 -0800
feat: Support InSet expression in Comet (#59)
---
.../org/apache/comet/serde/QueryPlanSerde.scala | 9 ++++++
.../org/apache/comet/CometExpressionSuite.scala | 34 ++++++++++++----------
2 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 15a26a0..a497a44 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -1377,6 +1377,15 @@ object QueryPlanSerde extends Logging with ShimQueryPlanSerde {
case In(value, list) =>
in(value, list, inputs, false)
+ case InSet(value, hset) =>
+ val valueDataType = value.dataType
+ val list = hset.map { setVal =>
+ Literal(setVal, valueDataType)
+ }.toSeq
+ // Change `InSet` to `In` expression
+ // We do Spark `InSet` optimization in native (DataFusion) side.
+ in(value, list, inputs, false)
+
case Not(In(value, list)) =>
in(value, list, inputs, true)
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
index 5ead490..df8bc7c 100644
--- a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
+++ b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
@@ -996,23 +996,27 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
}
}
- test("test in/not in") {
- Seq(false, true).foreach { dictionary =>
- withSQLConf("parquet.enable.dictionary" -> dictionary.toString) {
- val table = "names"
- withTable(table) {
- sql(s"create table $table(id int, name varchar(20)) using parquet")
- sql(
- s"insert into $table values(1, 'James'), (1, 'Jones'), (2, 'Smith'), (3, 'Smith')," +
- "(NULL, 'Jones'), (4, NULL)")
+ test("test in(set)/not in(set)") {
+ Seq("100", "0").foreach { inSetThreshold =>
+ Seq(false, true).foreach { dictionary =>
+ withSQLConf(
+ SQLConf.OPTIMIZER_INSET_CONVERSION_THRESHOLD.key -> inSetThreshold,
+ "parquet.enable.dictionary" -> dictionary.toString) {
+ val table = "names"
+ withTable(table) {
+ sql(s"create table $table(id int, name varchar(20)) using parquet")
+ sql(
+ s"insert into $table values(1, 'James'), (1, 'Jones'), (2, 'Smith'), (3, 'Smith')," +
+ "(NULL, 'Jones'), (4, NULL)")
- checkSparkAnswerAndOperator(s"SELECT * FROM $table WHERE id in (1, 2, 4, NULL)")
- checkSparkAnswerAndOperator(
- s"SELECT * FROM $table WHERE name in ('Smith', 'Brown', NULL)")
+ checkSparkAnswerAndOperator(s"SELECT * FROM $table WHERE id in (1, 2, 4, NULL)")
+ checkSparkAnswerAndOperator(
+ s"SELECT * FROM $table WHERE name in ('Smith', 'Brown', NULL)")
- // TODO: why with not in, the plan is only `LocalTableScan`?
- checkSparkAnswer(s"SELECT * FROM $table WHERE id not in (1)")
- checkSparkAnswer(s"SELECT * FROM $table WHERE name not in ('Smith', 'Brown', NULL)")
+ // TODO: why with not in, the plan is only `LocalTableScan`?
+ checkSparkAnswer(s"SELECT * FROM $table WHERE id not in (1)")
+ checkSparkAnswer(s"SELECT * FROM $table WHERE name not in ('Smith', 'Brown', NULL)")
+ }
}
}
}