This is an automated email from the ASF dual-hosted git repository.

andygrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion-comet.git


The following commit(s) were added to refs/heads/main by this push:
     new 26b0e8510 feat: add native support for substring_index expression 
(#4286)
26b0e8510 is described below

commit 26b0e8510d82f22cbad0dabc275509e741bec617
Author: Andy Grove <[email protected]>
AuthorDate: Tue May 12 10:58:06 2026 -0600

    feat: add native support for substring_index expression (#4286)
---
 .../contributor-guide/spark_expressions_support.md |   2 +-
 .../org/apache/comet/serde/QueryPlanSerde.scala    |   1 +
 .../scala/org/apache/comet/serde/strings.scala     |  18 +++-
 .../expressions/string/substring_index.sql         | 119 +++++++++++++++++++++
 4 files changed, 138 insertions(+), 2 deletions(-)

diff --git a/docs/source/contributor-guide/spark_expressions_support.md 
b/docs/source/contributor-guide/spark_expressions_support.md
index 588ae5b45..1250beee1 100644
--- a/docs/source/contributor-guide/spark_expressions_support.md
+++ b/docs/source/contributor-guide/spark_expressions_support.md
@@ -563,7 +563,7 @@
 - [x] startswith
 - [x] substr
 - [x] substring
-- [ ] substring_index
+- [x] substring_index
 - [ ] to_binary
 - [ ] to_char
 - [ ] to_number
diff --git a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala 
b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
index 2d138450e..1fa143fd8 100644
--- a/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/QueryPlanSerde.scala
@@ -198,6 +198,7 @@ object QueryPlanSerde extends Logging with CometExprShim 
with CometTypeShim {
       classOf[Left] -> CometLeft,
       classOf[Right] -> CometRight,
       classOf[Substring] -> CometSubstring,
+      classOf[SubstringIndex] -> CometSubstringIndex,
       classOf[Upper] -> CometUpper)
 
   private val bitwiseExpressions: Map[Class[_ <: Expression], 
CometExpressionSerde[_]] = Map(
diff --git a/spark/src/main/scala/org/apache/comet/serde/strings.scala 
b/spark/src/main/scala/org/apache/comet/serde/strings.scala
index 968fe8cd6..aec4b1911 100644
--- a/spark/src/main/scala/org/apache/comet/serde/strings.scala
+++ b/spark/src/main/scala/org/apache/comet/serde/strings.scala
@@ -21,7 +21,7 @@ package org.apache.comet.serde
 
 import java.util.Locale
 
-import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, 
ConcatWs, Expression, GetJsonObject, If, InitCap, IsNull, Left, Length, Like, 
Literal, Lower, RegExpReplace, Right, RLike, StringLPad, StringRepeat, 
StringRPad, StringSplit, Substring, Upper}
+import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Concat, 
ConcatWs, Expression, GetJsonObject, If, InitCap, IsNull, Left, Length, Like, 
Literal, Lower, RegExpReplace, Right, RLike, StringLPad, StringRepeat, 
StringRPad, StringSplit, Substring, SubstringIndex, Upper}
 import org.apache.spark.sql.types.{BinaryType, DataTypes, LongType, StringType}
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -129,6 +129,22 @@ object CometSubstring extends 
CometExpressionSerde[Substring] {
   }
 }
 
+object CometSubstringIndex extends CometExpressionSerde[SubstringIndex] {
+  // Serializes Spark's SubstringIndex as a "substring_index" scalar function call.
+  override def convert(
+      expr: SubstringIndex,
+      inputs: Seq[Attribute],
+      binding: Boolean): Option[ExprOuterClass.Expr] = {
+    val strExpr = exprToProtoInternal(expr.strExpr, inputs, binding)
+    val delimExpr = exprToProtoInternal(expr.delimExpr, inputs, binding)
+    val countCast = Cast(expr.countExpr, LongType) // count is wrapped in a cast to LongType
+    val countExpr = exprToProtoInternal(countCast, inputs, binding) // NOTE(review): presumably the native function expects a 64-bit count - confirm
+    val optExpr =
+      scalarFunctionExprToProto("substring_index", strExpr, delimExpr, 
countExpr)
+    optExprWithInfo(optExpr, expr, expr.strExpr, expr.delimExpr, 
expr.countExpr) // passes the original (un-cast) children; NOTE(review): presumably for fallback reporting - confirm
+  }
+}
+
 object CometLeft extends CometExpressionSerde[Left] {
 
   override def getUnsupportedReasons(): Seq[String] = Seq(
diff --git 
a/spark/src/test/resources/sql-tests/expressions/string/substring_index.sql 
b/spark/src/test/resources/sql-tests/expressions/string/substring_index.sql
new file mode 100644
index 000000000..843e23fb1
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/string/substring_index.sql
@@ -0,0 +1,119 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- ConfigMatrix: parquet.enable.dictionary=false,true
+
+statement
+CREATE TABLE test_substring_index(s string, delim string, cnt int) USING 
parquet
+
+statement
+INSERT INTO test_substring_index VALUES
+  ('www.apache.org', '.', 1),
+  ('www.apache.org', '.', 2),
+  ('www.apache.org', '.', 3),
+  ('www.apache.org', '.', -1),
+  ('www.apache.org', '.', -2),
+  ('www.apache.org', '.', -3),
+  ('www.apache.org', '.', 0),
+  ('hello', '.', 1),
+  ('', '.', 1),
+  ('www.apache.org', '', 1),
+  (NULL, '.', 1),
+  ('www.apache.org', NULL, 1),
+  ('www.apache.org', '.', NULL)
+
+-- all columns
+query
+SELECT substring_index(s, delim, cnt) FROM test_substring_index
+
+-- literal arguments
+query
+SELECT substring_index('www.apache.org', '.', 1),
+       substring_index('www.apache.org', '.', 2),
+       substring_index('www.apache.org', '.', -1),
+       substring_index('www.apache.org', '.', -2),
+       substring_index('www.apache.org', '.', 0)
+
+-- NULL literal arguments
+query
+SELECT substring_index(NULL, '.', 1),
+       substring_index('www.apache.org', NULL, 1),
+       substring_index('www.apache.org', '.', NULL)
+
+-- column string, literal delimiter and count
+query
+SELECT substring_index(s, '.', 1) FROM test_substring_index
+
+-- literal string, column delimiter and count
+query
+SELECT substring_index('www.apache.org', delim, cnt) FROM test_substring_index
+
+-- count exceeds number of delimiters (returns full string)
+query
+SELECT substring_index('www.apache.org', '.', 10),
+       substring_index('www.apache.org', '.', -10)
+
+-- multi-character delimiter
+query
+SELECT substring_index('one::two::three', '::', 1),
+       substring_index('one::two::three', '::', 2),
+       substring_index('one::two::three', '::', -1),
+       substring_index('one::two::three', '::', -2)
+
+-- delimiter not found
+query
+SELECT substring_index('hello world', 'xyz', 1),
+       substring_index('hello world', 'xyz', -1)
+
+-- empty string input
+query
+SELECT substring_index('', '.', 1),
+       substring_index('', '.', -1)
+
+-- empty delimiter
+query
+SELECT substring_index('www.apache.org', '', 1),
+       substring_index('www.apache.org', '', -1)
+
+-- multibyte UTF-8 characters (the first column is an ASCII-only baseline)
+query
+SELECT substring_index('a.b.c', '.', 2),
+       substring_index('中文.测试.数据', '.', 1),
+       substring_index('中文.测试.数据', '.', -1),
+       substring_index('中文.测试.数据', '.', 2)
+
+-- delimiter at start of string
+query
+SELECT substring_index('.www.apache.org', '.', 1),
+       substring_index('.www.apache.org', '.', 2),
+       substring_index('.www.apache.org', '.', -1)
+
+-- delimiter at end of string
+query
+SELECT substring_index('www.apache.org.', '.', -1),
+       substring_index('www.apache.org.', '.', 3),
+       substring_index('www.apache.org.', '.', -2)
+
+-- delimiter equals the full string
+query
+SELECT substring_index('abc', 'abc', 1),
+       substring_index('abc', 'abc', -1)
+
+-- large count values (32-bit signed int maximum and its negation)
+query
+SELECT substring_index('www.apache.org', '.', 2147483647),
+       substring_index('www.apache.org', '.', -2147483647)


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to