This is an automated email from the ASF dual-hosted git repository.
yunyd pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/texera.git
The following commit(s) were added to refs/heads/main by this push:
new d217b8c20e feat(op): add Substring Search operator with case
sensitivity support (#3780)
d217b8c20e is described below
commit d217b8c20e9f99edb93119215fa3b4a47db573ef
Author: yunyad <[email protected]>
AuthorDate: Wed Oct 15 15:28:17 2025 -0700
feat(op): add Substring Search operator with case sensitivity support
(#3780)
This PR introduces a new `Substring Search` operator that performs
substring-based matching on a string column, with optional case
sensitivity.
### Motivation
Unlike the existing `KeywordSearch` operator which relies on Lucene
token-based parsing, this operator performs a raw substring match,
making it more suitable for exact or partial field matches where
tokenization is not desired.
### Changes
- Added `SubstringSearchOpDesc` extending `FilterOpDesc`, with three parameters:
  - `attribute`: target column to search
  - `substring`: query substring
  - `isCaseSensitive`: whether matching is case-sensitive
- Implemented `SubstringSearchOpExec` with string matching logic
- Registered the operator in the `SEARCH_GROUP` group with proper
metadata
- Included an operator icon for UI integration
### Demo

Fix #3107
---
.../org/apache/amber/operator/LogicalOp.scala | 2 +
.../substringSearch/SubstringSearchOpDesc.scala | 77 +++++++++++++++++++++
.../substringSearch/SubstringSearchOpExec.scala | 40 +++++++++++
.../src/assets/operator_images/SubstringSearch.png | Bin 0 -> 507231 bytes
4 files changed, 119 insertions(+)
diff --git
a/common/workflow-operator/src/main/scala/org/apache/amber/operator/LogicalOp.scala
b/common/workflow-operator/src/main/scala/org/apache/amber/operator/LogicalOp.scala
index e00dfd5c72..209ac0e481 100644
---
a/common/workflow-operator/src/main/scala/org/apache/amber/operator/LogicalOp.scala
+++
b/common/workflow-operator/src/main/scala/org/apache/amber/operator/LogicalOp.scala
@@ -79,6 +79,7 @@ import
org.apache.amber.operator.source.sql.asterixdb.AsterixDBSourceOpDesc
import org.apache.amber.operator.source.sql.mysql.MySQLSourceOpDesc
import org.apache.amber.operator.source.sql.postgresql.PostgreSQLSourceOpDesc
import org.apache.amber.operator.split.SplitOpDesc
+import org.apache.amber.operator.substringSearch.SubstringSearchOpDesc
import org.apache.amber.operator.symmetricDifference.SymmetricDifferenceOpDesc
import org.apache.amber.operator.typecasting.TypeCastingOpDesc
import org.apache.amber.operator.udf.java.JavaUDFOpDesc
@@ -171,6 +172,7 @@ trait StateTransferFunc
new Type(value = classOf[ProjectionOpDesc], name = "Projection"),
new Type(value = classOf[UnionOpDesc], name = "Union"),
new Type(value = classOf[KeywordSearchOpDesc], name = "KeywordSearch"),
+ new Type(value = classOf[SubstringSearchOpDesc], name = "SubstringSearch"),
new Type(value = classOf[AggregateOpDesc], name = "Aggregate"),
new Type(value = classOf[LineChartOpDesc], name = "LineChart"),
new Type(value = classOf[WaterfallChartOpDesc], name = "WaterfallChart"),
diff --git
a/common/workflow-operator/src/main/scala/org/apache/amber/operator/substringSearch/SubstringSearchOpDesc.scala
b/common/workflow-operator/src/main/scala/org/apache/amber/operator/substringSearch/SubstringSearchOpDesc.scala
new file mode 100644
index 0000000000..a5b290d237
--- /dev/null
+++
b/common/workflow-operator/src/main/scala/org/apache/amber/operator/substringSearch/SubstringSearchOpDesc.scala
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.amber.operator.substringSearch
+
+import com.fasterxml.jackson.annotation.{JsonProperty, JsonPropertyDescription}
+import com.kjetland.jackson.jsonSchema.annotations.JsonSchemaTitle
+import org.apache.amber.core.executor.OpExecWithClassName
+import org.apache.amber.core.virtualidentity.{ExecutionIdentity,
WorkflowIdentity}
+import org.apache.amber.core.workflow.{InputPort, OutputPort, PhysicalOp}
+import org.apache.amber.operator.filter.FilterOpDesc
+import org.apache.amber.operator.metadata.annotations.AutofillAttributeName
+import org.apache.amber.operator.metadata.{OperatorGroupConstants,
OperatorInfo}
+import org.apache.amber.util.JSONUtils.objectMapper
+
+/**
+ * Logical descriptor for the "Substring Search" operator.
+ *
+ * It carries three user-facing settings (`attribute`, `substring`, `isCaseSensitive`) and
+ * builds a one-to-one physical operator whose executor is referenced by class name and
+ * receives this descriptor serialized as JSON.
+ */
+class SubstringSearchOpDesc extends FilterOpDesc {
+
+  /** Name of the column whose value is searched. */
+  @JsonProperty(required = true)
+  @JsonSchemaTitle("attribute")
+  @JsonPropertyDescription("column to search substring on")
+  @AutofillAttributeName
+  var attribute: String = _
+
+  /** The text to look for inside the attribute value. */
+  @JsonProperty(required = true)
+  @JsonSchemaTitle("Substring")
+  @JsonPropertyDescription("substring")
+  var substring: String = _
+
+  /** When `true` the match respects letter case; defaults to `false`. */
+  @JsonProperty(required = true, defaultValue = "false")
+  @JsonSchemaTitle("Case Sensitive")
+  @JsonPropertyDescription("Whether the substring match is case sensitive.")
+  var isCaseSensitive: Boolean = false
+
+  /**
+   * Builds the physical operator for this descriptor.
+   *
+   * @param workflowId  identity of the enclosing workflow
+   * @param executionId identity of the current execution
+   * @return a one-to-one [[PhysicalOp]] wired with the ports declared in `operatorInfo`
+   */
+  override def getPhysicalOp(
+      workflowId: WorkflowIdentity,
+      executionId: ExecutionIdentity
+  ): PhysicalOp = {
+    // The executor is looked up by its fully qualified class name and is given this
+    // descriptor as a JSON string.
+    val executorSpec = OpExecWithClassName(
+      "org.apache.amber.operator.substringSearch.SubstringSearchOpExec",
+      objectMapper.writeValueAsString(this)
+    )
+    val physicalOp =
+      PhysicalOp.oneToOnePhysicalOp(workflowId, executionId, operatorIdentifier, executorSpec)
+    physicalOp
+      .withInputPorts(operatorInfo.inputPorts)
+      .withOutputPorts(operatorInfo.outputPorts)
+  }
+
+  /**
+   * Operator metadata: display name, description, group, a single input port, a single
+   * output port, and the reconfiguration flag set to `true`.
+   */
+  override def operatorInfo: OperatorInfo = {
+    OperatorInfo(
+      userFriendlyName = "Substring Search",
+      operatorDescription = "Search for Substring(s) in a string column",
+      operatorGroupName = OperatorGroupConstants.SEARCH_GROUP,
+      inputPorts = List(InputPort()),
+      outputPorts = List(OutputPort()),
+      supportReconfiguration = true
+    )
+  }
+}
diff --git
a/common/workflow-operator/src/main/scala/org/apache/amber/operator/substringSearch/SubstringSearchOpExec.scala
b/common/workflow-operator/src/main/scala/org/apache/amber/operator/substringSearch/SubstringSearchOpExec.scala
new file mode 100644
index 0000000000..704854b339
--- /dev/null
+++
b/common/workflow-operator/src/main/scala/org/apache/amber/operator/substringSearch/SubstringSearchOpExec.scala
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.amber.operator.substringSearch
+
+import org.apache.amber.core.tuple.Tuple
+import org.apache.amber.operator.filter.FilterOpExec
+import org.apache.amber.util.JSONUtils.objectMapper
+
+/**
+ * Executor for the Substring Search operator.
+ *
+ * The predicate `findSubstring` is registered through `setFilterFunc` (inherited from
+ * `FilterOpExec`); presumably tuples for which it returns `true` are the ones kept.
+ *
+ * @param descString JSON form of a [[SubstringSearchOpDesc]]; it is deserialized once at
+ *                   construction and supplies `attribute`, `substring` and `isCaseSensitive`.
+ */
+class SubstringSearchOpExec(descString: String) extends FilterOpExec {
+  private val desc: SubstringSearchOpDesc =
+    objectMapper.readValue(descString, classOf[SubstringSearchOpDesc])
+
+  this.setFilterFunc(findSubstring)
+
+  /**
+   * Tests whether the configured attribute of `tuple` contains the configured substring.
+   *
+   * @param tuple the tuple to test
+   * @return `true` if the attribute value, rendered with `toString`, contains
+   *         `desc.substring` (ignoring case unless `desc.isCaseSensitive` is set);
+   *         `false` if it does not, or if the attribute value is `null`
+   */
+  private def findSubstring(tuple: Tuple): Boolean =
+    // Option(...) turns a null field into None, so a missing value simply fails the match
+    // instead of throwing a NullPointerException on `.toString`.
+    Option[Any](tuple.getField(desc.attribute)).map(_.toString).exists { content =>
+      if (desc.isCaseSensitive) {
+        content.contains(desc.substring)
+      } else {
+        // Locale.ROOT keeps case folding independent of the JVM default locale
+        // (e.g. the Turkish dotted/dotless i), so results are the same on every machine.
+        content
+          .toLowerCase(java.util.Locale.ROOT)
+          .contains(desc.substring.toLowerCase(java.util.Locale.ROOT))
+      }
+    }
+}
diff --git a/frontend/src/assets/operator_images/SubstringSearch.png
b/frontend/src/assets/operator_images/SubstringSearch.png
new file mode 100644
index 0000000000..ea571c3956
Binary files /dev/null and
b/frontend/src/assets/operator_images/SubstringSearch.png differ