Repository: spark
Updated Branches:
  refs/heads/branch-1.0 3e95225c7 -> 4bf8ddaee


[SPARK-2395][SQL] Optimize common LIKE patterns.

Author: Michael Armbrust <mich...@databricks.com>

Closes #1325 from marmbrus/slowLike and squashes the following commits:

023c3eb [Michael Armbrust] add comment.
8b421c2 [Michael Armbrust] Handle the case where the final % is actually 
escaped.
d34d37e [Michael Armbrust] add periods.
3bbf35f [Michael Armbrust] Roll back changes to SparkBuild
53894b1 [Michael Armbrust] Fix grammar.
4094462 [Michael Armbrust] Fix grammar.
6d3d0a0 [Michael Armbrust] Optimize common LIKE patterns.

(cherry picked from commit cc3e0a14daf756ff5c2d4e7916438e175046e5bb)
Signed-off-by: Michael Armbrust <mich...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4bf8ddae
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4bf8ddae
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4bf8ddae

Branch: refs/heads/branch-1.0
Commit: 4bf8ddaeefb5e078bfd163d3a6041b9c53b929df
Parents: 3e95225
Author: Michael Armbrust <mich...@databricks.com>
Authored: Tue Jul 8 10:36:18 2014 -0700
Committer: Michael Armbrust <mich...@databricks.com>
Committed: Tue Jul 8 10:38:30 2014 -0700

----------------------------------------------------------------------
 .../catalyst/expressions/stringOperations.scala | 51 ++++++++++++++++++++
 .../sql/catalyst/optimizer/Optimizer.scala      | 23 +++++++++
 2 files changed, 74 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/4bf8ddae/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
index c074b7b..347471c 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
@@ -156,3 +156,54 @@ case class Lower(child: Expression) extends 
UnaryExpression with CaseConversionE
 
   override def toString() = s"Lower($child)"
 }
+
+/** A base class for functions that compare two strings, returning a boolean. 
*/
+abstract class StringComparison extends Expression {
+  self: Product =>
+
+  type EvaluatedType = Any
+
+  def left: Expression
+  def right: Expression
+
+  override def references = children.flatMap(_.references).toSet
+  override def children = left :: right :: Nil
+
+  override def nullable: Boolean = true
+  override def dataType: DataType = BooleanType
+
+  def compare(l: String, r: String): Boolean
+
+  override def eval(input: Row): Any = {
+    val leftEval = left.eval(input).asInstanceOf[String]
+    if(leftEval == null) {
+      null
+    } else {
+      val rightEval = right.eval(input).asInstanceOf[String]
+      if (rightEval == null) null else compare(leftEval, rightEval)
+    }
+  }
+
+  override def toString() = s"$nodeName($left, $right)"
+}
+
+/**
+ * A function that returns true if the string `left` contains the string 
`right`.
+ */
+case class Contains(left: Expression, right: Expression) extends 
StringComparison {
+  override def compare(l: String, r: String) = l.contains(r)
+}
+
+/**
+ * A function that returns true if the string `left` starts with the string 
`right`.
+ */
+case class StartsWith(left: Expression, right: Expression) extends 
StringComparison {
+  def compare(l: String, r: String) = l.startsWith(r)
+}
+
+/**
+ * A function that returns true if the string `left` ends with the string 
`right`.
+ */
+case class EndsWith(left: Expression, right: Expression) extends 
StringComparison {
+  def compare(l: String, r: String) = l.endsWith(r)
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/4bf8ddae/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
----------------------------------------------------------------------
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index 5bc4784..17c80b6 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -34,6 +34,7 @@ object Optimizer extends RuleExecutor[LogicalPlan] {
     Batch("ConstantFolding", FixedPoint(100),
       NullPropagation,
       ConstantFolding,
+      LikeSimplification,
       BooleanSimplification,
       SimplifyFilters,
       SimplifyCasts,
@@ -112,6 +113,28 @@ object ColumnPruning extends Rule[LogicalPlan] {
 }
 
 /**
+ * Simplifies LIKE expressions that do not need full regular expressions to 
evaluate the condition.
+ * For example, when the expression is just checking to see if a string starts 
with a given
+ * pattern.
+ */
+object LikeSimplification extends Rule[LogicalPlan] {
+  // if guards below protect from escapes on trailing %.
+  // Cases like "something\%" are not optimized, but this does not affect 
correctness.
+  val startsWith = "([^_%]+)%".r
+  val endsWith = "%([^_%]+)".r
+  val contains = "%([^_%]+)%".r
+
+  def apply(plan: LogicalPlan): LogicalPlan = plan transformAllExpressions {
+    case Like(l, Literal(startsWith(pattern), StringType)) if 
!pattern.endsWith("\\") =>
+      StartsWith(l, Literal(pattern))
+    case Like(l, Literal(endsWith(pattern), StringType)) =>
+      EndsWith(l, Literal(pattern))
+    case Like(l, Literal(contains(pattern), StringType)) if 
!pattern.endsWith("\\") =>
+      Contains(l, Literal(pattern))
+  }
+}
+
+/**
  * Replaces [[Expression Expressions]] that can be statically evaluated with
  * equivalent [[Literal]] values. This rule is more specific with
  * Null value propagation from bottom to top of the expression tree.

Reply via email to