Github user davies commented on a diff in the pull request:
https://github.com/apache/spark/pull/7468#discussion_r35072296
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringOperations.scala
---
@@ -876,6 +876,221 @@ case class Encode(value: Expression, charset:
Expression)
}
/**
+ * Replace all substrings of str that match regexp with rep.
+ *
+ * NOTE: this expression is not THREAD-SAFE, we have to create a instance
for each thread.
+ */
+case class RegExpReplace(subject: Expression, regexp: Expression, rep:
Expression)
+ extends Expression with ImplicitCastInputTypes {
+
+ // last regex in string, we will update the pattern iff regexp value
changed.
+ @transient private var lastRegex: UTF8String = _
+ // last regex pattern, we cache it for performance concern
+ @transient private var pattern: Pattern = _
+ // last replacement string, we don't want to convert a UTF8String =>
java.langString every time.
+ @transient private var lastReplacement: String = _
+ @transient private var lastReplacementInUTF8: UTF8String = _
+ // result buffer write by Matcher
+ @transient private val result: StringBuffer = new StringBuffer
+
+ override def nullable: Boolean = subject.nullable || regexp.nullable ||
rep.nullable
+ override def foldable: Boolean = subject.foldable && regexp.foldable &&
rep.foldable
+
+ override def eval(input: InternalRow): Any = {
+ val s = subject.eval(input)
+ if (null != s) {
+ val p = regexp.eval(input)
+ if (null != p) {
+ val r = rep.eval(input)
+ if (null != r) {
+ if (!p.equals(lastRegex)) {
+ // regex value changed
+ lastRegex = p.asInstanceOf[UTF8String]
+ pattern = Pattern.compile(lastRegex.toString)
+ }
+ if (!r.equals(lastReplacementInUTF8)) {
+ // replacement string changed
+ lastReplacementInUTF8 = r.asInstanceOf[UTF8String]
+ lastReplacement = lastReplacementInUTF8.toString
+ }
+ val m = pattern.matcher(s.toString())
+ result.delete(0, result.length())
+
+ while (m.find) {
+ m.appendReplacement(result, lastReplacement)
+ }
+ m.appendTail(result)
+
+ return UTF8String.fromString(result.toString)
+ }
+ }
+ }
+
+ null
+ }
+
+ override def dataType: DataType = StringType
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType,
StringType, StringType)
+ override def children: Seq[Expression] = subject :: regexp :: rep :: Nil
+ override def prettyName: String = "regexp_replace"
+
+ override protected def genCode(ctx: CodeGenContext, ev:
GeneratedExpressionCode): String = {
+ val termLastRegex = ctx.freshName("lastRegex")
+ val termPattern = ctx.freshName("pattern")
+
+ val termLastReplacement = ctx.freshName("lastReplacement")
+ val termLastReplacementInUTF8 = ctx.freshName("lastReplacementInUTF8")
+
+ val termResult = ctx.freshName("result")
+
+ val classNameUTF8String = classOf[UTF8String].getCanonicalName
--- End diff --
nit: `UTF8String` is imported by default, so we can use `UTF8String`
directly here instead of looking up its class name; the same applies to `String`.
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]