[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/21010 --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r186606198 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2108,35 +2133,57 @@ case class FormatNumber(x: Expression, d: Expression) // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. val usLocale = "US" - val i = ctx.freshName("i") - val dFormat = ctx.freshName("dFormat") - val lastDValue = -ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") - val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") val numberFormat = ctx.addMutableState(df, "numberFormat", v => s"""$v = new $df("", new $dfs($l.$usLocale));""") - s""" -if ($d >= 0) { - $pattern.delete(0, $pattern.length()); - if ($d != $lastDValue) { -$pattern.append("#,###,###,###,###,###,##0"); - -if ($d > 0) { - $pattern.append("."); - for (int $i = 0; $i < $d; $i++) { -$pattern.append("0"); + right.dataType match { +case IntegerType => + val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") + val i = ctx.freshName("i") + val lastDValue = +ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") + s""" +if ($d >= 0) { + $pattern.delete(0, $pattern.length()); + if ($d != $lastDValue) { +$pattern.append("$defaultFormat"); + +if ($d > 0) { + $pattern.append("."); + for (int $i = 0; $i < $d; $i++) { +$pattern.append("0"); + } +} +$lastDValue = $d; +$numberFormat.applyLocalizedPattern($pattern.toString()); } + ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); +} else { + ${ev.value} = null; + ${ev.isNull} = true; } -$lastDValue = $d; -$numberFormat.applyLocalizedPattern($pattern.toString()); - } - ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); -} else { - ${ev.value} = null; - ${ev.isNull} = true; -} - """ + """ +case StringType => + val lastDValue = ctx.addMutableState("String", "lastDValue", v => s"""$v = null;""") + val dValue = ctx.addMutableState("String", "dValue") + s""" +$dValue = $d.toString(); +if (!$dValue.equals($lastDValue)) { + $lastDValue = $dValue; + if ($dValue.isEmpty()) { +$numberFormat.applyLocalizedPattern("$defaultFormat"); + } else { +$numberFormat.applyLocalizedPattern($dValue); + } +} +${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); + """ +case NullType => --- End diff -- We don't need this case? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r186606172 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2108,35 +2133,57 @@ case class FormatNumber(x: Expression, d: Expression) // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. val usLocale = "US" - val i = ctx.freshName("i") - val dFormat = ctx.freshName("dFormat") - val lastDValue = -ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") - val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") val numberFormat = ctx.addMutableState(df, "numberFormat", v => s"""$v = new $df("", new $dfs($l.$usLocale));""") - s""" -if ($d >= 0) { - $pattern.delete(0, $pattern.length()); - if ($d != $lastDValue) { -$pattern.append("#,###,###,###,###,###,##0"); - -if ($d > 0) { - $pattern.append("."); - for (int $i = 0; $i < $d; $i++) { -$pattern.append("0"); + right.dataType match { +case IntegerType => + val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") + val i = ctx.freshName("i") + val lastDValue = +ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") + s""" +if ($d >= 0) { + $pattern.delete(0, $pattern.length()); + if ($d != $lastDValue) { +$pattern.append("$defaultFormat"); + +if ($d > 0) { + $pattern.append("."); + for (int $i = 0; $i < $d; $i++) { +$pattern.append("0"); + } +} +$lastDValue = $d; +$numberFormat.applyLocalizedPattern($pattern.toString()); } + ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); +} else { + ${ev.value} = null; + ${ev.isNull} = true; } -$lastDValue = $d; -$numberFormat.applyLocalizedPattern($pattern.toString()); - } - ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); -} else { - ${ev.value} = null; - ${ev.isNull} = true; -} - """ + """ +case StringType => + val lastDValue = ctx.addMutableState("String", "lastDValue", v => s"""$v = null;""") + val dValue = ctx.addMutableState("String", "dValue") --- End diff -- Do we need to make this mutable state? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r186607122 --- Diff: sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala --- @@ -706,6 +706,30 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { "15,159,339,180,002,773.2778") checkEvaluation(FormatNumber(Literal.create(null, IntegerType), Literal(3)), null) assert(FormatNumber(Literal.create(null, NullType), Literal(3)).resolved === false) + +checkEvaluation(FormatNumber(Literal(12332.123456), Literal("##.###")), "12332.123") +checkEvaluation(FormatNumber(Literal(12332.123456), Literal("##.###")), "12332.123") +checkEvaluation(FormatNumber(Literal(4.asInstanceOf[Byte]), Literal("##.")), "4") +checkEvaluation(FormatNumber(Literal(4.asInstanceOf[Short]), Literal("##.")), "4") +checkEvaluation(FormatNumber(Literal(4.0f), Literal("##.###")), "4") +checkEvaluation(FormatNumber(Literal(4), Literal("##.###")), "4") +checkEvaluation(FormatNumber(Literal(12831273.23481d), + Literal("###,###,###,###,###.###")), "12,831,273.235") +checkEvaluation(FormatNumber(Literal(12831273.83421d), Literal("")), "12,831,274") +checkEvaluation(FormatNumber(Literal(123123324123L), Literal("###,###,###,###,###.###")), + "123,123,324,123") +checkEvaluation( + FormatNumber(Literal(Decimal(123123324123L) * Decimal(123123.21234d)), +Literal("###,###,###,###,###.")), "15,159,339,180,002,773.2778") +checkEvaluation(FormatNumber(Literal.create(null, IntegerType), Literal("##.###")), null) +assert(FormatNumber(Literal.create(null, NullType), Literal("##.###")).resolved === false) + +checkEvaluation(FormatNumber(Literal(12332.123456), Literal("#,###,###,###,###,###,##0")), + "12,332") +checkEvaluation(FormatNumber( + Literal.create(null, IntegerType), Literal.create(null, NullType)), null) +checkEvaluation(FormatNumber( + Literal.create(null, NullType), Literal.create(null, NullType)), null) --- End diff -- What's for these two cases? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r185691812 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2108,35 +2133,53 @@ case class FormatNumber(x: Expression, d: Expression) // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. val usLocale = "US" - val i = ctx.freshName("i") - val dFormat = ctx.freshName("dFormat") - val lastDValue = -ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") - val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") val numberFormat = ctx.addMutableState(df, "numberFormat", v => s"""$v = new $df("", new $dfs($l.$usLocale));""") - s""" -if ($d >= 0) { - $pattern.delete(0, $pattern.length()); - if ($d != $lastDValue) { -$pattern.append("#,###,###,###,###,###,##0"); - -if ($d > 0) { - $pattern.append("."); - for (int $i = 0; $i < $d; $i++) { -$pattern.append("0"); + right.dataType match { +case IntegerType => + val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") + val i = ctx.freshName("i") + val lastDIntValue = +ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") + s""" +if ($d >= 0) { + $pattern.delete(0, $pattern.length()); + if ($d != $lastDIntValue) { +$pattern.append("$defaultFormat"); + +if ($d > 0) { + $pattern.append("."); + for (int $i = 0; $i < $d; $i++) { +$pattern.append("0"); + } +} +$lastDIntValue = $d; +$numberFormat.applyLocalizedPattern($pattern.toString()); } + ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); +} else { + ${ev.value} = null; + ${ev.isNull} = true; } -$lastDValue = $d; -$numberFormat.applyLocalizedPattern($pattern.toString()); - } - ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); -} else { - ${ev.value} = null; - ${ev.isNull} = true; -} - """ + """ +case StringType => + val lastDStringValue = +ctx.addMutableState("String", "lastDValue", v => s"""$v = "$defaultFormat";""") + val dValue = ctx.addMutableState("String", "dValue") --- End diff -- Do we need to make this mutable state? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user ueshin commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r185691861 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2108,35 +2133,53 @@ case class FormatNumber(x: Expression, d: Expression) // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. val usLocale = "US" - val i = ctx.freshName("i") - val dFormat = ctx.freshName("dFormat") - val lastDValue = -ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") - val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") val numberFormat = ctx.addMutableState(df, "numberFormat", v => s"""$v = new $df("", new $dfs($l.$usLocale));""") - s""" -if ($d >= 0) { - $pattern.delete(0, $pattern.length()); - if ($d != $lastDValue) { -$pattern.append("#,###,###,###,###,###,##0"); - -if ($d > 0) { - $pattern.append("."); - for (int $i = 0; $i < $d; $i++) { -$pattern.append("0"); + right.dataType match { +case IntegerType => + val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") + val i = ctx.freshName("i") + val lastDIntValue = +ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") + s""" +if ($d >= 0) { + $pattern.delete(0, $pattern.length()); + if ($d != $lastDIntValue) { +$pattern.append("$defaultFormat"); + +if ($d > 0) { + $pattern.append("."); + for (int $i = 0; $i < $d; $i++) { +$pattern.append("0"); + } +} +$lastDIntValue = $d; +$numberFormat.applyLocalizedPattern($pattern.toString()); } + ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); +} else { + ${ev.value} = null; + ${ev.isNull} = true; } -$lastDValue = $d; -$numberFormat.applyLocalizedPattern($pattern.toString()); - } - ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); -} else { - ${ev.value} = null; - ${ev.isNull} = true; -} - """ + """ +case StringType => + val lastDStringValue = +ctx.addMutableState("String", "lastDValue", v => s"""$v = "$defaultFormat";""") + val dValue = ctx.addMutableState("String", "dValue") + s""" +$dValue = $d.toString(); +if (!$dValue.equals($lastDStringValue)) { --- End diff -- What if the first `dValue` is the same as the default format? Can you add the test case? --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r180313214 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2108,35 +2132,52 @@ case class FormatNumber(x: Expression, d: Expression) // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. val usLocale = "US" - val i = ctx.freshName("i") val dFormat = ctx.freshName("dFormat") - val lastDValue = -ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") val numberFormat = ctx.addMutableState(df, "numberFormat", v => s"""$v = new $df("", new $dfs($l.$usLocale));""") - s""" -if ($d >= 0) { - $pattern.delete(0, $pattern.length()); - if ($d != $lastDValue) { -$pattern.append("#,###,###,###,###,###,##0"); - -if ($d > 0) { - $pattern.append("."); - for (int $i = 0; $i < $d; $i++) { -$pattern.append("0"); + right.dataType match { +case IntegerType => + val i = ctx.freshName("i") + val lastDIntValue = +ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") + s""" +if ($d >= 0) { + $pattern.delete(0, $pattern.length()); + if ($d != $lastDIntValue) { +$pattern.append("$defaultFormat"); + +if ($d > 0) { + $pattern.append("."); + for (int $i = 0; $i < $d; $i++) { +$pattern.append("0"); + } +} +$lastDIntValue = $d; +$numberFormat.applyLocalizedPattern($pattern.toString()); } + ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); +} else { + ${ev.value} = null; + ${ev.isNull} = true; } -$lastDValue = $d; -$numberFormat.applyLocalizedPattern($pattern.toString()); - } - ${ev.value} = UTF8String.fromString($numberFormat.format(${typeHelper(num)})); -} else { - ${ev.value} = null; - ${ev.isNull} = true; -} - """ + """ +case StringType => + val lastDStringValue = +ctx.addMutableState("String", "lastDValue", v => s"""$v = "$defaultFormat";""") + s""" +if (!$d.toString().equals($lastDStringValue)) { --- End diff -- We should only call `$d.toString()` once. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r180312765 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2108,35 +2132,52 @@ case class FormatNumber(x: Expression, d: Expression) // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. val usLocale = "US" - val i = ctx.freshName("i") val dFormat = ctx.freshName("dFormat") - val lastDValue = -ctx.addMutableState(CodeGenerator.JAVA_INT, "lastDValue", v => s"$v = -100;") val pattern = ctx.addMutableState(sb, "pattern", v => s"$v = new $sb();") --- End diff -- `pattern` is only needed for `IntegerType` case. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r180312702 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2108,35 +2132,52 @@ case class FormatNumber(x: Expression, d: Expression) // SPARK-13515: US Locale configures the DecimalFormat object to use a dot ('.') // as a decimal separator. val usLocale = "US" - val i = ctx.freshName("i") val dFormat = ctx.freshName("dFormat") --- End diff -- `dFormat` is not used in any place. We can remove it. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r180311991 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2050,33 +2058,49 @@ case class FormatNumber(x: Expression, d: Expression) private lazy val numberFormat = new DecimalFormat("", new DecimalFormatSymbols(Locale.US)) override protected def nullSafeEval(xObject: Any, dObject: Any): Any = { -val dValue = dObject.asInstanceOf[Int] -if (dValue < 0) { - return null -} - -lastDValue match { - case Some(last) if last == dValue => -// use the current pattern - case _ => -// construct a new DecimalFormat only if a new dValue -pattern.delete(0, pattern.length) -pattern.append("#,###,###,###,###,###,##0") - -// decimal place -if (dValue > 0) { - pattern.append(".") - - var i = 0 - while (i < dValue) { -i += 1 -pattern.append("0") - } +right.dataType match { + case IntegerType => +val dValue = dObject.asInstanceOf[Int] +if (dValue < 0) { + return null } -lastDValue = Some(dValue) +lastDIntValue match { + case Some(last) if last == dValue => + // use the current pattern + case _ => +// construct a new DecimalFormat only if a new dValue +pattern.delete(0, pattern.length) +pattern.append(defaultFormat) + +// decimal place +if (dValue > 0) { + pattern.append(".") + + var i = 0 + while (i < dValue) { +i += 1 +pattern.append("0") + } +} + +lastDIntValue = Some(dValue) -numberFormat.applyLocalizedPattern(pattern.toString) +numberFormat.applyLocalizedPattern(pattern.toString) +} + case StringType => +val dValue = dObject.asInstanceOf[UTF8String].toString +lastDStringValue match { + case Some(last) if last == dValue => + case _ => +pattern.delete(0, pattern.length) +lastDStringValue = Some(dValue) +if (dValue.toString.isEmpty) { --- End diff -- `dValue` is already a string. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
Github user viirya commented on a diff in the pull request: https://github.com/apache/spark/pull/21010#discussion_r180310931 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala --- @@ -2022,6 +2022,8 @@ case class Encode(value: Expression, charset: Expression) Examples: > SELECT _FUNC_(12332.123456, 4); 12,332.1235 + > SELECT _FUNC_(12332.123456, '##.###'); --- End diff -- We need to update `ExpressionDescription` too. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #21010: [SPARK-23900][SQL] format_number support user spe...
GitHub user wangyum opened a pull request: https://github.com/apache/spark/pull/21010 [SPARK-23900][SQL] format_number support user specifed format as argument ## What changes were proposed in this pull request? `format_number` support user specifed format as argument. For example: ```sql spark-sql> SELECT format_number(12332.123456, '##.###'); 12332.123 ``` ## How was this patch tested? unit test You can merge this pull request into a Git repository by running: $ git pull https://github.com/wangyum/spark SPARK-23900 Alternatively you can review and apply these changes as the patch at: https://github.com/apache/spark/pull/21010.patch To close this pull request, make a commit to your master/trunk branch with (at least) the following in the commit message: This closes #21010 commit 202fa3d505ed4395858a277c9b871006b2f64483 Author: Yuming WangDate: 2018-04-09T14:28:44Z format_number udf should take user specifed format as argument --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org