This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new e037953d4c58 [SPARK-48549][SQL][PYTHON] Improve SQL function
`sentences`
e037953d4c58 is described below
commit e037953d4c5879f927c527685c2d027fe2fc08c1
Author: panbingkun <[email protected]>
AuthorDate: Wed Sep 11 20:58:51 2024 +0200
[SPARK-48549][SQL][PYTHON] Improve SQL function `sentences`
### What changes were proposed in this pull request?
This PR aims to improve the SQL function `sentences`; it includes:
- update the description of the `sentences` expression to make it more
realistic.
- add `def sentences(string: Column, language: Column): Column` to SQL
functions
- `codegen` support for `sentences`
### Why are the changes needed?
Fix the inconsistency in using the function `sentences` in the following
scenarios:
<img width="1051" alt="image"
src="https://github.com/apache/spark/assets/15246973/033c731d-5a2f-455f-8517-ed95bd6c1f6e">
- According to the definition of function `sentences`, we should only allow
the following two kinds of parameter calls:
A.sentences(str)
B.sentences(str, language, country) - the parameters `language` and
`country` either coexist or do not exist at the same time
**In file `sql/core/src/main/scala/org/apache/spark/sql/functions.scala`,
only the following two functions are defined**:
https://github.com/apache/spark/blob/f4434c36cc4f7b0147e0e8fe26ac0f177a5199cd/sql/core/src/main/scala/org/apache/spark/sql/functions.scala#L4273-L4282
- When we directly call the expression `sentences`, it actually supports
the following:
A.`df.select(sentences($"str", $"language", $"country"))`;
B.`df.select(sentences($"str", $"language"))`;
C.`df.select(sentences($"str"))`;
## Let's align it
### Does this PR introduce _any_ user-facing change?
Yes, it allows calling the SQL function `sentences` with the parameters (`str`,
`language`).
### How was this patch tested?
- Add new UTs & update existing UTs.
- Pass GA.
- Manually check
```scala
scala> val df = Seq(("Hi there! The price was $1,234.56.... But, not
now.", "en", "US")).toDF("str", "language", "country");
val df: org.apache.spark.sql.DataFrame = [str: string, language: string ...
1 more field]
scala> df.select(sentences($"str", $"language", $"country"));
val res0: org.apache.spark.sql.DataFrame = [sentences(str, language,
country): array<array<string>>]
scala> df.select(sentences($"str", $"language"));
val res1: org.apache.spark.sql.DataFrame = [sentences(str, language, ):
array<array<string>>]
scala> df.select(sentences($"str"));
val res2: org.apache.spark.sql.DataFrame = [sentences(str, , ):
array<array<string>>]
scala> df.selectExpr("sentences(str, language, country)");
val res3: org.apache.spark.sql.DataFrame = [sentences(str, language,
country): array<array<string>>]
scala> df.selectExpr("sentences(str, language)");
val res4: org.apache.spark.sql.DataFrame = [sentences(str, language, ):
array<array<string>>]
scala> df.selectExpr("sentences(str)");
val res5: org.apache.spark.sql.DataFrame = [sentences(str, , ):
array<array<string>>]
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #46880 from panbingkun/sentences_improve.
Authored-by: panbingkun <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../apache/spark/sql/PlanGenerationTestSuite.scala | 6 +-
python/pyspark/sql/functions/builtin.py | 22 +++++-
.../scala/org/apache/spark/sql/functions.scala | 9 +++
.../catalyst/expressions/ExpressionImplUtils.java | 61 +++++++++++++---
.../catalyst/expressions/stringExpressions.scala | 80 ++++++++++-----------
.../expressions/StringExpressionsSuite.scala | 2 +-
.../explain-results/function_sentences.explain | 2 +-
.../function_sentences_with_language.explain | 2 +
...ion_sentences_with_language_and_country.explain | 2 +
.../function_sentences_with_locale.explain | 2 -
....json => function_sentences_with_language.json} | 4 --
... => function_sentences_with_language.proto.bin} | Bin 194 -> 186 bytes
...ction_sentences_with_language_and_country.json} | 0
..._sentences_with_language_and_country.proto.bin} | Bin
.../apache/spark/sql/StringFunctionsSuite.scala | 28 ++++++++
15 files changed, 158 insertions(+), 62 deletions(-)
diff --git
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index ee91f3aa6c00..315f80e13eff 100644
---
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -1809,7 +1809,11 @@ class PlanGenerationTestSuite
fn.sentences(fn.col("g"))
}
- functionTest("sentences with locale") {
+ functionTest("sentences with language") {
+ fn.sentences(fn.col("g"), lit("en"))
+ }
+
+ functionTest("sentences with language and country") {
fn.sentences(fn.col("g"), lit("en"), lit("US"))
}
diff --git a/python/pyspark/sql/functions/builtin.py
b/python/pyspark/sql/functions/builtin.py
index b6499eb1546e..781bf3d9f83a 100644
--- a/python/pyspark/sql/functions/builtin.py
+++ b/python/pyspark/sql/functions/builtin.py
@@ -11241,13 +11241,27 @@ def sentences(
) -> Column:
"""
Splits a string into arrays of sentences, where each sentence is an array
of words.
- The 'language' and 'country' arguments are optional, and if omitted, the
default locale is used.
+ The `language` and `country` arguments are optional,
+ When they are omitted:
+ 1.If they are both omitted, the `Locale.ROOT - locale(language='',
country='')` is used.
+ The `Locale.ROOT` is regarded as the base locale of all locales, and is
used as the
+ language/country neutral locale for the locale sensitive operations.
+ 2.If the `country` is omitted, the `locale(language, country='')` is used.
+ When they are null:
+ 1.If they are both `null`, the `Locale.US - locale(language='en',
country='US')` is used.
+ 2.If the `language` is null and the `country` is not null,
+ the `Locale.US - locale(language='en', country='US')` is used.
+ 3.If the `language` is not null and the `country` is null, the
`locale(language)` is used.
+ 4.If neither is `null`, the `locale(language, country)` is used.
.. versionadded:: 3.2.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
+ .. versionchanged:: 4.0.0
+ Supports `sentences(string, language)`.
+
Parameters
----------
string : :class:`~pyspark.sql.Column` or str
@@ -11271,6 +11285,12 @@ def sentences(
+-----------------------------------+
|[[This, is, an, example, sentence]]|
+-----------------------------------+
+ >>> df.select(sentences(df.string, lit("en"))).show(truncate=False)
+ +-----------------------------------+
+ |sentences(string, en, ) |
+ +-----------------------------------+
+ |[[This, is, an, example, sentence]]|
+ +-----------------------------------+
>>> df = spark.createDataFrame([["Hello world. How are you?"]], ["s"])
>>> df.select(sentences("s")).show(truncate=False)
+---------------------------------+
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
index 1ee86ae1a113..86f8923f36b4 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/functions.scala
@@ -4349,6 +4349,15 @@ object functions {
def sentences(string: Column, language: Column, country: Column): Column =
Column.fn("sentences", string, language, country)
+ /**
+ * Splits a string into arrays of sentences, where each sentence is an array
of words. The
+ * default `country`('') is used.
+ * @group string_funcs
+ * @since 4.0.0
+ */
+ def sentences(string: Column, language: Column): Column =
+ Column.fn("sentences", string, language)
+
/**
* Splits a string into arrays of sentences, where each sentence is an array
of words. The
* default locale is used.
diff --git
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
index 07a9409bc57a..18646f67975c 100644
---
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
+++
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
@@ -17,20 +17,25 @@
package org.apache.spark.sql.catalyst.expressions;
-import org.apache.spark.SparkBuildInfo;
-import org.apache.spark.sql.errors.QueryExecutionErrors;
-import org.apache.spark.unsafe.types.UTF8String;
-import org.apache.spark.util.VersionUtils;
-
-import javax.crypto.Cipher;
-import javax.crypto.spec.GCMParameterSpec;
-import javax.crypto.spec.IvParameterSpec;
-import javax.crypto.spec.SecretKeySpec;
import java.nio.ByteBuffer;
import java.security.GeneralSecurityException;
import java.security.SecureRandom;
import java.security.spec.AlgorithmParameterSpec;
+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Locale;
+import javax.crypto.Cipher;
+import javax.crypto.spec.GCMParameterSpec;
+import javax.crypto.spec.IvParameterSpec;
+import javax.crypto.spec.SecretKeySpec;
+import org.apache.spark.SparkBuildInfo;
+import org.apache.spark.sql.catalyst.util.ArrayData;
+import org.apache.spark.sql.catalyst.util.GenericArrayData;
+import org.apache.spark.sql.errors.QueryExecutionErrors;
+import org.apache.spark.unsafe.types.UTF8String;
+import org.apache.spark.util.VersionUtils;
/**
* A utility class for constructing expressions.
@@ -272,4 +277,42 @@ public class ExpressionImplUtils {
throw QueryExecutionErrors.aesCryptoError(e.getMessage());
}
}
+
+ public static ArrayData getSentences(
+ UTF8String str,
+ UTF8String language,
+ UTF8String country) {
+ if (str == null) return null;
+ Locale locale;
+ if (language != null && country != null) {
+ locale = new Locale(language.toString(), country.toString());
+ } else if (language != null) {
+ locale = new Locale(language.toString());
+ } else {
+ locale = Locale.US;
+ }
+ String sentences = str.toString();
+ BreakIterator sentenceInstance = BreakIterator.getSentenceInstance(locale);
+ sentenceInstance.setText(sentences);
+
+ int sentenceIndex = 0;
+ List<GenericArrayData> res = new ArrayList<>();
+ while (sentenceInstance.next() != BreakIterator.DONE) {
+ String sentence = sentences.substring(sentenceIndex,
sentenceInstance.current());
+ sentenceIndex = sentenceInstance.current();
+ BreakIterator wordInstance = BreakIterator.getWordInstance(locale);
+ wordInstance.setText(sentence);
+ int wordIndex = 0;
+ List<UTF8String> words = new ArrayList<>();
+ while (wordInstance.next() != BreakIterator.DONE) {
+ String word = sentence.substring(wordIndex, wordInstance.current());
+ wordIndex = wordInstance.current();
+ if (Character.isLetterOrDigit(word.charAt(0))) {
+ words.add(UTF8String.fromString(word));
+ }
+ }
+ res.add(new GenericArrayData(words.toArray(new UTF8String[0])));
+ }
+ return new GenericArrayData(res.toArray(new GenericArrayData[0]));
+ }
}
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index f211da52e457..e75df87994f0 100755
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
import java.nio.{ByteBuffer, CharBuffer}
import java.nio.charset.CharacterCodingException
-import java.text.{BreakIterator, DecimalFormat, DecimalFormatSymbols}
+import java.text.{DecimalFormat, DecimalFormatSymbols}
import java.util.{Base64 => JBase64, HashMap, Locale, Map => JMap}
import scala.collection.mutable.ArrayBuffer
@@ -3327,14 +3327,37 @@ case class FormatNumber(x: Expression, d: Expression)
/**
* Splits a string into arrays of sentences, where each sentence is an array
of words.
- * The 'lang' and 'country' arguments are optional, and if omitted, the
default locale is used.
+ * The `lang` and `country` arguments are optional, their default values are
all '',
+ * - When they are omitted:
+ * 1. If they are both omitted, the `Locale.ROOT - locale(language='',
country='')` is used.
+ * The `Locale.ROOT` is regarded as the base locale of all locales, and
is used as the
+ * language/country neutral locale for the locale sensitive operations.
+ * 2. If the `country` is omitted, the `locale(language, country='')` is
used.
+ * - When they are null:
+ * 1. If they are both `null`, the `Locale.US - locale(language='en',
country='US')` is used.
+ * 2. If the `language` is null and the `country` is not null,
+ * the `Locale.US - locale(language='en', country='US')` is used.
+ * 3. If the `language` is not null and the `country` is null, the
`locale(language)` is used.
+ * 4. If neither is `null`, the `locale(language, country)` is used.
*/
@ExpressionDescription(
- usage = "_FUNC_(str[, lang, country]) - Splits `str` into an array of array
of words.",
+ usage = "_FUNC_(str[, lang[, country]]) - Splits `str` into an array of
array of words.",
+ arguments = """
+ Arguments:
+ * str - A STRING expression to be parsed.
+ * lang - An optional STRING expression with a language code from ISO 639
Alpha-2 (e.g. 'DE'),
+ Alpha-3, or a language subtag of up to 8 characters.
+ * country - An optional STRING expression with a country code from ISO
3166 alpha-2 country
+ code or a UN M.49 numeric-3 area code.
+ """,
examples = """
Examples:
> SELECT _FUNC_('Hi there! Good morning.');
[["Hi","there"],["Good","morning"]]
+ > SELECT _FUNC_('Hi there! Good morning.', 'en');
+ [["Hi","there"],["Good","morning"]]
+ > SELECT _FUNC_('Hi there! Good morning.', 'en', 'US');
+ [["Hi","there"],["Good","morning"]]
""",
since = "2.0.0",
group = "string_funcs")
@@ -3342,7 +3365,9 @@ case class Sentences(
str: Expression,
language: Expression = Literal(""),
country: Expression = Literal(""))
- extends TernaryExpression with ImplicitCastInputTypes with CodegenFallback {
+ extends TernaryExpression
+ with ImplicitCastInputTypes
+ with RuntimeReplaceable {
def this(str: Expression) = this(str, Literal(""), Literal(""))
def this(str: Expression, language: Expression) = this(str, language,
Literal(""))
@@ -3356,49 +3381,18 @@ case class Sentences(
override def second: Expression = language
override def third: Expression = country
- override def eval(input: InternalRow): Any = {
- val string = str.eval(input)
- if (string == null) {
- null
- } else {
- val languageStr = language.eval(input).asInstanceOf[UTF8String]
- val countryStr = country.eval(input).asInstanceOf[UTF8String]
- val locale = if (languageStr != null && countryStr != null) {
- new Locale(languageStr.toString, countryStr.toString)
- } else {
- Locale.US
- }
- getSentences(string.asInstanceOf[UTF8String].toString, locale)
- }
- }
-
- private def getSentences(sentences: String, locale: Locale) = {
- val bi = BreakIterator.getSentenceInstance(locale)
- bi.setText(sentences)
- var idx = 0
- val result = new ArrayBuffer[GenericArrayData]
- while (bi.next != BreakIterator.DONE) {
- val sentence = sentences.substring(idx, bi.current)
- idx = bi.current
-
- val wi = BreakIterator.getWordInstance(locale)
- var widx = 0
- wi.setText(sentence)
- val words = new ArrayBuffer[UTF8String]
- while (wi.next != BreakIterator.DONE) {
- val word = sentence.substring(widx, wi.current)
- widx = wi.current
- if (Character.isLetterOrDigit(word.charAt(0))) words +=
UTF8String.fromString(word)
- }
- result += new GenericArrayData(words)
- }
- new GenericArrayData(result)
- }
+ override def replacement: Expression =
+ StaticInvoke(
+ classOf[ExpressionImplUtils],
+ dataType,
+ "getSentences",
+ Seq(str, language, country),
+ inputTypes,
+ propagateNull = false)
override protected def withNewChildrenInternal(
newFirst: Expression, newSecond: Expression, newThird: Expression):
Sentences =
copy(str = newFirst, language = newSecond, country = newThird)
-
}
/**
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
index beefabd98108..29b878230472 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
@@ -1987,7 +1987,7 @@ class StringExpressionsSuite extends SparkFunSuite with
ExpressionEvalHelper {
// Test escaping of arguments
GenerateUnsafeProjection.generate(
- Sentences(Literal("\"quote"), Literal("\"quote"), Literal("\"quote")) ::
Nil)
+ Sentences(Literal("\"quote"), Literal("\"quote"),
Literal("\"quote")).replacement :: Nil)
}
test("SPARK-33386: elt ArrayIndexOutOfBoundsException") {
diff --git
a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain
index 5c88a1f7b3ab..f4532e70675a 100644
---
a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain
+++
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain
@@ -1,2 +1,2 @@
-Project [sentences(g#0, , ) AS sentences(g, , )#0]
+Project [static_invoke(ExpressionImplUtils.getSentences(g#0, , )) AS
sentences(g, , )#0]
+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git
a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain
new file mode 100644
index 000000000000..37bcbf9a319b
--- /dev/null
+++
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language.explain
@@ -0,0 +1,2 @@
+Project [static_invoke(ExpressionImplUtils.getSentences(g#0, en, )) AS
sentences(g, en, )#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git
a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain
new file mode 100644
index 000000000000..8a8d54cfa0d1
--- /dev/null
+++
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_language_and_country.explain
@@ -0,0 +1,2 @@
+Project [static_invoke(ExpressionImplUtils.getSentences(g#0, en, US)) AS
sentences(g, en, US)#0]
++- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git
a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain
b/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain
deleted file mode 100644
index 7819f9b54234..000000000000
---
a/sql/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain
+++ /dev/null
@@ -1,2 +0,0 @@
-Project [sentences(g#0, en, US) AS sentences(g, en, US)#0]
-+- LocalRelation <empty>, [id#0L, a#0, b#0, d#0, e#0, f#0, g#0]
diff --git
a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json
b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json
similarity index 89%
copy from
sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json
copy to
sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json
index 991b42faddb7..869e074ccd60 100644
---
a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json
+++
b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.json
@@ -22,10 +22,6 @@
"literal": {
"string": "en"
}
- }, {
- "literal": {
- "string": "US"
- }
}]
}
}]
diff --git
a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin
b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin
similarity index 78%
copy from
sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin
copy to
sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin
index 01c0136c6df1..7514b380a1c8 100644
Binary files
a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin
and
b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language.proto.bin
differ
diff --git
a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json
b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json
similarity index 100%
rename from
sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json
rename to
sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.json
diff --git
a/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin
b/sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin
similarity index 100%
rename from
sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin
rename to
sql/connect/common/src/test/resources/query-tests/queries/function_sentences_with_language_and_country.proto.bin
diff --git
a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index c98dddbfe8e9..ec240d71b851 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -714,6 +714,34 @@ class StringFunctionsSuite extends QueryTest with
SharedSparkSession {
df.select(sentences($"str", $"language", $"country")),
Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But",
"not", "now"))))
+ checkAnswer(
+ df.selectExpr("sentences(str, language)"),
+ Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But",
"not", "now"))))
+
+ checkAnswer(
+ df.select(sentences($"str", $"language")),
+ Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But",
"not", "now"))))
+
+ checkAnswer(
+ df.selectExpr("sentences(str)"),
+ Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But",
"not", "now"))))
+
+ checkAnswer(
+ df.select(sentences($"str")),
+ Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But",
"not", "now"))))
+
+ checkAnswer(
+ df.selectExpr("sentences(str, null, null)"),
+ Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But",
"not", "now"))))
+
+ checkAnswer(
+ df.selectExpr("sentences(str, '', null)"),
+ Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But",
"not", "now"))))
+
+ checkAnswer(
+ df.selectExpr("sentences(str, null)"),
+ Row(Seq(Seq("Hi", "there"), Seq("The", "price", "was"), Seq("But",
"not", "now"))))
+
// Type coercion
checkAnswer(
df.selectExpr("sentences(null)", "sentences(10)", "sentences(3.14)"),
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]