Repository: spark Updated Branches: refs/heads/master 65a8d1b87 -> 4506dad8a
[SPARK-25656][SQL][DOC][EXAMPLE] Add a doc and examples about extra data source options ## What changes were proposed in this pull request? Our current doc does not explain how we are passing the data source specific options to the underlying data source. According to [the review comment](https://github.com/apache/spark/pull/22622#discussion_r222911529), this PR aims to add more detailed information and examples ## How was this patch tested? Manual. Closes #22801 from dongjoon-hyun/SPARK-25656. Authored-by: Dongjoon Hyun <dongj...@apache.org> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/4506dad8 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/4506dad8 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/4506dad8 Branch: refs/heads/master Commit: 4506dad8a9613d4b6b319c0240119927265a67c1 Parents: 65a8d1b Author: Dongjoon Hyun <dongj...@apache.org> Authored: Tue Oct 23 12:41:20 2018 -0700 Committer: Dongjoon Hyun <dongj...@apache.org> Committed: Tue Oct 23 12:41:20 2018 -0700 ---------------------------------------------------------------------- docs/sql-data-sources-load-save-functions.md | 44 +++++++++++++++++++ .../examples/sql/JavaSQLDataSourceExample.java | 7 +++ examples/src/main/python/sql/datasource.py | 9 ++++ examples/src/main/r/RSparkSQLExample.R | 6 ++- examples/src/main/resources/users.orc | Bin 0 -> 547 bytes .../examples/sql/SQLDataSourceExample.scala | 7 +++ 6 files changed, 72 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/docs/sql-data-sources-load-save-functions.md ---------------------------------------------------------------------- diff --git a/docs/sql-data-sources-load-save-functions.md b/docs/sql-data-sources-load-save-functions.md index e1dd0a3..e4c7b17 100644 --- 
a/docs/sql-data-sources-load-save-functions.md +++ b/docs/sql-data-sources-load-save-functions.md @@ -82,6 +82,50 @@ To load a CSV file you can use: </div> </div> +The extra options are also used during write operations. +For example, you can control bloom filters and dictionary encodings for ORC data sources. +The following ORC example will create a bloom filter and use dictionary encoding only for `favorite_color`. +For Parquet, there exists `parquet.enable.dictionary`, too. +To find more detailed information about the extra ORC/Parquet options, +visit the official Apache ORC/Parquet websites. + +<div class="codetabs"> + +<div data-lang="scala" markdown="1"> +{% include_example manual_save_options_orc scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala %} +</div> + +<div data-lang="java" markdown="1"> +{% include_example manual_save_options_orc java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java %} +</div> + +<div data-lang="python" markdown="1"> +{% include_example manual_save_options_orc python/sql/datasource.py %} +</div> + +<div data-lang="r" markdown="1"> +{% include_example manual_save_options_orc r/RSparkSQLExample.R %} +</div> + +<div data-lang="sql" markdown="1"> + +{% highlight sql %} +CREATE TABLE users_with_options ( + name STRING, + favorite_color STRING, + favorite_numbers array<integer> +) USING ORC +OPTIONS ( + orc.bloom.filter.columns 'favorite_color', + orc.dictionary.key.threshold '1.0', + orc.column.encoding.direct 'name' +) +{% endhighlight %} + +</div> + +</div> + ### Run SQL on files directly Instead of using read API to load a file into DataFrame and query it, you can also query that http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java 
b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java index ef3c904..cbe9dfd 100644 --- a/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java +++ b/examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java @@ -123,6 +123,13 @@ public class JavaSQLDataSourceExample { .option("header", "true") .load("examples/src/main/resources/people.csv"); // $example off:manual_load_options_csv$ + // $example on:manual_save_options_orc$ + usersDF.write().format("orc") + .option("orc.bloom.filter.columns", "favorite_color") + .option("orc.dictionary.key.threshold", "1.0") + .option("orc.column.encoding.direct", "name") + .save("users_with_options.orc"); + // $example off:manual_save_options_orc$ // $example on:direct_sql$ Dataset<Row> sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`"); http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/examples/src/main/python/sql/datasource.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/sql/datasource.py b/examples/src/main/python/sql/datasource.py index d8c879d..0466072 100644 --- a/examples/src/main/python/sql/datasource.py +++ b/examples/src/main/python/sql/datasource.py @@ -57,6 +57,15 @@ def basic_datasource_example(spark): format="csv", sep=":", inferSchema="true", header="true") # $example off:manual_load_options_csv$ + # $example on:manual_save_options_orc$ + df = spark.read.orc("examples/src/main/resources/users.orc") + (df.write.format("orc") + .option("orc.bloom.filter.columns", "favorite_color") + .option("orc.dictionary.key.threshold", "1.0") + .option("orc.column.encoding.direct", "name") + .save("users_with_options.orc")) + # $example off:manual_save_options_orc$ + # $example on:write_sorting_and_bucketing$ df.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed") # $example off:write_sorting_and_bucketing$ 
http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/examples/src/main/r/RSparkSQLExample.R ---------------------------------------------------------------------- diff --git a/examples/src/main/r/RSparkSQLExample.R b/examples/src/main/r/RSparkSQLExample.R index effba94..196a110 100644 --- a/examples/src/main/r/RSparkSQLExample.R +++ b/examples/src/main/r/RSparkSQLExample.R @@ -114,10 +114,14 @@ write.df(namesAndAges, "namesAndAges.parquet", "parquet") # $example on:manual_load_options_csv$ -df <- read.df("examples/src/main/resources/people.csv", "csv", sep=";", inferSchema=T, header=T) +df <- read.df("examples/src/main/resources/people.csv", "csv", sep = ";", inferSchema = TRUE, header = TRUE) namesAndAges <- select(df, "name", "age") # $example off:manual_load_options_csv$ +# $example on:manual_save_options_orc$ +df <- read.df("examples/src/main/resources/users.orc", "orc") +write.orc(df, "users_with_options.orc", orc.bloom.filter.columns = "favorite_color", orc.dictionary.key.threshold = 1.0, orc.column.encoding.direct = "name") +# $example off:manual_save_options_orc$ # $example on:direct_sql$ df <- sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/examples/src/main/resources/users.orc ---------------------------------------------------------------------- diff --git a/examples/src/main/resources/users.orc b/examples/src/main/resources/users.orc new file mode 100644 index 0000000..12478a5 Binary files /dev/null and b/examples/src/main/resources/users.orc differ http://git-wip-us.apache.org/repos/asf/spark/blob/4506dad8/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala index 7d83aac..18615d9 100644 --- 
a/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala @@ -56,6 +56,13 @@ object SQLDataSourceExample { .option("header", "true") .load("examples/src/main/resources/people.csv") // $example off:manual_load_options_csv$ + // $example on:manual_save_options_orc$ + usersDF.write.format("orc") + .option("orc.bloom.filter.columns", "favorite_color") + .option("orc.dictionary.key.threshold", "1.0") + .option("orc.column.encoding.direct", "name") + .save("users_with_options.orc") + // $example off:manual_save_options_orc$ // $example on:direct_sql$ val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org