This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 875def391665 [SPARK-49081][SQL][DOCS] Add data source options docs of
`Protobuf`
875def391665 is described below
commit 875def39166549f9de54b141f5397cb3f74a918e
Author: Wei Guo <[email protected]>
AuthorDate: Thu Sep 12 16:26:33 2024 -0700
[SPARK-49081][SQL][DOCS] Add data source options docs of `Protobuf`
### What changes were proposed in this pull request?
This PR aims to add data source options docs of `Protobuf` data source.
Other data sources such as `csv`, `json` have corresponding options documents.
The document section appears as follows:
<img width="1058" alt="image"
src="https://github.com/user-attachments/assets/6f40a69b-1350-4b6b-9a1e-d780fcabb9f1">
<img width="1171" alt="image"
src="https://github.com/user-attachments/assets/80402560-474b-4608-be51-0a98d9324109">
### Why are the changes needed?
In order to facilitate Spark users to better understand and use the options
of `Protobuf` data source.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Pass GA and local manual check with `SKIP_API=1 bundle exec jekyll build
--watch`.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #47570 from wayneguow/pb_docs.
Authored-by: Wei Guo <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../spark/sql/protobuf/utils/ProtobufOptions.scala | 12 ++--
docs/sql-data-sources-protobuf.md | 67 +++++++++++++++++++++-
2 files changed, 72 insertions(+), 7 deletions(-)
diff --git
a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala
b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala
index 6644bce98293..e85097a272f2 100644
---
a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala
+++
b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala
@@ -43,8 +43,8 @@ private[sql] class ProtobufOptions(
/**
* Adds support for recursive fields. If this option is not specified,
recursive fields are
- * not permitted. Setting it to 0 drops the recursive fields, 1 allows it to
be recursed once,
- * and 2 allows it to be recursed twice and so on, up to 10. Values larger
than 10 are not
+ * not permitted. Setting it to 1 drops the recursive fields, 2 allows it to
be recursed once,
+ * and 3 allows it to be recursed twice and so on, up to 10. Values larger
than 10 are not
* allowed in order to avoid inadvertently creating very large schemas. If a
Protobuf message
* has depth beyond this limit, the Spark struct returned is truncated after
the recursion limit.
*
@@ -52,8 +52,8 @@ private[sql] class ProtobufOptions(
* `message Person { string name = 1; Person friend = 2; }`
* The following lists the schema with different values for this setting.
* 1: `struct<name: string>`
- * 2: `struct<name string, friend: struct<name: string>>`
- * 3: `struct<name string, friend: struct<name string, friend:
struct<name: string>>>`
+ * 2: `struct<name: string, friend: struct<name: string>>`
+ * 3: `struct<name: string, friend: struct<name: string, friend:
struct<name: string>>>`
* and so on.
*/
val recursiveFieldMaxDepth: Int =
parameters.getOrElse("recursive.fields.max.depth", "-1").toInt
@@ -181,7 +181,7 @@ private[sql] class ProtobufOptions(
val upcastUnsignedInts: Boolean =
parameters.getOrElse("upcast.unsigned.ints", false.toString).toBoolean
- // Whether to unwrap the struct representation for well known primitve
wrapper types when
+ // Whether to unwrap the struct representation for well known primitive
wrapper types when
// deserializing. By default, the wrapper types for primitives (i.e.
google.protobuf.Int32Value,
// google.protobuf.Int64Value, etc.) will get deserialized as structs. We
allow the option to
// deserialize them as their respective primitives.
@@ -221,7 +221,7 @@ private[sql] class ProtobufOptions(
// By default, in the spark schema field a will be dropped, which results in
the schema
// b struct<name: string>
// If retain.empty.message.types=true, field a will be retained by inserting
a dummy column.
- // b struct<a struct<__dummy_field_in_empty_struct: string>, name: string>
+ // b struct<a: struct<__dummy_field_in_empty_struct: string>, name: string>
val retainEmptyMessage: Boolean =
parameters.getOrElse("retain.empty.message.types",
false.toString).toBoolean
}
diff --git a/docs/sql-data-sources-protobuf.md
b/docs/sql-data-sources-protobuf.md
index 34cb1d4997d2..4dd6579f92cd 100644
--- a/docs/sql-data-sources-protobuf.md
+++ b/docs/sql-data-sources-protobuf.md
@@ -434,4 +434,69 @@ message Person {
<div class="d-none">
```
</div>
-</div>
\ No newline at end of file
+</div>
+
+## Data Source Option
+
+Data source options of Protobuf can be set via:
+* the built-in functions below
+ * `from_protobuf`
+ * `to_protobuf`
+
+<table>
+ <thead><tr><th><b>Property
Name</b></th><th><b>Default</b></th><th><b>Meaning</b></th><th><b>Scope</b></th></tr></thead>
+ <tr>
+ <td><code>mode</code></td>
+ <td><code>FAILFAST</code></td>
+ <td>Allows a mode for dealing with corrupt records during parsing.<br>
+ <ul>
+ <li><code>PERMISSIVE</code>: when it meets a corrupted record, sets all
fields to <code>null</code>.</li>
+ <li><code>DROPMALFORMED</code>: ignores the whole corrupted records.
This mode is unsupported in the Protobuf built-in functions.</li>
+ <li><code>FAILFAST</code>: throws an exception when it meets corrupted
records.</li>
+ </ul>
+ </td>
+ <td>read</td>
+ </tr>
+ <tr>
+ <td><code>recursive.fields.max.depth</code></td>
+ <td><code>-1</code></td>
+ <td>Specifies the maximum number of recursion levels to allow when parsing
the schema. For more details refers to the section <a
href="https://spark.apache.org/docs/latest/sql-data-sources-protobuf.html#handling-circular-references-protobuf-fields">Handling
circular references protobuf fields</a>.</td>
+ <td>read</td>
+ </tr>
+ <tr>
+ <td><code>convert.any.fields.to.json</code></td>
+ <td><code>false</code></td>
+ <td>Enables converting Protobuf <code>Any</code> fields to JSON. This
option should be enabled carefully. JSON conversion and processing are
inefficient. In addition, schema safety is also reduced making downstream
processing error-prone.</td>
+ <td>read</td>
+ </tr>
+ <tr>
+ <td><code>emit.default.values</code></td>
+ <td><code>false</code></td>
+ <td>Whether to render fields with zero values when deserializing Protobuf
to a Spark struct. When a field is empty in the serialized Protobuf, this
library will deserialize them as <code>null</code> by default, this option can
control whether to render the type-specific zero values.</td>
+ <td>read</td>
+ </tr>
+ <tr>
+ <td><code>enums.as.ints</code></td>
+ <td><code>false</code></td>
+ <td>Whether to render enum fields as their integer values. When this
option set to <code>false</code>, an enum field will be mapped to
<code>StringType</code>, and the value is the name of enum; when set to
<code>true</code>, an enum field will be mapped to <code>IntegerType</code>,
the value is its integer value.</td>
+ <td>read</td>
+ </tr>
+ <tr>
+ <td><code>upcast.unsigned.ints</code></td>
+ <td><code>false</code></td>
+ <td>Whether to upcast unsigned integers into a larger type. Setting this
option to <code>true</code>, <code>LongType</code> is used for
<code>uint32</code> and <code>Decimal(20, 0)</code> is used for
<code>uint64</code>, so their representation can contain large unsigned values
without overflow.</td>
+ <td>read</td>
+ </tr>
+ <tr>
+ <td><code>unwrap.primitive.wrapper.types</code></td>
+ <td><code>false</code></td>
+ <td>Whether to unwrap the struct representation for well-known primitive
wrapper types when deserializing. By default, the wrapper types for primitives
(i.e. google.protobuf.Int32Value, google.protobuf.Int64Value, etc.) will get
deserialized as structs.</td>
+ <td>read</td>
+ </tr>
+ <tr>
+ <td><code>retain.empty.message.types</code></td>
+ <td><code>false</code></td>
+ <td>Whether to retain fields of the empty proto message type in Schema.
Since Spark doesn't allow writing empty <code>StructType</code>, the empty
proto message type will be dropped by default. Setting this option to
<code>true</code> will insert a dummy
column (<code>__dummy_field_in_empty_struct</code>) to the empty proto message
so that the empty message fields will be retained.</td>
+ <td>read</td>
+ </tr>
+</table>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]