(arrow-datafusion) branch main updated: docs: document SessionConfig (#8771)

wjones127 Thu, 11 Jan 2024 13:09:14 -0800

This is an automated email from the ASF dual-hosted git repository.

wjones127 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git



The following commit(s) were added to refs/heads/main by this push:
     new 910cc76f53 docs: document SessionConfig (#8771)
910cc76f53 is described below

commit 910cc76f53dc897708eaab45ed8ce09aac5bc8aa
Author: Will Jones <[email protected]>
AuthorDate: Thu Jan 11 13:07:42 2024 -0800

    docs: document SessionConfig (#8771)
    
    * docs: document SessionConfig
    
    * make some tests happy
    
    * pr feedback
---
 datafusion/common/src/config.rs                    |  30 +++++-
 datafusion/execution/src/config.rs                 | 103 ++++++++++++++++++---
 .../sqllogictest/test_files/information_schema.slt |   2 +-
 docs/source/user-guide/configs.md                  |   2 +-
 4 files changed, 123 insertions(+), 14 deletions(-)

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 9921c446f8..5c051a7dee 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -153,6 +153,10 @@ macro_rules! config_namespace {
 
 config_namespace! {
     /// Options related to catalog and directory scanning
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: 
https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct CatalogOptions {
         /// Whether the default catalog and schema should be created 
automatically.
         pub create_default_catalog_and_schema: bool, default = true
@@ -180,6 +184,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to SQL parser
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: 
https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct SqlParserOptions {
         /// When set to true, SQL parser will parse float as decimal type
         pub parse_float_as_decimal: bool, default = false
@@ -196,6 +204,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to query execution
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: 
https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct ExecutionOptions {
         /// Default batch size while creating new batches, it's especially 
useful for
         /// buffer-in-memory batches since creating tiny batches would result 
in too much
@@ -283,6 +295,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to parquet files
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: 
https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct ParquetOptions {
         /// If true, reads the Parquet data page level metadata (the
         /// Page Index), if present, to reduce the I/O and number of
@@ -306,7 +322,7 @@ config_namespace! {
         pub metadata_size_hint: Option<usize>, default = None
 
         /// If true, filter expressions are be applied during the parquet 
decoding operation to
-        /// reduce the number of rows decoded
+        /// reduce the number of rows decoded. This optimization is sometimes 
called "late materialization".
         pub pushdown_filters: bool, default = false
 
         /// If true, filter expressions evaluated during the parquet decoding 
operation
@@ -416,6 +432,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to aggregate execution
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: 
https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct AggregateOptions {
         /// Specifies the threshold for using `ScalarValue`s to update
         /// accumulators during high-cardinality aggregations for each input 
batch.
@@ -433,6 +453,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options related to query optimization
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: 
https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct OptimizerOptions {
         /// When set to true, the optimizer will push a limit operation into
         /// grouped aggregations which have no aggregate expressions, as a 
soft limit,
@@ -541,6 +565,10 @@ config_namespace! {
 
 config_namespace! {
     /// Options controlling explain output
+    ///
+    /// See also: [`SessionConfig`]
+    ///
+    /// [`SessionConfig`]: 
https://docs.rs/datafusion/latest/datafusion/prelude/struct.SessionConfig.html
     pub struct ExplainOptions {
         /// When set to true, the explain statement will only print logical 
plans
         pub logical_plan_only: bool, default = false
diff --git a/datafusion/execution/src/config.rs 
b/datafusion/execution/src/config.rs
index 8556335b39..5158e71773 100644
--- a/datafusion/execution/src/config.rs
+++ b/datafusion/execution/src/config.rs
@@ -24,7 +24,69 @@ use std::{
 
 use datafusion_common::{config::ConfigOptions, Result, ScalarValue};
 
-/// Configuration options for Execution context
+/// Configuration options for [`SessionContext`].
+///
+/// Can be passed to [`SessionContext::new_with_config`] to customize the 
configuration of DataFusion.
+///
+/// Options can be set using namespaced keys with `.` as the separator, where 
the
+/// namespace determines which configuration struct the value is routed to. All
+/// built-in options are under the `datafusion` namespace.
+///
+/// For example, the key `datafusion.execution.batch_size` will set 
[ExecutionOptions::batch_size][datafusion_common::config::ExecutionOptions::batch_size],
+/// because [ConfigOptions::execution] is 
[ExecutionOptions][datafusion_common::config::ExecutionOptions]. Similarly, the 
key
+/// `datafusion.execution.parquet.pushdown_filters` will set 
[ParquetOptions::pushdown_filters][datafusion_common::config::ParquetOptions::pushdown_filters],
+/// since 
[ExecutionOptions::parquet][datafusion_common::config::ExecutionOptions::parquet]
 is [ParquetOptions][datafusion_common::config::ParquetOptions].
+///
+/// Some options have convenience methods. For example, 
[SessionConfig::with_batch_size] is
+/// shorthand for setting `datafusion.execution.batch_size`.
+///
+/// ```
+/// use datafusion_execution::config::SessionConfig;
+/// use datafusion_common::ScalarValue;
+///
+/// let config = SessionConfig::new()
+///    .set("datafusion.execution.batch_size", ScalarValue::UInt64(Some(1234)))
+///    .set_bool("datafusion.execution.parquet.pushdown_filters", true);
+///
+/// assert_eq!(config.batch_size(), 1234);
+/// assert_eq!(config.options().execution.batch_size, 1234);
+/// assert_eq!(config.options().execution.parquet.pushdown_filters, true);
+/// ```
+///
+/// You can also directly mutate the options via [SessionConfig::options_mut].
+/// So the following is equivalent to the above:
+///
+/// ```
+/// # use datafusion_execution::config::SessionConfig;
+/// # use datafusion_common::ScalarValue;
+/// #
+/// let mut config = SessionConfig::new();
+/// config.options_mut().execution.batch_size = 1234;
+/// config.options_mut().execution.parquet.pushdown_filters = true;
+/// #
+/// # assert_eq!(config.batch_size(), 1234);
+/// # assert_eq!(config.options().execution.batch_size, 1234);
+/// # assert_eq!(config.options().execution.parquet.pushdown_filters, true);
+/// ```
+///
+/// ## Built-in options
+///
+/// | Namespace | Config struct |
+/// | --------- | ------------- |
+/// | `datafusion.catalog` | 
[CatalogOptions][datafusion_common::config::CatalogOptions] |
+/// | `datafusion.execution` | 
[ExecutionOptions][datafusion_common::config::ExecutionOptions] |
+/// | `datafusion.execution.aggregate` | 
[AggregateOptions][datafusion_common::config::AggregateOptions] |
+/// | `datafusion.execution.parquet` | 
[ParquetOptions][datafusion_common::config::ParquetOptions] |
+/// | `datafusion.optimizer` | 
[OptimizerOptions][datafusion_common::config::OptimizerOptions] |
+/// | `datafusion.sql_parser` | 
[SqlParserOptions][datafusion_common::config::SqlParserOptions] |
+/// | `datafusion.explain` | 
[ExplainOptions][datafusion_common::config::ExplainOptions] |
+///
+/// ## Custom configuration
+///
+/// Configuration options can be extended. See [SessionConfig::with_extension] 
for details.
+///
+/// [`SessionContext`]: 
https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html
+/// [`SessionContext::new_with_config`]: 
https://docs.rs/datafusion/latest/datafusion/execution/context/struct.SessionContext.html#method.new_with_config
 #[derive(Clone, Debug)]
 pub struct SessionConfig {
     /// Configuration options
@@ -62,6 +124,35 @@ impl SessionConfig {
         Ok(ConfigOptions::from_string_hash_map(settings)?.into())
     }
 
+    /// Return a handle to the configuration options.
+    ///
+    /// Can be used to read the current configuration.
+    ///
+    /// ```
+    /// use datafusion_execution::config::SessionConfig;
+    ///
+    /// let config = SessionConfig::new();
+    /// assert!(config.options().execution.batch_size > 0);
+    /// ```
+    pub fn options(&self) -> &ConfigOptions {
+        &self.options
+    }
+
+    /// Return a mutable handle to the configuration options.
+    ///
+    /// Can be used to set configuration options.
+    ///
+    /// ```
+    /// use datafusion_execution::config::SessionConfig;
+    ///
+    /// let mut config = SessionConfig::new();
+    /// config.options_mut().execution.batch_size = 1024;
+    /// assert_eq!(config.options().execution.batch_size, 1024);
+    /// ```
+    pub fn options_mut(&mut self) -> &mut ConfigOptions {
+        &mut self.options
+    }
+
     /// Set a configuration option
     pub fn set(mut self, key: &str, value: ScalarValue) -> Self {
         self.options.set(key, &value.to_string()).unwrap();
@@ -346,16 +437,6 @@ impl SessionConfig {
         &mut self.options
     }
 
-    /// Return a handle to the configuration options.
-    pub fn options(&self) -> &ConfigOptions {
-        &self.options
-    }
-
-    /// Return a mutable handle to the configuration options.
-    pub fn options_mut(&mut self) -> &mut ConfigOptions {
-        &mut self.options
-    }
-
     /// Add extensions.
     ///
     /// Extensions can be used to attach extra data to the session config -- 
e.g. tracing information or caches.
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt 
b/datafusion/sqllogictest/test_files/information_schema.slt
index 14ef290294..f8893bf7ae 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -248,7 +248,7 @@ 
datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 By def
 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 By default 
parallel parquet writer is tuned for minimum memory usage in a streaming 
execution plan. You may see a performance benefit when writing large parquet 
files by increasing maximum_parallel_row_group_writers and 
maximum_buffered_record_batches_per_stream if your system has idle cores and 
can tolerate additional memory usage. Boosting these values is likely 
worthwhile when writing out already in-memory data, such as [...]
 datafusion.execution.parquet.metadata_size_hint NULL If specified, the parquet 
reader will try and fetch the last `size_hint` bytes of the parquet file 
optimistically. If not specified, two reads are required: One read to fetch the 
8-byte parquet footer and another to fetch the metadata length encoded in the 
footer
 datafusion.execution.parquet.pruning true If true, the parquet reader attempts 
to skip entire row groups based on the predicate in the query and the metadata 
(min/max values) stored in the parquet file
-datafusion.execution.parquet.pushdown_filters false If true, filter 
expressions are be applied during the parquet decoding operation to reduce the 
number of rows decoded
+datafusion.execution.parquet.pushdown_filters false If true, filter 
expressions are be applied during the parquet decoding operation to reduce the 
number of rows decoded. This optimization is sometimes called "late 
materialization".
 datafusion.execution.parquet.reorder_filters false If true, filter expressions 
evaluated during the parquet decoding operation will be reordered heuristically 
to minimize the cost of evaluation. If false, the filters are applied in the 
same order as written in the query
 datafusion.execution.parquet.skip_metadata true If true, the parquet reader 
skip the optional embedded metadata that may be in the file Schema. This 
setting can help avoid schema conflicts when querying multiple parquet files 
with schemas containing compatible types but different metadata
 datafusion.execution.parquet.statistics_enabled NULL Sets if statistics are 
enabled for any column Valid values are: "none", "chunk", and "page" These 
values are not case sensitive. If NULL, uses default parquet writer setting
diff --git a/docs/source/user-guide/configs.md 
b/docs/source/user-guide/configs.md
index 4a379d374c..7111ea1d0a 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -53,7 +53,7 @@ Environment variables are read during `SessionConfig` 
initialisation so they mus
 | datafusion.execution.parquet.pruning                                    | 
true                      | If true, the parquet reader attempts to skip entire 
row groups based on the predicate in the query and the metadata (min/max 
values) stored in the parquet file                                              
                                                                                
                                                                                
                        [...]
 | datafusion.execution.parquet.skip_metadata                              | 
true                      | If true, the parquet reader skip the optional 
embedded metadata that may be in the file Schema. This setting can help avoid 
schema conflicts when querying multiple parquet files with schemas containing 
compatible types but different metadata                                         
                                                                                
                           [...]
 | datafusion.execution.parquet.metadata_size_hint                         | 
NULL                      | If specified, the parquet reader will try and fetch 
the last `size_hint` bytes of the parquet file optimistically. If not 
specified, two reads are required: One read to fetch the 8-byte parquet footer 
and another to fetch the metadata length encoded in the footer                  
                                                                                
                            [...]
-| datafusion.execution.parquet.pushdown_filters                           | 
false                     | If true, filter expressions are be applied during 
the parquet decoding operation to reduce the number of rows decoded             
                                                                                
                                                                                
                                                                                
                   [...]
+| datafusion.execution.parquet.pushdown_filters                           | 
false                     | If true, filter expressions are be applied during 
the parquet decoding operation to reduce the number of rows decoded. This 
optimization is sometimes called "late materialization".                        
                                                                                
                                                                                
                         [...]
 | datafusion.execution.parquet.reorder_filters                            | 
false                     | If true, filter expressions evaluated during the 
parquet decoding operation will be reordered heuristically to minimize the cost 
of evaluation. If false, the filters are applied in the same order as written 
in the query                                                                    
                                                                                
                      [...]
 | datafusion.execution.parquet.data_pagesize_limit                        | 
1048576                   | Sets best effort maximum size of data page in bytes 
                                                                                
                                                                                
                                                                                
                                                                                
                 [...]
 | datafusion.execution.parquet.write_batch_size                           | 
1024                      | Sets write_batch_size in bytes                      
                                                                                
                                                                                
                                                                                
                                                                                
                 [...]

(arrow-datafusion) branch main updated: docs: document SessionConfig (#8771)

Reply via email to