This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/arrow-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 82c0b10e32d DataFusion 16.0.0 documentation (#301)
82c0b10e32d is described below
commit 82c0b10e32d28a32294c1dab93c7e3e0a7479087
Author: Andy Grove <[email protected]>
AuthorDate: Sun Jan 22 04:56:05 2023 -0700
DataFusion 16.0.0 documentation (#301)
* DataFusion 16.0.0 documentation
* remove _sources
---
datafusion/.gitignore | 1 +
.../capitalized_example.csv | 5 +
.../9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv | 2 +
datafusion/contributor-guide/roadmap.html | 6 +-
datafusion/objects.inv | Bin 5023 -> 5049 bytes
datafusion/searchindex.js | 2 +-
datafusion/user-guide/cli.html | 60 +++++---
datafusion/user-guide/configs.html | 152 +++++++++++----------
datafusion/user-guide/dataframe.html | 2 +-
datafusion/user-guide/example-usage.html | 12 +-
10 files changed, 146 insertions(+), 96 deletions(-)
diff --git a/datafusion/.gitignore b/datafusion/.gitignore
new file mode 100644
index 00000000000..97d0badbd31
--- /dev/null
+++ b/datafusion/.gitignore
@@ -0,0 +1 @@
+_sources
\ No newline at end of file
diff --git
a/datafusion/_downloads/3cce4d737d8c5814f5b50d859d21ba53/capitalized_example.csv
b/datafusion/_downloads/3cce4d737d8c5814f5b50d859d21ba53/capitalized_example.csv
new file mode 100644
index 00000000000..dbc8f5c5a0a
--- /dev/null
+++
b/datafusion/_downloads/3cce4d737d8c5814f5b50d859d21ba53/capitalized_example.csv
@@ -0,0 +1,5 @@
+A,b,c
+1,2,3
+1,10,5
+2,5,6
+2,1,4
\ No newline at end of file
diff --git a/datafusion/_downloads/9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv
b/datafusion/_downloads/9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv
new file mode 100644
index 00000000000..0eadb69396b
--- /dev/null
+++ b/datafusion/_downloads/9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv
@@ -0,0 +1,2 @@
+a,b,c
+1,2,3
\ No newline at end of file
diff --git a/datafusion/contributor-guide/roadmap.html
b/datafusion/contributor-guide/roadmap.html
index 5516bc76e6c..0e85413f829 100644
--- a/datafusion/contributor-guide/roadmap.html
+++ b/datafusion/contributor-guide/roadmap.html
@@ -421,7 +421,7 @@ to provide:</p>
<h3>Query Optimizer<a class="headerlink" href="#query-optimizer"
title="Permalink to this heading">¶</a></h3>
<ul class="simple">
<li><p>More sophisticated cost based optimizer for join ordering</p></li>
-<li><p>Implement advanced query optimization framework (Tokomak) #440</p></li>
+<li><p>Implement advanced query optimization framework (Tokomak) <a
class="reference external"
href="https://github.com/apache/arrow-datafusion/issues/440">#440</a></p></li>
<li><p>Finer optimizations for group by and aggregate functions</p></li>
</ul>
</section>
@@ -436,8 +436,8 @@ to provide:</p>
<h3>Runtime / Infrastructure<a class="headerlink"
href="#runtime-infrastructure" title="Permalink to this heading">¶</a></h3>
<ul class="simple">
<li><p>Migrate to some sort of arrow2 based implementation (see <a
class="reference external"
href="https://github.com/apache/arrow-datafusion/milestone/3">milestone</a> for
more details)</p></li>
-<li><p>Add DataFusion to h2oai/db-benchmark <a class="reference external"
href="https://github.com/apache/arrow-datafusion/issues/147">147</a></p></li>
-<li><p>Improve build time <a class="reference external"
href="https://github.com/apache/arrow-datafusion/issues/348">348</a></p></li>
+<li><p>Add DataFusion to h2oai/db-benchmark <a class="reference external"
href="https://github.com/apache/arrow-datafusion/issues/147">#147</a></p></li>
+<li><p>Improve build time <a class="reference external"
href="https://github.com/apache/arrow-datafusion/issues/348">#348</a></p></li>
</ul>
</section>
<section id="resource-management">
diff --git a/datafusion/objects.inv b/datafusion/objects.inv
index ce6713b92ca..0b79f2c92d9 100644
Binary files a/datafusion/objects.inv and b/datafusion/objects.inv differ
diff --git a/datafusion/searchindex.js b/datafusion/searchindex.js
index a052178f56e..0323e771313 100644
--- a/datafusion/searchindex.js
+++ b/datafusion/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["contributor-guide/communication",
"contributor-guide/index", "contributor-guide/quarterly_roadmap",
"contributor-guide/roadmap", "contributor-guide/specification/index",
"contributor-guide/specification/invariants",
"contributor-guide/specification/output-field-name-semantic", "index",
"user-guide/cli", "user-guide/configs", "user-guide/dataframe",
"user-guide/example-usage", "user-guide/expressions", "user-guide/faq",
"user-guide/introduction", "user-guide [...]
\ No newline at end of file
+Search.setIndex({"docnames": ["contributor-guide/communication",
"contributor-guide/index", "contributor-guide/quarterly_roadmap",
"contributor-guide/roadmap", "contributor-guide/specification/index",
"contributor-guide/specification/invariants",
"contributor-guide/specification/output-field-name-semantic", "index",
"user-guide/cli", "user-guide/configs", "user-guide/dataframe",
"user-guide/example-usage", "user-guide/expressions", "user-guide/faq",
"user-guide/introduction", "user-guide [...]
\ No newline at end of file
diff --git a/datafusion/user-guide/cli.html b/datafusion/user-guide/cli.html
index 87c27135e67..d30c06e18c3 100644
--- a/datafusion/user-guide/cli.html
+++ b/datafusion/user-guide/cli.html
@@ -294,6 +294,11 @@
Usage
</a>
</li>
+ <li class="toc-h2 nav-item toc-entry">
+ <a class="reference internal nav-link" href="#selecting-files-directly">
+ Selecting files directly
+ </a>
+ </li>
<li class="toc-h2 nav-item toc-entry">
<a class="reference internal nav-link"
href="#registering-parquet-data-sources">
Registering Parquet Data Sources
@@ -367,26 +372,43 @@
-->
<section id="datafusion-command-line-sql-utility">
<h1>DataFusion Command-line SQL Utility<a class="headerlink"
href="#datafusion-command-line-sql-utility" title="Permalink to this
heading">¶</a></h1>
-<p>The DataFusion CLI is a command-line interactive SQL utility that allows
-queries to be executed against any supported data files. It is a convenient
way to
+<p>The DataFusion CLI is a command-line interactive SQL utility for executing
+queries against any supported data files. It is a convenient way to
try DataFusion out with your own data sources, and test out its SQL
support.</p>
<section id="example">
<h2>Example<a class="headerlink" href="#example" title="Permalink to this
heading">¶</a></h2>
<p>Create a CSV file to query.</p>
-<div class="highlight-bash notranslate"><div
class="highlight"><pre><span></span>$ <span class="nb">echo</span> <span
class="s2">"1,2"</span> > data.csv
+<div class="highlight-shell notranslate"><div
class="highlight"><pre><span></span>$ <span class="nb">echo</span> <span
class="s2">"a,b"</span> > data.csv
+$ <span class="nb">echo</span> <span class="s2">"1,2"</span>
>> data.csv
</pre></div>
</div>
-<div class="highlight-bash notranslate"><div
class="highlight"><pre><span></span>$ datafusion-cli
-DataFusion CLI v12.0.0
-❯ CREATE EXTERNAL TABLE foo STORED AS CSV LOCATION <span
class="s1">'data.csv'</span><span class="p">;</span>
-<span class="m">0</span> rows <span class="k">in</span> set. Query took <span
class="m">0</span>.017 seconds.
-❯ <span class="k">select</span> * from foo<span class="p">;</span>
-+----------+----------+
-<span class="p">|</span> column_1 <span class="p">|</span> column_2 <span
class="p">|</span>
-+----------+----------+
-<span class="p">|</span> <span class="m">1</span> <span
class="p">|</span> <span class="m">2</span> <span class="p">|</span>
-+----------+----------+
-<span class="m">1</span> row <span class="k">in</span> set. Query took <span
class="m">0</span>.012 seconds.
+<p>Query that single file (the CLI also supports parquet, compressed csv,
avro, json and more)</p>
+<div class="highlight-shell notranslate"><div
class="highlight"><pre><span></span>$ datafusion-cli
+DataFusion CLI v16.0.0
+❯ <span class="k">select</span> * from <span
class="s1">'data.csv'</span><span class="p">;</span>
++---+---+
+<span class="p">|</span> a <span class="p">|</span> b <span class="p">|</span>
++---+---+
+<span class="p">|</span> <span class="m">1</span> <span class="p">|</span>
<span class="m">2</span> <span class="p">|</span>
++---+---+
+<span class="m">1</span> row <span class="k">in</span> set. Query took <span
class="m">0</span>.007 seconds.
+</pre></div>
+</div>
+<p>You can also query directories of files with compatible schemas:</p>
+<div class="highlight-shell notranslate"><div
class="highlight"><pre><span></span>$ ls data_dir/
+data.csv data2.csv
+</pre></div>
+</div>
+<div class="highlight-shell notranslate"><div
class="highlight"><pre><span></span>$ datafusion-cli
+DataFusion CLI v16.0.0
+❯ <span class="k">select</span> * from <span
class="s1">'data_dir'</span><span class="p">;</span>
++---+---+
+<span class="p">|</span> a <span class="p">|</span> b <span class="p">|</span>
++---+---+
+<span class="p">|</span> <span class="m">3</span> <span class="p">|</span>
<span class="m">4</span> <span class="p">|</span>
+<span class="p">|</span> <span class="m">1</span> <span class="p">|</span>
<span class="m">2</span> <span class="p">|</span>
++---+---+
+<span class="m">2</span> rows <span class="k">in</span> set. Query took <span
class="m">0</span>.007 seconds.
</pre></div>
</div>
</section>
@@ -430,6 +452,7 @@ docker run -it -v <span
class="k">$(</span>your_data_location<span class="k">)</
</section>
<section id="usage">
<h2>Usage<a class="headerlink" href="#usage" title="Permalink to this
heading">¶</a></h2>
+<p>See the current usage using <code class="docutils literal
notranslate"><span class="pre">datafusion-cli</span> <span
class="pre">--help</span></code>:</p>
<div class="highlight-bash notranslate"><div
class="highlight"><pre><span></span>Apache Arrow <[email protected]>
Command Line Client <span class="k">for</span> DataFusion query engine.
@@ -446,11 +469,16 @@ OPTIONS:
-q, --quiet Reduce printing other than the results
and work quietly
-r, --rc <RC>... Run the provided files on startup
instead of ~/.datafusionrc
-V, --version Print version information
-
-Type <span class="sb">`</span><span class="nb">exit</span><span
class="sb">`</span> or <span class="sb">`</span>quit<span class="sb">`</span>
to <span class="nb">exit</span> the CLI.
</pre></div>
</div>
</section>
+<section id="selecting-files-directly">
+<h2>Selecting files directly<a class="headerlink"
href="#selecting-files-directly" title="Permalink to this heading">¶</a></h2>
+<p>Files can be queried directly by enclosing the file or
+directory name in single <code class="docutils literal notranslate"><span
class="pre">'</span></code> quotes as shown in the example.</p>
+<p>It is also possible to create a table backed by files explicitly
+via <code class="docutils literal notranslate"><span class="pre">CREATE</span>
<span class="pre">EXTERNAL</span> <span class="pre">TABLE</span></code> as
shown below.</p>
+</section>
<section id="registering-parquet-data-sources">
<h2>Registering Parquet Data Sources<a class="headerlink"
href="#registering-parquet-data-sources" title="Permalink to this
heading">¶</a></h2>
<p>Parquet data sources can be registered by executing a <code class="docutils
literal notranslate"><span class="pre">CREATE</span> <span
class="pre">EXTERNAL</span> <span class="pre">TABLE</span></code> SQL
statement. It is not necessary to provide schema information for Parquet
files.</p>
diff --git a/datafusion/user-guide/configs.html
b/datafusion/user-guide/configs.html
index 3ae271b6fb0..b340d102208 100644
--- a/datafusion/user-guide/configs.html
+++ b/datafusion/user-guide/configs.html
@@ -321,116 +321,130 @@ Environment variables are read during <code
class="docutils literal notranslate"
<table class="table">
<thead>
<tr class="row-odd"><th class="head"><p>key</p></th>
-<th class="head"><p>type</p></th>
<th class="head"><p>default</p></th>
<th class="head"><p>description</p></th>
</tr>
</thead>
<tbody>
+<tr
class="row-even"><td><p>datafusion.catalog.create_default_catalog_and_schema</p></td>
+<td><p>true</p></td>
+<td><p>Whether the default catalog and schema should be created
automatically</p></td>
+</tr>
+<tr class="row-odd"><td><p>datafusion.catalog.default_catalog</p></td>
+<td><p>datafusion</p></td>
+<td><p>The default catalog name - this impacts what SQL queries use if not
specified</p></td>
+</tr>
+<tr class="row-even"><td><p>datafusion.catalog.default_schema</p></td>
+<td><p>public</p></td>
+<td><p>The default schema name - this impacts what SQL queries use if not
specified</p></td>
+</tr>
+<tr class="row-odd"><td><p>datafusion.catalog.information_schema</p></td>
+<td><p>false</p></td>
+<td><p>Should DataFusion provide access to <code class="docutils literal
notranslate"><span class="pre">information_schema</span></code> virtual tables
for displaying schema information</p></td>
+</tr>
<tr class="row-even"><td><p>datafusion.catalog.location</p></td>
-<td><p>Utf8</p></td>
<td><p>NULL</p></td>
-<td><p>Location scanned to load tables for <code class="docutils literal
notranslate"><span class="pre">default</span></code> schema, defaults to
None</p></td>
+<td><p>Location scanned to load tables for <code class="docutils literal
notranslate"><span class="pre">default</span></code> schema</p></td>
</tr>
-<tr class="row-odd"><td><p>datafusion.catalog.type</p></td>
-<td><p>Utf8</p></td>
+<tr class="row-odd"><td><p>datafusion.catalog.format</p></td>
<td><p>NULL</p></td>
-<td><p>Type of <code class="docutils literal notranslate"><span
class="pre">TableProvider</span></code> to use when loading <code
class="docutils literal notranslate"><span class="pre">default</span></code>
schema. Defaults to None</p></td>
+<td><p>Type of <code class="docutils literal notranslate"><span
class="pre">TableProvider</span></code> to use when loading <code
class="docutils literal notranslate"><span class="pre">default</span></code>
schema</p></td>
</tr>
-<tr class="row-even"><td><p>datafusion.execution.batch_size</p></td>
-<td><p>UInt64</p></td>
+<tr class="row-even"><td><p>datafusion.catalog.has_header</p></td>
+<td><p>false</p></td>
+<td><p>If the file has a header</p></td>
+</tr>
+<tr class="row-odd"><td><p>datafusion.execution.batch_size</p></td>
<td><p>8192</p></td>
-<td><p>Default batch size while creating new batches, it’s especially useful
for buffer-in-memory batches since creating tiny batches would results in too
much metadata memory consumption.</p></td>
+<td><p>Default batch size while creating new batches, it’s especially useful
for buffer-in-memory batches since creating tiny batches would result in too
much metadata memory consumption</p></td>
</tr>
-<tr class="row-odd"><td><p>datafusion.execution.coalesce_batches</p></td>
-<td><p>Boolean</p></td>
+<tr class="row-even"><td><p>datafusion.execution.coalesce_batches</p></td>
<td><p>true</p></td>
-<td><p>When set to true, record batches will be examined between each operator
and small batches will be coalesced into larger batches. This is helpful when
there are highly selective filters or joins that could produce tiny output
batches. The target batch size is determined by the configuration setting
‘datafusion.execution.coalesce_target_batch_size’.</p></td>
-</tr>
-<tr
class="row-even"><td><p>datafusion.execution.coalesce_target_batch_size</p></td>
-<td><p>UInt64</p></td>
-<td><p>4096</p></td>
-<td><p>Target batch size when coalescing batches. Uses in conjunction with the
configuration setting ‘datafusion.execution.coalesce_batches’.</p></td>
+<td><p>When set to true, record batches will be examined between each operator
and small batches will be coalesced into larger batches. This is helpful when
there are highly selective filters or joins that could produce tiny output
batches. The target batch size is determined by the configuration
setting ‘datafusion.execution.batch_size’</p></td>
</tr>
-<tr
class="row-odd"><td><p>datafusion.execution.parquet.enable_page_index</p></td>
-<td><p>Boolean</p></td>
+<tr class="row-odd"><td><p>datafusion.execution.collect_statistics</p></td>
<td><p>false</p></td>
-<td><p>If true, uses parquet data page level metadata (Page Index) statistics
to reduce the number of rows decoded.</p></td>
+<td><p>Should DataFusion collect statistics after listing files</p></td>
</tr>
-<tr
class="row-even"><td><p>datafusion.execution.parquet.metadata_size_hint</p></td>
-<td><p>UInt64</p></td>
-<td><p>NULL</p></td>
-<td><p>If specified, the parquet reader will try and fetch the last <code
class="docutils literal notranslate"><span class="pre">size_hint</span></code>
bytes of the parquet file optimistically. If not specified, two read are
required: One read to fetch the 8-byte parquet footer and another to fetch the
metadata length encoded in the footer.</p></td>
+<tr class="row-even"><td><p>datafusion.execution.target_partitions</p></td>
+<td><p>0</p></td>
+<td><p>Number of partitions for query execution. Increasing partitions can
increase concurrency. Defaults to the number of cpu cores on the system</p></td>
</tr>
-<tr class="row-odd"><td><p>datafusion.execution.parquet.pruning</p></td>
-<td><p>Boolean</p></td>
-<td><p>true</p></td>
-<td><p>If true, the parquet reader attempts to skip entire row groups based on
the predicate in the query and the metadata (min/max values) stored in the
parquet file.</p></td>
+<tr class="row-odd"><td><p>datafusion.execution.time_zone</p></td>
+<td><p>+00:00</p></td>
+<td><p>The default time zone. Some functions, e.g. EXTRACT(HOUR from
SOME_TIME), shift the underlying datetime according to this time zone, and then
extract the hour</p></td>
</tr>
-<tr
class="row-even"><td><p>datafusion.execution.parquet.pushdown_filters</p></td>
-<td><p>Boolean</p></td>
+<tr
class="row-even"><td><p>datafusion.execution.parquet.enable_page_index</p></td>
<td><p>false</p></td>
-<td><p>If true, filter expressions are be applied during the parquet decoding
operation to reduce the number of rows decoded.</p></td>
+<td><p>If true, uses parquet data page level metadata (Page Index) statistics
to reduce the number of rows decoded.</p></td>
</tr>
-<tr
class="row-odd"><td><p>datafusion.execution.parquet.reorder_filters</p></td>
-<td><p>Boolean</p></td>
-<td><p>false</p></td>
-<td><p>If true, filter expressions evaluated during the parquet decoding
opearation will be reordered heuristically to minimize the cost of evaluation.
If false, the filters are applied in the same order as written in the
query.</p></td>
+<tr class="row-odd"><td><p>datafusion.execution.parquet.pruning</p></td>
+<td><p>true</p></td>
+<td><p>If true, the parquet reader attempts to skip entire row groups based on
the predicate in the query and the metadata (min/max values) stored in the
parquet file</p></td>
</tr>
<tr class="row-even"><td><p>datafusion.execution.parquet.skip_metadata</p></td>
-<td><p>Boolean</p></td>
<td><p>true</p></td>
-<td><p>If true, the parquet reader skip the optional embedded metadata that
may be in the file Schema. This setting can help avoid schema conflicts when
querying multiple parquet files with schemas containing compatible types but
different metadata.</p></td>
-</tr>
-<tr class="row-odd"><td><p>datafusion.execution.time_zone</p></td>
-<td><p>Utf8</p></td>
-<td><p>+00:00</p></td>
-<td><p>The session time zone which some function require e.g. EXTRACT(HOUR
from SOME_TIME) shift the underline datetime according to the time
zone,</p></td>
+<td><p>If true, the parquet reader skips the optional embedded metadata that
may be in the file Schema. This setting can help avoid schema conflicts when
querying multiple parquet files with schemas containing compatible types but
different metadata</p></td>
</tr>
-<tr class="row-even"><td><p>then extract the hour.</p></td>
-<td><p></p></td>
-<td><p></p></td>
-<td><p></p></td>
+<tr
class="row-odd"><td><p>datafusion.execution.parquet.metadata_size_hint</p></td>
+<td><p>NULL</p></td>
+<td><p>If specified, the parquet reader will try and fetch the last <code
class="docutils literal notranslate"><span class="pre">size_hint</span></code>
bytes of the parquet file optimistically. If not specified, two reads are
required: One read to fetch the 8-byte parquet footer and another to fetch the
metadata length encoded in the footer</p></td>
</tr>
-<tr class="row-odd"><td><p>datafusion.explain.logical_plan_only</p></td>
-<td><p>Boolean</p></td>
+<tr
class="row-even"><td><p>datafusion.execution.parquet.pushdown_filters</p></td>
<td><p>false</p></td>
-<td><p>When set to true, the explain statement will only print logical
plans.</p></td>
+<td><p>If true, filter expressions are applied during the parquet decoding
operation to reduce the number of rows decoded</p></td>
</tr>
-<tr class="row-even"><td><p>datafusion.explain.physical_plan_only</p></td>
-<td><p>Boolean</p></td>
+<tr
class="row-odd"><td><p>datafusion.execution.parquet.reorder_filters</p></td>
<td><p>false</p></td>
-<td><p>When set to true, the explain statement will only print physical
plans.</p></td>
+<td><p>If true, filter expressions evaluated during the parquet decoding
operation will be reordered heuristically to minimize the cost of evaluation.
If false, the filters are applied in the same order as written in the
query</p></td>
+</tr>
+<tr
class="row-even"><td><p>datafusion.optimizer.enable_round_robin_repartition</p></td>
+<td><p>true</p></td>
+<td><p>When set to true, the physical plan optimizer will try to add round
robin repartition to increase parallelism to leverage more CPU cores</p></td>
</tr>
<tr class="row-odd"><td><p>datafusion.optimizer.filter_null_join_keys</p></td>
-<td><p>Boolean</p></td>
<td><p>false</p></td>
<td><p>When set to true, the optimizer will insert filters before a join
between a nullable and non-nullable column to filter out nulls on the nullable
side. This filter can add additional overhead when the file format does not
fully support predicate push down.</p></td>
</tr>
-<tr
class="row-even"><td><p>datafusion.optimizer.hash_join_single_partition_threshold</p></td>
-<td><p>UInt64</p></td>
-<td><p>1048576</p></td>
-<td><p>The maximum estimated size in bytes for one input side of a HashJoin
will be collected into a single partition</p></td>
+<tr
class="row-even"><td><p>datafusion.optimizer.repartition_aggregations</p></td>
+<td><p>true</p></td>
+<td><p>Should DataFusion repartition data using the aggregate keys to execute
aggregates in parallel using the provided <code class="docutils literal
notranslate"><span class="pre">target_partitions</span></code> level</p></td>
</tr>
-<tr class="row-odd"><td><p>datafusion.optimizer.max_passes</p></td>
-<td><p>UInt64</p></td>
-<td><p>3</p></td>
-<td><p>Number of times that the optimizer will attempt to optimize the
plan</p></td>
+<tr class="row-odd"><td><p>datafusion.optimizer.repartition_joins</p></td>
+<td><p>true</p></td>
+<td><p>Should DataFusion repartition data using the join keys to execute joins
in parallel using the provided <code class="docutils literal notranslate"><span
class="pre">target_partitions</span></code> level</p></td>
</tr>
-<tr class="row-even"><td><p>datafusion.optimizer.prefer_hash_join</p></td>
-<td><p>Boolean</p></td>
+<tr class="row-even"><td><p>datafusion.optimizer.repartition_windows</p></td>
<td><p>true</p></td>
-<td><p>When set to true, the physical plan optimizer will prefer HashJoin over
SortMergeJoin. HashJoin can work more efficientlythan SortMergeJoin but
consumes more memory. Defaults to true</p></td>
+<td><p>Should DataFusion repartition data using the partition keys to execute
window functions in parallel using the provided <code class="docutils literal
notranslate"><span class="pre">target_partitions</span></code> level</p></td>
</tr>
<tr class="row-odd"><td><p>datafusion.optimizer.skip_failed_rules</p></td>
-<td><p>Boolean</p></td>
<td><p>true</p></td>
-<td><p>When set to true, the logical plan optimizer will produce warning
messages if any optimization rules produce errors and then proceed to the next
rule. When set to false, any rules that produce errors will cause the query to
fail.</p></td>
+<td><p>When set to true, the logical plan optimizer will produce warning
messages if any optimization rules produce errors and then proceed to the next
rule. When set to false, any rules that produce errors will cause the query to
fail</p></td>
+</tr>
+<tr class="row-even"><td><p>datafusion.optimizer.max_passes</p></td>
+<td><p>3</p></td>
+<td><p>Number of times that the optimizer will attempt to optimize the
plan</p></td>
+</tr>
+<tr
class="row-odd"><td><p>datafusion.optimizer.top_down_join_key_reordering</p></td>
+<td><p>true</p></td>
+<td><p>When set to true, the physical plan optimizer will run a top down
process to reorder the join keys</p></td>
</tr>
-<tr
class="row-even"><td><p>datafusion.optimizer.top_down_join_key_reordering</p></td>
-<td><p>Boolean</p></td>
+<tr class="row-even"><td><p>datafusion.optimizer.prefer_hash_join</p></td>
<td><p>true</p></td>
-<td><p>When set to true, the physical plan optimizer will run a top down
process to reorder the join keys. Defaults to true</p></td>
+<td><p>When set to true, the physical plan optimizer will prefer HashJoin over
SortMergeJoin. HashJoin can work more efficiently than SortMergeJoin but
consumes more memory</p></td>
+</tr>
+<tr
class="row-odd"><td><p>datafusion.optimizer.hash_join_single_partition_threshold</p></td>
+<td><p>1048576</p></td>
+<td><p>The maximum estimated size in bytes for one input side of a HashJoin
will be collected into a single partition</p></td>
+</tr>
+<tr class="row-even"><td><p>datafusion.explain.logical_plan_only</p></td>
+<td><p>false</p></td>
+<td><p>When set to true, the explain statement will only print logical
plans</p></td>
+</tr>
+<tr class="row-odd"><td><p>datafusion.explain.physical_plan_only</p></td>
+<td><p>false</p></td>
+<td><p>When set to true, the explain statement will only print physical
plans</p></td>
</tr>
</tbody>
</table>
diff --git a/datafusion/user-guide/dataframe.html
b/datafusion/user-guide/dataframe.html
index 69afc2a941d..8ac113dc540 100644
--- a/datafusion/user-guide/dataframe.html
+++ b/datafusion/user-guide/dataframe.html
@@ -338,7 +338,7 @@ to build up a query definition.</p>
</div>
<p>Here is a minimal example showing the execution of a query using the
DataFrame API.</p>
<div class="highlight-rust notranslate"><div
class="highlight"><pre><span></span><span class="kd">let</span><span class="w">
</span><span class="n">ctx</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span
class="n">SessionContext</span>::<span class="n">new</span><span
class="p">();</span><span class="w"></span>
-<span class="kd">let</span><span class="w"> </span><span
class="n">df</span><span class="w"> </span><span class="o">=</span><span
class="w"> </span><span class="n">ctx</span><span class="p">.</span><span
class="n">read_csv</span><span class="p">(</span><span
class="s">"tests/example.csv"</span><span class="p">,</span><span
class="w"> </span><span class="n">CsvReadOptions</span>::<span
class="n">new</span><span class="p">()).</span><span
class="k">await</span><span class="o">?</ [...]
+<span class="kd">let</span><span class="w"> </span><span
class="n">df</span><span class="w"> </span><span class="o">=</span><span
class="w"> </span><span class="n">ctx</span><span class="p">.</span><span
class="n">read_csv</span><span class="p">(</span><span
class="s">"tests/data/example.csv"</span><span
class="p">,</span><span class="w"> </span><span
class="n">CsvReadOptions</span>::<span class="n">new</span><span
class="p">()).</span><span class="k">await</span><span class="o [...]
<span class="kd">let</span><span class="w"> </span><span
class="n">df</span><span class="w"> </span><span class="o">=</span><span
class="w"> </span><span class="n">df</span><span class="p">.</span><span
class="n">filter</span><span class="p">(</span><span class="n">col</span><span
class="p">(</span><span class="s">"a"</span><span
class="p">).</span><span class="n">lt_eq</span><span class="p">(</span><span
class="n">col</span><span class="p">(</span><span class="s">"b" [...]
<span class="w"> </span><span class="p">.</span><span
class="n">aggregate</span><span class="p">(</span><span
class="fm">vec!</span><span class="p">[</span><span class="n">col</span><span
class="p">(</span><span class="s">"a"</span><span
class="p">)],</span><span class="w"> </span><span class="fm">vec!</span><span
class="p">[</span><span class="n">min</span><span class="p">(</span><span
class="n">col</span><span class="p">(</span><span
class="s">"b"</span><s [...]
<span class="w"> </span><span class="p">.</span><span
class="n">limit</span><span class="p">(</span><span class="mi">0</span><span
class="p">,</span><span class="w"> </span><span class="nb">Some</span><span
class="p">(</span><span class="mi">100</span><span class="p">))</span><span
class="o">?</span><span class="p">;</span><span class="w"></span>
diff --git a/datafusion/user-guide/example-usage.html
b/datafusion/user-guide/example-usage.html
index e3b40604abc..a997783dfbf 100644
--- a/datafusion/user-guide/example-usage.html
+++ b/datafusion/user-guide/example-usage.html
@@ -364,7 +364,7 @@
-->
<section id="example-usage">
<h1>Example Usage<a class="headerlink" href="#example-usage" title="Permalink
to this heading">¶</a></h1>
-<p>In this example some simple processing is performed on the <a
class="reference download internal" download=""
href="../_downloads/93d189c7b5c72cedb29eeeb76cf44221/example.csv"><span
class="xref download myst"><code class="docutils literal notranslate"><span
class="pre">example.csv</span></code></span></a> file.</p>
+<p>In this example some simple processing is performed on the <a
class="reference download internal" download=""
href="../_downloads/9f6fbc67bd5c63cb1fd7ba4efdf82d7a/example.csv"><span
class="xref download myst"><code class="docutils literal notranslate"><span
class="pre">example.csv</span></code></span></a> file.</p>
<section id="update-cargo-toml">
<h2>Update <code class="docutils literal notranslate"><span
class="pre">Cargo.toml</span></code><a class="headerlink"
href="#update-cargo-toml" title="Permalink to this heading">¶</a></h2>
<p>Add the following to your <code class="docutils literal notranslate"><span
class="pre">Cargo.toml</span></code> file:</p>
@@ -381,7 +381,7 @@
<span class="k">async</span><span class="w"> </span><span class="k">fn</span>
<span class="nf">main</span><span class="p">()</span><span class="w">
</span>-> <span class="nc">datafusion</span>::<span
class="n">error</span>::<span class="nb">Result</span><span
class="o"><</span><span class="p">()</span><span class="o">></span><span
class="w"> </span><span class="p">{</span><span class="w"></span>
<span class="w"> </span><span class="c1">// register the table</span>
<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">ctx</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span
class="n">SessionContext</span>::<span class="n">new</span><span
class="p">();</span><span class="w"></span>
-<span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">register_csv</span><span
class="p">(</span><span class="s">"example"</span><span
class="p">,</span><span class="w"> </span><span
class="s">"tests/example.csv"</span><span class="p">,</span><span
class="w"> </span><span class="n">CsvReadOptions</span>::<span
class="n">new</span><span class="p">()).</span><span
class="k">await</span><span class="o">?</span><span class="p">;</span> [...]
+<span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">register_csv</span><span
class="p">(</span><span class="s">"example"</span><span
class="p">,</span><span class="w"> </span><span
class="s">"tests/data/example.csv"</span><span
class="p">,</span><span class="w"> </span><span
class="n">CsvReadOptions</span>::<span class="n">new</span><span
class="p">()).</span><span class="k">await</span><span class="o">?</span><span
class="p">;</ [...]
<span class="w"> </span><span class="c1">// create a plan to run a SQL
query</span>
<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">sql</span><span class="p">(</span><span
class="s">"SELECT a, MIN(b) FROM example GROUP BY a LIMIT
100"</span><span class="p">).</span><span class="k">await</span><span
class="o">?</span><span class="p">;</span><span class="w"></span>
@@ -401,7 +401,7 @@
<span class="k">async</span><span class="w"> </span><span class="k">fn</span>
<span class="nf">main</span><span class="p">()</span><span class="w">
</span>-> <span class="nc">datafusion</span>::<span
class="n">error</span>::<span class="nb">Result</span><span
class="o"><</span><span class="p">()</span><span class="o">></span><span
class="w"> </span><span class="p">{</span><span class="w"></span>
<span class="w"> </span><span class="c1">// create the dataframe</span>
<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">ctx</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span
class="n">SessionContext</span>::<span class="n">new</span><span
class="p">();</span><span class="w"></span>
-<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span
class="s">"tests/example.csv"</span><span class="p">,</span><span
class="w"> </span><span class="n">CsvReadOptions</span>::<span
class="n">new</span><span class="p">()).</span><span class="k">await< [...]
+<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span
class="s">"tests/data/example.csv"</span><span
class="p">,</span><span class="w"> </span><span
class="n">CsvReadOptions</span>::<span class="n">new</span><span
class="p">()).</span><span class="k">a [...]
<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span class="n">df</span><span
class="p">.</span><span class="n">filter</span><span class="p">(</span><span
class="n">col</span><span class="p">(</span><span
class="s">"a"</span><span class="p">).</span><span
class="n">lt_eq</span><span class="p">(</span><span class="n">col</span><span
class="p">(</span><spa [...]
<span class="w"> </span><span class="p">.</span><span
class="n">aggregate</span><span class="p">(</span><span
class="fm">vec!</span><span class="p">[</span><span class="n">col</span><span
class="p">(</span><span class="s">"a"</span><span
class="p">)],</span><span class="w"> </span><span class="fm">vec!</span><span
class="p">[</span><span class="n">min</span><span class="p">(</span><span
class="n">col</span><span class="p">(</span><span
class="s">"b"</span><s [...]
@@ -428,7 +428,7 @@
<section id="identifiers-and-capitalization">
<h1>Identifiers and Capitalization<a class="headerlink"
href="#identifiers-and-capitalization" title="Permalink to this
heading">¶</a></h1>
<p>Please be aware that all identifiers are effectively made lower-case in
SQL, so if your csv file has capital letters (ex: <code class="docutils literal
notranslate"><span class="pre">Name</span></code>) you must put your column
name in double quotes or the examples won’t work.</p>
-<p>To illustrate this behavior, consider the <a class="reference download
internal" download=""
href="../_downloads/337ec28d7d061ace511c6d7f15ecff91/capitalized_example.csv"><span
class="xref download myst"><code class="docutils literal notranslate"><span
class="pre">capitalized_example.csv</span></code></span></a> file:</p>
+<p>To illustrate this behavior, consider the <a class="reference download
internal" download=""
href="../_downloads/3cce4d737d8c5814f5b50d859d21ba53/capitalized_example.csv"><span
class="xref download myst"><code class="docutils literal notranslate"><span
class="pre">capitalized_example.csv</span></code></span></a> file:</p>
<section id="id1">
<h2>Run a SQL query against data stored in a CSV:<a class="headerlink"
href="#id1" title="Permalink to this heading">¶</a></h2>
<div class="highlight-rust notranslate"><div
class="highlight"><pre><span></span><span class="k">use</span><span class="w">
</span><span class="n">datafusion</span>::<span class="n">prelude</span>::<span
class="o">*</span><span class="p">;</span><span class="w"></span>
@@ -437,7 +437,7 @@
<span class="k">async</span><span class="w"> </span><span class="k">fn</span>
<span class="nf">main</span><span class="p">()</span><span class="w">
</span>-> <span class="nc">datafusion</span>::<span
class="n">error</span>::<span class="nb">Result</span><span
class="o"><</span><span class="p">()</span><span class="o">></span><span
class="w"> </span><span class="p">{</span><span class="w"></span>
<span class="w"> </span><span class="c1">// register the table</span>
<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">ctx</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span
class="n">SessionContext</span>::<span class="n">new</span><span
class="p">();</span><span class="w"></span>
-<span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">register_csv</span><span
class="p">(</span><span class="s">"example"</span><span
class="p">,</span><span class="w"> </span><span
class="s">"tests/capitalized_example.csv"</span><span
class="p">,</span><span class="w"> </span><span
class="n">CsvReadOptions</span>::<span class="n">new</span><span
class="p">()).</span><span class="k">await</span><span class="o">?</span><span
class= [...]
+<span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">register_csv</span><span
class="p">(</span><span class="s">"example"</span><span
class="p">,</span><span class="w"> </span><span
class="s">"tests/data/capitalized_example.csv"</span><span
class="p">,</span><span class="w"> </span><span
class="n">CsvReadOptions</span>::<span class="n">new</span><span
class="p">()).</span><span class="k">await</span><span class="o">?</span><span
c [...]
<span class="w"> </span><span class="c1">// create a plan to run a SQL
query</span>
<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">sql</span><span class="p">(</span><span
class="s">"SELECT </span><span class="se">\"</span><span
class="s">A</span><span class="se">\"</span><span class="s">, MIN(b) FROM
example GROUP BY </span><span class="se">\"</span><span class= [...]
@@ -457,7 +457,7 @@
<span class="k">async</span><span class="w"> </span><span class="k">fn</span>
<span class="nf">main</span><span class="p">()</span><span class="w">
</span>-> <span class="nc">datafusion</span>::<span
class="n">error</span>::<span class="nb">Result</span><span
class="o"><</span><span class="p">()</span><span class="o">></span><span
class="w"> </span><span class="p">{</span><span class="w"></span>
<span class="w"> </span><span class="c1">// create the dataframe</span>
<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">ctx</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span
class="n">SessionContext</span>::<span class="n">new</span><span
class="p">();</span><span class="w"></span>
-<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span
class="s">"tests/capitalized_example.csv"</span><span
class="p">,</span><span class="w"> </span><span
class="n">CsvReadOptions</span>::<span class="n">new</span><span
class="p">()).</span><span clas [...]
+<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span class="n">ctx</span><span
class="p">.</span><span class="n">read_csv</span><span class="p">(</span><span
class="s">"tests/data/capitalized_example.csv"</span><span
class="p">,</span><span class="w"> </span><span
class="n">CsvReadOptions</span>::<span class="n">new</span><span
class="p">()).</span><span [...]
<span class="w"> </span><span class="kd">let</span><span class="w">
</span><span class="n">df</span><span class="w"> </span><span
class="o">=</span><span class="w"> </span><span class="n">df</span><span
class="p">.</span><span class="n">filter</span><span class="p">(</span><span
class="n">col</span><span class="p">(</span><span
class="s">"A"</span><span class="p">).</span><span
class="n">lt_eq</span><span class="p">(</span><span class="n">col</span><span
class="p">(</span><spa [...]
<span class="w"> </span><span class="p">.</span><span
class="n">aggregate</span><span class="p">(</span><span
class="fm">vec!</span><span class="p">[</span><span class="n">col</span><span
class="p">(</span><span class="s">"A"</span><span
class="p">)],</span><span class="w"> </span><span class="fm">vec!</span><span
class="p">[</span><span class="n">min</span><span class="p">(</span><span
class="n">col</span><span class="p">(</span><span
class="s">"b"</span><s [...]