This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-python.git
The following commit(s) were added to refs/heads/main by this push:
new b8c74a7 Docs explaining how to view query plans (#373)
b8c74a7 is described below
commit b8c74a7f50cdc74b163bf4ba071662d93cabfbfd
Author: Andy Grove <[email protected]>
AuthorDate: Mon May 8 07:49:54 2023 -0600
Docs explaining how to view query plans (#373)
* Docs explaining how to view query plans
* diagram
---
docs/mdbook/README.md | 2 +-
docs/mdbook/src/SUMMARY.md | 3 +-
docs/mdbook/src/images/plan.svg | 111 +++++++++++++++++++++++
docs/mdbook/src/usage/query-plans.md | 170 +++++++++++++++++++++++++++++++++++
src/physical_plan.rs | 4 +
src/sql/logical.rs | 2 +-
6 files changed, 289 insertions(+), 3 deletions(-)
diff --git a/docs/mdbook/README.md b/docs/mdbook/README.md
index 664b4b4..6dae6bc 100644
--- a/docs/mdbook/README.md
+++ b/docs/mdbook/README.md
@@ -26,7 +26,7 @@ Open the book locally by running `open book/index.html`.
## Install mdBook
-Download the `mdbook` binary.
+Download the `mdbook` binary or run `cargo install mdbook`.
Then manually open it, so you have permissions to run it on your Mac.
diff --git a/docs/mdbook/src/SUMMARY.md b/docs/mdbook/src/SUMMARY.md
index b490899..23467ed 100644
--- a/docs/mdbook/src/SUMMARY.md
+++ b/docs/mdbook/src/SUMMARY.md
@@ -21,4 +21,5 @@
- [Quickstart](./quickstart.md)
- [Usage](./usage/index.md)
- [Create a table](./usage/create-table.md)
- - [Query a table](./usage/query-table.md)
\ No newline at end of file
+ - [Query a table](./usage/query-table.md)
+ - [Viewing Query Plans](./usage/query-plans.md)
\ No newline at end of file
diff --git a/docs/mdbook/src/images/plan.svg b/docs/mdbook/src/images/plan.svg
new file mode 100644
index 0000000..9271479
--- /dev/null
+++ b/docs/mdbook/src/images/plan.svg
@@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="1004pt" height="323pt"
+ viewBox="0.00 0.00 1004.00 323.00" xmlns="http://www.w3.org/2000/svg"
xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 319)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-319 1000,-319
1000,4 -4,4"/>
+<g id="clust1" class="cluster">
+<title>cluster_1</title>
+<polygon fill="none" stroke="black" points="8,-9 8,-306 494,-306 494,-9 8,-9"/>
+<text text-anchor="middle" x="251" y="-290.8" font-family="Times,serif"
font-size="14.00">LogicalPlan</text>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_6</title>
+<polygon fill="none" stroke="black" points="502,-8 502,-307 988,-307 988,-8
502,-8"/>
+<text text-anchor="middle" x="745" y="-291.8" font-family="Times,serif"
font-size="14.00">Detailed LogicalPlan</text>
+</g>
+<!-- 2 -->
+<g id="node1" class="node">
+<title>2</title>
+<polygon fill="none" stroke="black" points="405.5,-275 96.5,-275 96.5,-239
405.5,-239 405.5,-275"/>
+<text text-anchor="middle" x="251" y="-253.3" font-family="Times,serif"
font-size="14.00">Projection: my_table.a, SUM(my_table.b)</text>
+</g>
+<!-- 3 -->
+<g id="node2" class="node">
+<title>3</title>
+<polygon fill="none" stroke="black" points="486,-201 16,-201 16,-165 486,-165
486,-201"/>
+<text text-anchor="middle" x="251" y="-179.3" font-family="Times,serif"
font-size="14.00">Aggregate: groupBy=[[my_table.a]],
aggr=[[SUM(my_table.b)]]</text>
+</g>
+<!-- 2->3 -->
+<g id="edge1" class="edge">
+<title>2->3</title>
+<path fill="none" stroke="black" d="M251,-228.72C251,-219.57 251,-209.63
251,-201.44"/>
+<polygon fill="black" stroke="black" points="247.5,-228.94 251,-238.94
254.5,-228.94 247.5,-228.94"/>
+</g>
+<!-- 4 -->
+<g id="node3" class="node">
+<title>4</title>
+<polygon fill="none" stroke="black" points="361.5,-127 140.5,-127 140.5,-91
361.5,-91 361.5,-127"/>
+<text text-anchor="middle" x="251" y="-105.3" font-family="Times,serif"
font-size="14.00">Filter: my_table.a < Int64(3)</text>
+</g>
+<!-- 3->4 -->
+<g id="edge2" class="edge">
+<title>3->4</title>
+<path fill="none" stroke="black" d="M251,-154.72C251,-145.57 251,-135.63
251,-127.44"/>
+<polygon fill="black" stroke="black" points="247.5,-154.94 251,-164.94
254.5,-154.94 247.5,-154.94"/>
+</g>
+<!-- 5 -->
+<g id="node4" class="node">
+<title>5</title>
+<polygon fill="none" stroke="black" points="332.5,-53 169.5,-53 169.5,-17
332.5,-17 332.5,-53"/>
+<text text-anchor="middle" x="251" y="-31.3" font-family="Times,serif"
font-size="14.00">TableScan: my_table</text>
+</g>
+<!-- 4->5 -->
+<g id="edge3" class="edge">
+<title>4->5</title>
+<path fill="none" stroke="black" d="M251,-80.72C251,-71.57 251,-61.63
251,-53.44"/>
+<polygon fill="black" stroke="black" points="247.5,-80.94 251,-90.94
254.5,-80.94 247.5,-80.94"/>
+</g>
+<!-- 7 -->
+<g id="node5" class="node">
+<title>7</title>
+<polygon fill="none" stroke="black" points="921,-276 569,-276 569,-238
921,-238 921,-276"/>
+<text text-anchor="middle" x="745" y="-260.8" font-family="Times,serif"
font-size="14.00">Projection: my_table.a, SUM(my_table.b)</text>
+<text text-anchor="middle" x="745" y="-245.8" font-family="Times,serif"
font-size="14.00">Schema: [a:Int64;N, SUM(my_table.b):Int64;N]</text>
+</g>
+<!-- 8 -->
+<g id="node6" class="node">
+<title>8</title>
+<polygon fill="none" stroke="black" points="980,-202 510,-202 510,-164
980,-164 980,-202"/>
+<text text-anchor="middle" x="745" y="-186.8" font-family="Times,serif"
font-size="14.00">Aggregate: groupBy=[[my_table.a]],
aggr=[[SUM(my_table.b)]]</text>
+<text text-anchor="middle" x="745" y="-171.8" font-family="Times,serif"
font-size="14.00">Schema: [a:Int64;N, SUM(my_table.b):Int64;N]</text>
+</g>
+<!-- 7->8 -->
+<g id="edge4" class="edge">
+<title>7->8</title>
+<path fill="none" stroke="black" d="M745,-227.57C745,-219.11 745,-210.06
745,-202.41"/>
+<polygon fill="black" stroke="black" points="741.5,-227.83 745,-237.83
748.5,-227.83 741.5,-227.83"/>
+</g>
+<!-- 9 -->
+<g id="node7" class="node">
+<title>9</title>
+<polygon fill="none" stroke="black" points="863,-128 627,-128 627,-90 863,-90
863,-128"/>
+<text text-anchor="middle" x="745" y="-112.8" font-family="Times,serif"
font-size="14.00">Filter: my_table.a < Int64(3)</text>
+<text text-anchor="middle" x="745" y="-97.8" font-family="Times,serif"
font-size="14.00">Schema: [a:Int64;N, b:Int64;N]</text>
+</g>
+<!-- 8->9 -->
+<g id="edge5" class="edge">
+<title>8->9</title>
+<path fill="none" stroke="black" d="M745,-153.57C745,-145.11 745,-136.06
745,-128.41"/>
+<polygon fill="black" stroke="black" points="741.5,-153.83 745,-163.83
748.5,-153.83 741.5,-153.83"/>
+</g>
+<!-- 10 -->
+<g id="node8" class="node">
+<title>10</title>
+<polygon fill="none" stroke="black" points="863,-54 627,-54 627,-16 863,-16
863,-54"/>
+<text text-anchor="middle" x="745" y="-38.8" font-family="Times,serif"
font-size="14.00">TableScan: my_table</text>
+<text text-anchor="middle" x="745" y="-23.8" font-family="Times,serif"
font-size="14.00">Schema: [a:Int64;N, b:Int64;N]</text>
+</g>
+<!-- 9->10 -->
+<g id="edge6" class="edge">
+<title>9->10</title>
+<path fill="none" stroke="black" d="M745,-79.57C745,-71.11 745,-62.06
745,-54.41"/>
+<polygon fill="black" stroke="black" points="741.5,-79.83 745,-89.83
748.5,-79.83 741.5,-79.83"/>
+</g>
+</g>
+</svg>
diff --git a/docs/mdbook/src/usage/query-plans.md
b/docs/mdbook/src/usage/query-plans.md
new file mode 100644
index 0000000..a39aa9e
--- /dev/null
+++ b/docs/mdbook/src/usage/query-plans.md
@@ -0,0 +1,170 @@
+<!---
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+# DataFusion Query Plans
+
+DataFusion's `DataFrame` is a wrapper around a query plan. In this chapter we
will learn how to view
+logical and physical query plans for DataFrames.
+
+## Sample Data
+
+Let's go ahead and create a simple DataFrame. You can do this in the Python
shell or in a notebook.
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+
+df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table")
+```
+
+## Logical Plan
+
+Next, let's look at the logical plan for this dataframe.
+
+```python
+>>> df.logical_plan()
+TableScan: my_table
+```
+
+The logical plan here consists of a single `TableScan` operator. Let's make a
more interesting plan by creating a new
+`DataFrame` representing an aggregate query with a filter.
+
+```python
+>>> df = ctx.sql("SELECT a, sum(b) FROM my_table WHERE a < 3 GROUP BY a")
+```
+
+When we view the plan for this `DataFrame` we can see that there are now four
operators in the plan, each
+representing a logical transformation of the data. We start with a `TableScan`
to read the data, followed by
+a `Filter` to filter out rows that do not match the filter expression, then an
`Aggregate` is performed. Finally,
+a `Projection` is applied to ensure that the order of the final columns
matches the `SELECT` part of the SQL query.
+
+```python
+>>> df.logical_plan()
+Projection: my_table.a, SUM(my_table.b)
+ Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]]
+ Filter: my_table.a < Int64(3)
+ TableScan: my_table
+```
+
+## Optimized Logical Plan
+
+DataFusion has a powerful query optimizer which will rewrite query plans to
make them more efficient before they are
+executed. We can view the output of the optimized by viewint the optimized
logical plan.
+
+```python
+>>> df.optimized_logical_plan()
+Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]]
+ Filter: my_table.a < Int64(3)
+ TableScan: my_table projection=[a, b]
+```
+
+We can see that there are two key differences compared to the unoptimized
logical plan:
+
+- The `Projection` has been removed because it was redundant in this case (the
output of the `Aggregatge` plan
+ already had the columns in the correct order).
+- The `TableScan` now has a projection pushed down so that it only reads the
columns required to be able to execute
+ the query. In this case the table only has two columns and we referenced
them both in the query, but this optimization
+ can be very effective in real-world queries against large tables.
+
+## Physical Plan
+
+Logical plans provide a representation of "what" the query should do it.
Physical plans explain "how" the query
+should be executed.
+
+We can view the physical plan (also known as an execution plan) using the
`execution_plan` method.
+
+```python
+>>> df.execution_plan()
+AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[SUM(my_table.b)]
+ CoalesceBatchesExec: target_batch_size=8192
+ RepartitionExec: partitioning=Hash([Column { name: "a", index: 0 }], 48),
input_partitions=48
+ AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[SUM(my_table.b)]
+ CoalesceBatchesExec: target_batch_size=8192
+ FilterExec: a@0 < 3
+ RepartitionExec: partitioning=RoundRobinBatch(48),
input_partitions=1
+ MemoryExec: partitions=1, partition_sizes=[1]
+```
+
+The `TableScan` has now been replaced by a more specific `MemoryExec` for
scanning the in-memory data. If we were
+querying a CSV file on disk then we would expect to see a `CsvExec` instead.
+
+This plan has additional operators that were not in the logical plan:
+
+- `RepartionExec` has been added so that the data can be split into partitions
and processed in parallel using
+ multiple cores.
+- `CoalesceBatchesExec` will combine small batches into larger batches to
ensure that processing remains efficient.
+
+The `Aggregate` operator now appears twice. This is because aggregates are
performed in a two step process. Data is
+aggregated within each partition in parallel and then those results (which
could contain duplicate grouping keys) are
+combined and the aggregate operations is applied again.
+
+## Creating Query Plan Diagrams
+
+DataFusion supports generating query plan diagrams in [DOT
format](<https://en.wikipedia.org/wiki/DOT_(graph_description_language)>).
+
+DOT is a language for describing graphs and there are open source tools such
as GraphViz that can render diagrams
+from DOT files.
+
+We can use the following code to generate a DOT file for a logical query plan.
+
+```python
+>>> diagram = df.logical_plan().display_graphviz()
+>>> with open('plan.dot', 'w') as f:
+>>> f.write(diagram)
+```
+
+If we view the view, we will see the following content.
+
+```
+// Begin DataFusion GraphViz Plan (see https://graphviz.org)
+digraph {
+ subgraph cluster_1
+ {
+ graph[label="LogicalPlan"]
+ 2[shape=box label="Projection: my_table.a, SUM(my_table.b)"]
+ 3[shape=box label="Aggregate: groupBy=[[my_table.a]],
aggr=[[SUM(my_table.b)]]"]
+ 2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]
+ 4[shape=box label="Filter: my_table.a < Int64(3)"]
+ 3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]
+ 5[shape=box label="TableScan: my_table"]
+ 4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]
+ }
+ subgraph cluster_6
+ {
+ graph[label="Detailed LogicalPlan"]
+ 7[shape=box label="Projection: my_table.a, SUM(my_table.b)\nSchema:
[a:Int64;N, SUM(my_table.b):Int64;N]"]
+ 8[shape=box label="Aggregate: groupBy=[[my_table.a]],
aggr=[[SUM(my_table.b)]]\nSchema: [a:Int64;N, SUM(my_table.b):Int64;N]"]
+ 7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]
+ 9[shape=box label="Filter: my_table.a < Int64(3)\nSchema: [a:Int64;N,
b:Int64;N]"]
+ 8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]
+ 10[shape=box label="TableScan: my_table\nSchema: [a:Int64;N, b:Int64;N]"]
+ 9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]
+ }
+}
+// End DataFusion GraphViz Plan
+```
+
+We can use GraphViz from the command-line to convert this DOT file into an
image.
+
+```bash
+dot -Tsvg plan.dot > plan.svg
+```
+
+This generates the following diagram:
+
+
diff --git a/src/physical_plan.rs b/src/physical_plan.rs
index 4c35f3e..6f02cef 100644
--- a/src/physical_plan.rs
+++ b/src/physical_plan.rs
@@ -54,6 +54,10 @@ impl PyExecutionPlan {
format!("{}", d.indent())
}
+ fn __repr__(&self) -> String {
+ self.display_indent()
+ }
+
#[getter]
pub fn partition_count(&self) -> usize {
self.plan.output_partitioning().partition_count()
diff --git a/src/sql/logical.rs b/src/sql/logical.rs
index a22f269..a75315d 100644
--- a/src/sql/logical.rs
+++ b/src/sql/logical.rs
@@ -100,7 +100,7 @@ impl PyLogicalPlan {
}
fn display_graphviz(&self) -> String {
- format!("{}", self.plan.display_indent_schema())
+ format!("{}", self.plan.display_graphviz())
}
}