[arrow-datafusion-python] branch main updated: Docs explaining how to view query plans (#373)

agrove Mon, 08 May 2023 06:50:11 -0700

This is an automated email from the ASF dual-hosted git repository.

agrove pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-python.git



The following commit(s) were added to refs/heads/main by this push:
     new b8c74a7  Docs explaining how to view query plans (#373)
b8c74a7 is described below

commit b8c74a7f50cdc74b163bf4ba071662d93cabfbfd
Author: Andy Grove <[email protected]>
AuthorDate: Mon May 8 07:49:54 2023 -0600

    Docs explaining how to view query plans (#373)
    
    * Docs explaining how to view query plans
    
    * diagram
---
 docs/mdbook/README.md                |   2 +-
 docs/mdbook/src/SUMMARY.md           |   3 +-
 docs/mdbook/src/images/plan.svg      | 111 +++++++++++++++++++++++
 docs/mdbook/src/usage/query-plans.md | 170 +++++++++++++++++++++++++++++++++++
 src/physical_plan.rs                 |   4 +
 src/sql/logical.rs                   |   2 +-
 6 files changed, 289 insertions(+), 3 deletions(-)

diff --git a/docs/mdbook/README.md b/docs/mdbook/README.md
index 664b4b4..6dae6bc 100644
--- a/docs/mdbook/README.md
+++ b/docs/mdbook/README.md
@@ -26,7 +26,7 @@ Open the book locally by running `open book/index.html`.
 
 ## Install mdBook
 
-Download the `mdbook` binary.
+Download the `mdbook` binary or run `cargo install mdbook`.
 
 Then manually open it, so you have permissions to run it on your Mac.
 
diff --git a/docs/mdbook/src/SUMMARY.md b/docs/mdbook/src/SUMMARY.md
index b490899..23467ed 100644
--- a/docs/mdbook/src/SUMMARY.md
+++ b/docs/mdbook/src/SUMMARY.md
@@ -21,4 +21,5 @@
 - [Quickstart](./quickstart.md)
 - [Usage](./usage/index.md)
     - [Create a table](./usage/create-table.md)
-    - [Query a table](./usage/query-table.md)
\ No newline at end of file
+    - [Query a table](./usage/query-table.md)
+    - [Viewing Query Plans](./usage/query-plans.md)
\ No newline at end of file
diff --git a/docs/mdbook/src/images/plan.svg b/docs/mdbook/src/images/plan.svg
new file mode 100644
index 0000000..9271479
--- /dev/null
+++ b/docs/mdbook/src/images/plan.svg
@@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd";>
+<!-- Generated by graphviz version 2.43.0 (0)
+ -->
+<!-- Title: %3 Pages: 1 -->
+<svg width="1004pt" height="323pt"
+ viewBox="0.00 0.00 1004.00 323.00" xmlns="http://www.w3.org/2000/svg"; 
xmlns:xlink="http://www.w3.org/1999/xlink";>
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 319)">
+<title>%3</title>
+<polygon fill="white" stroke="transparent" points="-4,4 -4,-319 1000,-319 
1000,4 -4,4"/>
+<g id="clust1" class="cluster">
+<title>cluster_1</title>
+<polygon fill="none" stroke="black" points="8,-9 8,-306 494,-306 494,-9 8,-9"/>
+<text text-anchor="middle" x="251" y="-290.8" font-family="Times,serif" 
font-size="14.00">LogicalPlan</text>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_6</title>
+<polygon fill="none" stroke="black" points="502,-8 502,-307 988,-307 988,-8 
502,-8"/>
+<text text-anchor="middle" x="745" y="-291.8" font-family="Times,serif" 
font-size="14.00">Detailed LogicalPlan</text>
+</g>
+<!-- 2 -->
+<g id="node1" class="node">
+<title>2</title>
+<polygon fill="none" stroke="black" points="405.5,-275 96.5,-275 96.5,-239 
405.5,-239 405.5,-275"/>
+<text text-anchor="middle" x="251" y="-253.3" font-family="Times,serif" 
font-size="14.00">Projection: my_table.a, SUM(my_table.b)</text>
+</g>
+<!-- 3 -->
+<g id="node2" class="node">
+<title>3</title>
+<polygon fill="none" stroke="black" points="486,-201 16,-201 16,-165 486,-165 
486,-201"/>
+<text text-anchor="middle" x="251" y="-179.3" font-family="Times,serif" 
font-size="14.00">Aggregate: groupBy=[[my_table.a]], 
aggr=[[SUM(my_table.b)]]</text>
+</g>
+<!-- 2&#45;&gt;3 -->
+<g id="edge1" class="edge">
+<title>2&#45;&gt;3</title>
+<path fill="none" stroke="black" d="M251,-228.72C251,-219.57 251,-209.63 
251,-201.44"/>
+<polygon fill="black" stroke="black" points="247.5,-228.94 251,-238.94 
254.5,-228.94 247.5,-228.94"/>
+</g>
+<!-- 4 -->
+<g id="node3" class="node">
+<title>4</title>
+<polygon fill="none" stroke="black" points="361.5,-127 140.5,-127 140.5,-91 
361.5,-91 361.5,-127"/>
+<text text-anchor="middle" x="251" y="-105.3" font-family="Times,serif" 
font-size="14.00">Filter: my_table.a &lt; Int64(3)</text>
+</g>
+<!-- 3&#45;&gt;4 -->
+<g id="edge2" class="edge">
+<title>3&#45;&gt;4</title>
+<path fill="none" stroke="black" d="M251,-154.72C251,-145.57 251,-135.63 
251,-127.44"/>
+<polygon fill="black" stroke="black" points="247.5,-154.94 251,-164.94 
254.5,-154.94 247.5,-154.94"/>
+</g>
+<!-- 5 -->
+<g id="node4" class="node">
+<title>5</title>
+<polygon fill="none" stroke="black" points="332.5,-53 169.5,-53 169.5,-17 
332.5,-17 332.5,-53"/>
+<text text-anchor="middle" x="251" y="-31.3" font-family="Times,serif" 
font-size="14.00">TableScan: my_table</text>
+</g>
+<!-- 4&#45;&gt;5 -->
+<g id="edge3" class="edge">
+<title>4&#45;&gt;5</title>
+<path fill="none" stroke="black" d="M251,-80.72C251,-71.57 251,-61.63 
251,-53.44"/>
+<polygon fill="black" stroke="black" points="247.5,-80.94 251,-90.94 
254.5,-80.94 247.5,-80.94"/>
+</g>
+<!-- 7 -->
+<g id="node5" class="node">
+<title>7</title>
+<polygon fill="none" stroke="black" points="921,-276 569,-276 569,-238 
921,-238 921,-276"/>
+<text text-anchor="middle" x="745" y="-260.8" font-family="Times,serif" 
font-size="14.00">Projection: my_table.a, SUM(my_table.b)</text>
+<text text-anchor="middle" x="745" y="-245.8" font-family="Times,serif" 
font-size="14.00">Schema: [a:Int64;N, SUM(my_table.b):Int64;N]</text>
+</g>
+<!-- 8 -->
+<g id="node6" class="node">
+<title>8</title>
+<polygon fill="none" stroke="black" points="980,-202 510,-202 510,-164 
980,-164 980,-202"/>
+<text text-anchor="middle" x="745" y="-186.8" font-family="Times,serif" 
font-size="14.00">Aggregate: groupBy=[[my_table.a]], 
aggr=[[SUM(my_table.b)]]</text>
+<text text-anchor="middle" x="745" y="-171.8" font-family="Times,serif" 
font-size="14.00">Schema: [a:Int64;N, SUM(my_table.b):Int64;N]</text>
+</g>
+<!-- 7&#45;&gt;8 -->
+<g id="edge4" class="edge">
+<title>7&#45;&gt;8</title>
+<path fill="none" stroke="black" d="M745,-227.57C745,-219.11 745,-210.06 
745,-202.41"/>
+<polygon fill="black" stroke="black" points="741.5,-227.83 745,-237.83 
748.5,-227.83 741.5,-227.83"/>
+</g>
+<!-- 9 -->
+<g id="node7" class="node">
+<title>9</title>
+<polygon fill="none" stroke="black" points="863,-128 627,-128 627,-90 863,-90 
863,-128"/>
+<text text-anchor="middle" x="745" y="-112.8" font-family="Times,serif" 
font-size="14.00">Filter: my_table.a &lt; Int64(3)</text>
+<text text-anchor="middle" x="745" y="-97.8" font-family="Times,serif" 
font-size="14.00">Schema: [a:Int64;N, b:Int64;N]</text>
+</g>
+<!-- 8&#45;&gt;9 -->
+<g id="edge5" class="edge">
+<title>8&#45;&gt;9</title>
+<path fill="none" stroke="black" d="M745,-153.57C745,-145.11 745,-136.06 
745,-128.41"/>
+<polygon fill="black" stroke="black" points="741.5,-153.83 745,-163.83 
748.5,-153.83 741.5,-153.83"/>
+</g>
+<!-- 10 -->
+<g id="node8" class="node">
+<title>10</title>
+<polygon fill="none" stroke="black" points="863,-54 627,-54 627,-16 863,-16 
863,-54"/>
+<text text-anchor="middle" x="745" y="-38.8" font-family="Times,serif" 
font-size="14.00">TableScan: my_table</text>
+<text text-anchor="middle" x="745" y="-23.8" font-family="Times,serif" 
font-size="14.00">Schema: [a:Int64;N, b:Int64;N]</text>
+</g>
+<!-- 9&#45;&gt;10 -->
+<g id="edge6" class="edge">
+<title>9&#45;&gt;10</title>
+<path fill="none" stroke="black" d="M745,-79.57C745,-71.11 745,-62.06 
745,-54.41"/>
+<polygon fill="black" stroke="black" points="741.5,-79.83 745,-89.83 
748.5,-79.83 741.5,-79.83"/>
+</g>
+</g>
+</svg>
diff --git a/docs/mdbook/src/usage/query-plans.md 
b/docs/mdbook/src/usage/query-plans.md
new file mode 100644
index 0000000..a39aa9e
--- /dev/null
+++ b/docs/mdbook/src/usage/query-plans.md
@@ -0,0 +1,170 @@
+<!---
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# DataFusion Query Plans
+
+DataFusion's `DataFrame` is a wrapper around a query plan. In this chapter we 
will learn how to view
+logical and physical query plans for DataFrames.
+
+## Sample Data
+
+Let's go ahead and create a simple DataFrame. You can do this in the Python 
shell or in a notebook.
+
+```python
+from datafusion import SessionContext
+
+ctx = SessionContext()
+
+df = ctx.from_pydict({"a": [1, 2, 3, 1], "b": [4, 5, 6, 7]}, name="my_table")
+```
+
+## Logical Plan
+
+Next, let's look at the logical plan for this dataframe.
+
+```python
+>>> df.logical_plan()
+TableScan: my_table
+```
+
+The logical plan here consists of a single `TableScan` operator. Let's make a 
more interesting plan by creating a new
+`DataFrame` representing an aggregate query with a filter.
+
+```python
+>>> df = ctx.sql("SELECT a, sum(b) FROM my_table WHERE a < 3 GROUP BY a")
+```
+
+When we view the plan for this `DataFrame` we can see that there are now four 
operators in the plan, each
+representing a logical transformation of the data. We start with a `TableScan` 
to read the data, followed by
+a `Filter` to filter out rows that do not match the filter expression, then an 
`Aggregate` is performed. Finally,
+a `Projection` is applied to ensure that the order of the final columns 
matches the `SELECT` part of the SQL query.
+
+```python
+>>> df.logical_plan()
+Projection: my_table.a, SUM(my_table.b)
+  Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]]
+    Filter: my_table.a < Int64(3)
+      TableScan: my_table
+```
+
+## Optimized Logical Plan
+
+DataFusion has a powerful query optimizer which will rewrite query plans to 
make them more efficient before they are
+executed. We can view the output of the optimized by viewint the optimized 
logical plan.
+
+```python
+>>> df.optimized_logical_plan()
+Aggregate: groupBy=[[my_table.a]], aggr=[[SUM(my_table.b)]]
+  Filter: my_table.a < Int64(3)
+    TableScan: my_table projection=[a, b]
+```
+
+We can see that there are two key differences compared to the unoptimized 
logical plan:
+
+- The `Projection` has been removed because it was redundant in this case (the 
output of the `Aggregatge` plan
+  already had the columns in the correct order).
+- The `TableScan` now has a projection pushed down so that it only reads the 
columns required to be able to execute
+  the query. In this case the table only has two columns and we referenced 
them both in the query, but this optimization
+  can be very effective in real-world queries against large tables.
+
+## Physical Plan
+
+Logical plans provide a representation of "what" the query should do it. 
Physical plans explain "how" the query
+should be executed.
+
+We can view the physical plan (also known as an execution plan) using the 
`execution_plan` method.
+
+```python
+>>> df.execution_plan()
+AggregateExec: mode=FinalPartitioned, gby=[a@0 as a], aggr=[SUM(my_table.b)]
+  CoalesceBatchesExec: target_batch_size=8192
+    RepartitionExec: partitioning=Hash([Column { name: "a", index: 0 }], 48), 
input_partitions=48
+      AggregateExec: mode=Partial, gby=[a@0 as a], aggr=[SUM(my_table.b)]
+        CoalesceBatchesExec: target_batch_size=8192
+          FilterExec: a@0 < 3
+            RepartitionExec: partitioning=RoundRobinBatch(48), 
input_partitions=1
+              MemoryExec: partitions=1, partition_sizes=[1]
+```
+
+The `TableScan` has now been replaced by a more specific `MemoryExec` for 
scanning the in-memory data. If we were
+querying a CSV file on disk then we would expect to see a `CsvExec` instead.
+
+This plan has additional operators that were not in the logical plan:
+
+- `RepartionExec` has been added so that the data can be split into partitions 
and processed in parallel using
+  multiple cores.
+- `CoalesceBatchesExec` will combine small batches into larger batches to 
ensure that processing remains efficient.
+
+The `Aggregate` operator now appears twice. This is because aggregates are 
performed in a two step process. Data is
+aggregated within each partition in parallel and then those results (which 
could contain duplicate grouping keys) are
+combined and the aggregate operations is applied again.
+
+## Creating Query Plan Diagrams
+
+DataFusion supports generating query plan diagrams in [DOT 
format](<https://en.wikipedia.org/wiki/DOT_(graph_description_language)>).
+
+DOT is a language for describing graphs and there are open source tools such 
as GraphViz that can render diagrams
+from DOT files.
+
+We can use the following code to generate a DOT file for a logical query plan.
+
+```python
+>>> diagram = df.logical_plan().display_graphviz()
+>>> with open('plan.dot', 'w') as f:
+>>>   f.write(diagram)
+```
+
+If we view the view, we will see the following content.
+
+```
+// Begin DataFusion GraphViz Plan (see https://graphviz.org)
+digraph {
+  subgraph cluster_1
+  {
+    graph[label="LogicalPlan"]
+    2[shape=box label="Projection: my_table.a, SUM(my_table.b)"]
+    3[shape=box label="Aggregate: groupBy=[[my_table.a]], 
aggr=[[SUM(my_table.b)]]"]
+    2 -> 3 [arrowhead=none, arrowtail=normal, dir=back]
+    4[shape=box label="Filter: my_table.a < Int64(3)"]
+    3 -> 4 [arrowhead=none, arrowtail=normal, dir=back]
+    5[shape=box label="TableScan: my_table"]
+    4 -> 5 [arrowhead=none, arrowtail=normal, dir=back]
+  }
+  subgraph cluster_6
+  {
+    graph[label="Detailed LogicalPlan"]
+    7[shape=box label="Projection: my_table.a, SUM(my_table.b)\nSchema: 
[a:Int64;N, SUM(my_table.b):Int64;N]"]
+    8[shape=box label="Aggregate: groupBy=[[my_table.a]], 
aggr=[[SUM(my_table.b)]]\nSchema: [a:Int64;N, SUM(my_table.b):Int64;N]"]
+    7 -> 8 [arrowhead=none, arrowtail=normal, dir=back]
+    9[shape=box label="Filter: my_table.a < Int64(3)\nSchema: [a:Int64;N, 
b:Int64;N]"]
+    8 -> 9 [arrowhead=none, arrowtail=normal, dir=back]
+    10[shape=box label="TableScan: my_table\nSchema: [a:Int64;N, b:Int64;N]"]
+    9 -> 10 [arrowhead=none, arrowtail=normal, dir=back]
+  }
+}
+// End DataFusion GraphViz Plan
+```
+
+We can use GraphViz from the command-line to convert this DOT file into an 
image.
+
+```bash
+dot -Tsvg plan.dot > plan.svg
+```
+
+This generates the following diagram:
+
+![Query Plan Diagram](../images/plan.svg)
diff --git a/src/physical_plan.rs b/src/physical_plan.rs
index 4c35f3e..6f02cef 100644
--- a/src/physical_plan.rs
+++ b/src/physical_plan.rs
@@ -54,6 +54,10 @@ impl PyExecutionPlan {
         format!("{}", d.indent())
     }
 
+    fn __repr__(&self) -> String {
+        self.display_indent()
+    }
+
     #[getter]
     pub fn partition_count(&self) -> usize {
         self.plan.output_partitioning().partition_count()
diff --git a/src/sql/logical.rs b/src/sql/logical.rs
index a22f269..a75315d 100644
--- a/src/sql/logical.rs
+++ b/src/sql/logical.rs
@@ -100,7 +100,7 @@ impl PyLogicalPlan {
     }
 
     fn display_graphviz(&self) -> String {
-        format!("{}", self.plan.display_indent_schema())
+        format!("{}", self.plan.display_graphviz())
     }
 }

[arrow-datafusion-python] branch main updated: Docs explaining how to view query plans (#373)

Reply via email to