This is an automated email from the ASF dual-hosted git repository.

planka pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/orc.git


The following commit(s) were added to refs/heads/asf-site by this push:
     new 5d315227b Updated Site to include documentation for LazyFilters
5d315227b is described below

commit 5d315227bf4489458152f6353af8af59eba381fc
Author: Pavan Lanka <pla...@apple.com>
AuthorDate: Tue Jun 14 09:11:07 2022 -0700

    Updated Site to include documentation for LazyFilters
---
 develop/design/index.html             | 145 ++++++
 develop/design/lazy_filter/index.html | 800 ++++++++++++++++++++++++++++++++++
 develop/index.html                    |   3 +
 3 files changed, 948 insertions(+)

diff --git a/develop/design/index.html b/develop/design/index.html
new file mode 100644
index 000000000..b2f5ff612
--- /dev/null
+++ b/develop/design/index.html
@@ -0,0 +1,145 @@
+<!DOCTYPE HTML>
+<html lang="en-US">
+<head>
+  <meta charset="UTF-8">
+  <title>Design</title>
+  <meta name="viewport" content="width=device-width,initial-scale=1">
+  <meta name="generator" content="Jekyll v3.8.6">
+  <link rel="stylesheet" 
href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900">
+  <link rel="stylesheet" href="/css/screen.css">
+  <link rel="icon" type="image/x-icon" href="/favicon.ico">
+  <!--[if lt IE 9]>
+  <script src="/js/html5shiv.min.js"></script>
+  <script src="/js/respond.min.js"></script>
+  <![endif]-->
+</head>
+
+
+<body class="wrap">
+  <header role="banner">
+  <nav class="mobile-nav show-on-mobiles">
+    <ul>
+  <li class="">
+    <a href="/">Home</a>
+  </li>
+  <li class="">
+    <a href="/docs/"><span class="show-on-mobiles">Docs</span>
+                     <span class="hide-on-mobiles">Documentation</span></a>
+  </li>
+  <li class="">
+    <a href="/talks/">Talks</a>
+  </li>
+  <li class="">
+    <a href="/news/">News</a>
+  </li>
+  <li class="">
+    <a href="/help/">Help</a>
+  </li>
+  <li class="current">
+    <a href="/develop/">Develop</a>
+  </li>
+</ul>
+
+  </nav>
+  <div class="grid">
+    <div class="unit one-third center-on-mobiles">
+      <h1>
+        <a href="/">
+          <span class="sr-only">Apache ORC</span>
+          <img src="/img/logo.png" width="249" height="101" alt="ORC Logo">
+        </a>
+      </h1>
+    </div>
+    <nav class="main-nav unit two-thirds hide-on-mobiles">
+      <ul>
+  <li class="">
+    <a href="/">Home</a>
+  </li>
+  <li class="">
+    <a href="/docs/"><span class="show-on-mobiles">Docs</span>
+                     <span class="hide-on-mobiles">Documentation</span></a>
+  </li>
+  <li class="">
+    <a href="/talks/">Talks</a>
+  </li>
+  <li class="">
+    <a href="/news/">News</a>
+  </li>
+  <li class="">
+    <a href="/help/">Help</a>
+  </li>
+  <li class="current">
+    <a href="/develop/">Develop</a>
+  </li>
+</ul>
+
+    </nav>
+  </div>
+</header>
+
+
+  <section class="standalone">
+  <div class="grid">
+
+    <div class="unit whole">
+      <article>
+        <h1>Design</h1>
+        <ul>
+  <li><a href="lazy_filter">Lazy Filters</a></li>
+</ul>
+
+      </article>
+    </div>
+
+    <div class="clear"></div>
+
+  </div>
+</section>
+
+
+  <footer role="contentinfo">
+  <p>The contents of this website are &copy;&nbsp;2022
+     <a href="https://www.apache.org/">Apache Software Foundation</a>
+     under the terms of the <a
+      href="https://www.apache.org/licenses/LICENSE-2.0.html">
+      Apache&nbsp;License&nbsp;v2</a>. Apache ORC and its logo are trademarks
+      of the Apache Software Foundation.</p>
+</footer>
+
+  <script>
+  var anchorForId = function (id) {
+    var anchor = document.createElement("a");
+    anchor.className = "header-link";
+    anchor.href      = "#" + id;
+    anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa 
fa-link\"></i>";
+    anchor.title = "Permalink";
+    return anchor;
+  };
+
+  var linkifyAnchors = function (level, containingElement) { // Appends a permalink anchor to every <h{level}> inside containingElement that has a non-empty id.
+    var headers = containingElement.getElementsByTagName("h" + level); // e.g. level 2 selects all <h2> descendants
+    for (var h = 0; h < headers.length; h++) {
+      var header = headers[h];
+
+      if (typeof header.id !== "undefined" && header.id !== "") { // headings without an id have no fragment to link to, so they are skipped
+        header.appendChild(anchorForId(header.id));
+      }
+    }
+  };
+
+  document.onreadystatechange = function () { // Invoked on each readyState change; does its work only when readyState is "complete".
+    if (this.readyState === "complete") {
+      var contentBlock = document.getElementsByClassName("docs")[0] || 
document.getElementsByClassName("news")[0];
+      if (!contentBlock) { // no element with class "docs" or "news" was found, so there is nothing to linkify
+        return;
+      }
+      for (var level = 1; level <= 6; level++) { // covers heading levels h1 through h6
+        linkifyAnchors(level, contentBlock);
+      }
+    }
+  };
+</script>
+
+
+</body>
+</html>
diff --git a/develop/design/lazy_filter/index.html 
b/develop/design/lazy_filter/index.html
new file mode 100644
index 000000000..a606d6842
--- /dev/null
+++ b/develop/design/lazy_filter/index.html
@@ -0,0 +1,800 @@
+<!DOCTYPE HTML>
+<html lang="en-US">
+<head>
+  <meta charset="UTF-8">
+  <title>Lazy Filter</title>
+  <meta name="viewport" content="width=device-width,initial-scale=1">
+  <meta name="generator" content="Jekyll v3.8.6">
+  <link rel="stylesheet" 
href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900">
+  <link rel="stylesheet" href="/css/screen.css">
+  <link rel="icon" type="image/x-icon" href="/favicon.ico">
+  <!--[if lt IE 9]>
+  <script src="/js/html5shiv.min.js"></script>
+  <script src="/js/respond.min.js"></script>
+  <![endif]-->
+</head>
+
+
+<body class="wrap">
+  <header role="banner">
+  <nav class="mobile-nav show-on-mobiles">
+    <ul>
+  <li class="">
+    <a href="/">Home</a>
+  </li>
+  <li class="">
+    <a href="/docs/"><span class="show-on-mobiles">Docs</span>
+                     <span class="hide-on-mobiles">Documentation</span></a>
+  </li>
+  <li class="">
+    <a href="/talks/">Talks</a>
+  </li>
+  <li class="">
+    <a href="/news/">News</a>
+  </li>
+  <li class="">
+    <a href="/help/">Help</a>
+  </li>
+  <li class="current">
+    <a href="/develop/">Develop</a>
+  </li>
+</ul>
+
+  </nav>
+  <div class="grid">
+    <div class="unit one-third center-on-mobiles">
+      <h1>
+        <a href="/">
+          <span class="sr-only">Apache ORC</span>
+          <img src="/img/logo.png" width="249" height="101" alt="ORC Logo">
+        </a>
+      </h1>
+    </div>
+    <nav class="main-nav unit two-thirds hide-on-mobiles">
+      <ul>
+  <li class="">
+    <a href="/">Home</a>
+  </li>
+  <li class="">
+    <a href="/docs/"><span class="show-on-mobiles">Docs</span>
+                     <span class="hide-on-mobiles">Documentation</span></a>
+  </li>
+  <li class="">
+    <a href="/talks/">Talks</a>
+  </li>
+  <li class="">
+    <a href="/news/">News</a>
+  </li>
+  <li class="">
+    <a href="/help/">Help</a>
+  </li>
+  <li class="current">
+    <a href="/develop/">Develop</a>
+  </li>
+</ul>
+
+    </nav>
+  </div>
+</header>
+
+
+  <section class="standalone">
+  <div class="grid">
+
+    <div class="unit whole">
+      <article>
+        <h1>Lazy Filter</h1>
+        <ul>
+  <li><a href="#Background">Background</a></li>
+  <li><a href="#Design">Design</a>
+    <ul>
+      <li><a href="#SArgtoFilter">SArg to Filter</a></li>
+      <li><a href="#Read">Read</a></li>
+    </ul>
+  </li>
+  <li><a href="#Configuration">Configuration</a></li>
+  <li><a href="#Tests">Tests</a></li>
+  <li><a href="#Appendix">Appendix</a>
+    <ul>
+      <li><a href="#Benchmarks">Benchmarks</a>
+        <ul>
+          <li><a href="#RowvsVector">Row vs Vector</a></li>
+          <li><a href="#NormalizationvsCompact">Normalization vs 
Compact</a></li>
+          <li><a href="#Summary">Summary</a></li>
+        </ul>
+      </li>
+    </ul>
+  </li>
+</ul>
+
+<h2 id="background-">Background <a id="Background"></a></h2>
+
+<p>This feature request started as a result of a needle in the haystack search 
that is performed with the following
+characteristics:</p>
+
+<ul>
+  <li>The search fields are not part of partition, bucket or sort 
specification.</li>
+  <li>The table is a very large table.</li>
+  <li>The result is very few rows compared to the scan size.</li>
+  <li>The search columns are a significant subset of selection columns in the 
query.</li>
+</ul>
+
+<p>Initial analysis showed that we could have a significant benefit by lazily 
reading the non-search columns only when we
+have a match. We explore the design and some benchmarks in subsequent 
sections.</p>
+
+<h2 id="design-">Design <a id="Design"></a></h2>
+
+<p>This builds further on <a 
href="https://issues.apache.org/jira/browse/ORC-577">ORC-577</a> which 
currently only restricts deserialization for some selected data types
+but does not improve on IO.</p>
+
+<p>On a high level the design includes the following components:</p>
+
+<div class="language-text highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>┌──────────────┐          ┌────────────────────────┐
+│              │          │          Read          │
+│              │          │                        │
+│              │          │     ┌────────────┐     │
+│SArg to Filter│─────────▶│     │Read Filter │     │
+│              │          │     │  Columns   │     │
+│              │          │     └────────────┘     │
+│              │          │            │           │
+└──────────────┘          │            ▼           │
+                          │     ┌────────────┐     │
+                          │     │Apply Filter│     │
+                          │     └────────────┘     │
+                          │            │           │
+                          │            ▼           │
+                          │     ┌────────────┐     │
+                          │     │Read Select │     │
+                          │     │  Columns   │     │
+                          │     └────────────┘     │
+                          │                        │
+                          │                        │
+                          └────────────────────────┘
+</code></pre></div></div>
+
+<ul>
+  <li><strong>SArg to Filter</strong>: Converts Search Arguments passed down 
into filters for efficient application during scans.</li>
+  <li><strong>Read</strong>: Performs the lazy read using the filters.
+    <ul>
+      <li><strong>Read Filter Columns</strong>: Read the filter columns from 
the file.</li>
+      <li><strong>Apply Filter</strong>: Apply the filter on the read filter 
columns.</li>
+      <li><strong>Read Select Columns</strong>: If the filter selects at least 
one row, then read the remaining columns.</li>
+    </ul>
+  </li>
+</ul>
+
+<h3 id="sarg-to-filter-">SArg to Filter <a id="SArgtoFilter"></a></h3>
+
+<p>SArg to Filter converts the passed SArg into a filter. This enables 
automatic compatibility with both Spark and Hive as
+they already push down Search Arguments down to ORC.</p>
+
+<p>The SArg is automatically converted into a <a 
href="https://github.com/apache/orc/tree/main/java/core/src/java/org/apache/orc/impl/filter/VectorFilter.java">Vector
 Filter</a>, which is applied during the read process. Two
+filter types were evaluated:</p>
+
+<ul>
+  <li><a 
href="https://github.com/apache/orc/tree/main/java/bench/core/src/java/org/apache/orc/impl/filter/RowFilter.java">Row
 Filter</a> that evaluates each row across all the predicates once.</li>
+  <li><a 
href="https://github.com/apache/orc/tree/main/java/core/src/java/org/apache/orc/impl/filter/VectorFilter.java">Vector
 Filter</a> that evaluates each filter across the entire vector and adjusts the 
subsequent evaluation.</li>
+</ul>
+
+<p>While a row based filter is easier to code, it is much <a 
href="#RowvsVector">slower</a> to process. We also see a significant
+<a href="#NormalizationvsCompact">performance gain</a> in the absence of 
normalization.</p>
+
+<p>The builder for search argument should allow skipping normalization during 
the <a 
href="https://github.com/apache/hive/blob/storage-branch-2.7/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgumentImpl.java#L491">build</a>.
 This has been added with
+<a href="https://issues.apache.org/jira/browse/HIVE-24458">HIVE-24458</a>.</p>
+
+<h3 id="read-">Read <a id="Read"></a></h3>
+
+<p>The read process has the following changes:</p>
+
+<div class="language-text highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>                         │
+                         │
+                         │
+┌────────────────────────▼────────────────────────┐
+│               ┏━━━━━━━━━━━━━━━━┓                │
+│               ┃Plan ++Search++ ┃                │
+│               ┃    Columns     ┃                │
+│               ┗━━━━━━━━━━━━━━━━┛                │
+│                 Read   │Stripe                  │
+└────────────────────────┼────────────────────────┘
+                         │
+                         ▼
+
+
+                         │
+                         │
+┌────────────────────────▼────────────────────────┐
+│               ┏━━━━━━━━━━━━━━━━┓                │
+│               ┃Read ++Search++ ┃                │
+│               ┃    Columns     ┃◀─────────┐     │
+│               ┗━━━━━━━━━━━━━━━━┛          │     │
+│                        │              Size = 0  │
+│                        ▼                  │     │
+│               ┏━━━━━━━━━━━━━━━━┓          │     │
+│               ┃  Apply Filter  ┃──────────┘     │
+│               ┗━━━━━━━━━━━━━━━━┛                │
+│                    Size &gt; 0                     │
+│                        │                        │
+│                        ▼                        │
+│               ┏━━━━━━━━━━━━━━━━┓                │
+│               ┃  Plan Select   ┃                │
+│               ┃    Columns     ┃                │
+│               ┗━━━━━━━━━━━━━━━━┛                │
+│                        │                        │
+│                        ▼                        │
+│               ┏━━━━━━━━━━━━━━━━┓                │
+│               ┃  Read Select   ┃                │
+│               ┃    Columns     ┃                │
+│               ┗━━━━━━━━━━━━━━━━┛                │
+│                   Next │Batch                   │
+└────────────────────────┼────────────────────────┘
+                         │
+                         ▼
+</code></pre></div></div>
+
+<p>The read process changes:</p>
+
+<ul>
+  <li><strong>Read Stripe</strong> used to plan the read of all (search + 
select) columns. This is enhanced to plan and fetch only the
+search columns. The rest of the stripe planning process optimizations remain 
unchanged e.g. partial read planning of
+the stripe based on RowGroup statistics.</li>
+  <li><strong>Next Batch</strong> identifies the processing that takes place 
when <code class="highlighter-rouge">RecordReader.nextBatch</code> is invoked.
+    <ul>
+      <li><strong>Read Search Columns</strong> takes place instead of reading 
all the selected columns. This is in sync with the planning
+that has taken place during <strong>Read Stripe</strong> where only the search 
columns have been planned.</li>
+      <li><strong>Apply Filter</strong> on the batch that at this point only 
includes search columns. Evaluate the result of the filter:
+        <ul>
+          <li><strong>Size = 0</strong> indicates all records have been 
filtered out. Given this we proceed to the next batch of search
+columns.</li>
+          <li><strong>Size &gt; 0</strong> indicates that at least one record 
was accepted by the filter. This record needs to be substantiated with
+other columns.</li>
+        </ul>
+      </li>
+      <li><strong>Plan Select Columns</strong> is invoked to perform read of 
the select columns. The planning happens as follows:
+        <ul>
+          <li>Determine the current position of the read within the stripe and 
plan the read for the select columns from this
+point forward to the end of the stripe.</li>
+          <li>The Read planning of select columns respects the row groups 
filtered out as a result of the stripe planning.</li>
+          <li>Fetch the select columns using the above plan.</li>
+        </ul>
+      </li>
+      <li><strong>Read Select Columns</strong> into the vectorized row 
batch</li>
+      <li>Return this batch.</li>
+    </ul>
+  </li>
+</ul>
+
+<p>The current implementation performs a single read for the select columns in 
a stripe.</p>
+
+<div class="language-text highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>┌──────────────────────────────────────────────────┐
+│ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ │
+│ │RG0 │ │RG1 │ │RG2■│ │RG3 │ │RG4 │ │RG5■│ │RG6 │ │
+│ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ │
+│                      Stripe                      │
+└──────────────────────────────────────────────────┘
+</code></pre></div></div>
+
+<p>The above diagram depicts a stripe with 7 Row Groups out of which 
<strong>RG2</strong> and <strong>RG5</strong> are selected by the filter. The
+current implementation does the following:</p>
+
+<ul>
+  <li>Start the read planning process from the first match RG2</li>
+  <li>Read to the end of the stripe that includes RG6</li>
+  <li>Based on the above, the fetch skips RG0 and RG1, subject to compression block 
boundaries</li>
+</ul>
+
+<p>The above logic could be enhanced to perform say <strong>2 or n</strong> 
reads before reading to the end of stripe. The current
+implementation allows 0 reads before reading to the end of the stripe. The 
value of <strong>n</strong> could be configurable but
+should avoid too many short reads.</p>
+
+<p>The read behavior changes as follows with multiple reads being allowed 
within a stripe for select columns:</p>
+
+<div class="language-text highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>┌──────────────────────────────────────────────────┐
+│ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ │
+│ │    │ │    │ │■■■■│ │■■■■│ │■■■■│ │■■■■│ │■■■■│ │
+│ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ │
+│              Current implementation              │
+└──────────────────────────────────────────────────┘
+┌──────────────────────────────────────────────────┐
+│ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌────┐ │
+│ │    │ │    │ │■■■■│ │    │ │    │ │■■■■│ │■■■■│ │
+│ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ └────┘ │
+│               Allow 1 partial read               │
+└──────────────────────────────────────────────────┘
+</code></pre></div></div>
+
+<p>The figure shows that we could read significantly fewer bytes by performing 
an additional read before reading to the end
+of stripe. This shall be included as a subsequent enhancement to this 
patch.</p>
+
+<h2 id="configuration-">Configuration <a id="Configuration"></a></h2>
+
+<p>The following configuration options are exposed that control the filter 
behavior:</p>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: left">Property</th>
+      <th style="text-align: left">Type</th>
+      <th style="text-align: left">Default</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: left">orc.sarg.to.filter</td>
+      <td style="text-align: left">boolean</td>
+      <td style="text-align: left">false</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">orc.filter.use.selected</td>
+      <td style="text-align: left">boolean</td>
+      <td style="text-align: left">false</td>
+    </tr>
+  </tbody>
+</table>
+
+<ul>
+  <li><code class="highlighter-rouge">orc.sarg.to.filter</code> can be used to 
turn off the SArg to filter conversion. This might be particularly relevant in
+cases where the filter is expensive and does not eliminate a lot of records. 
This will not be relevant once we have
+the option to turn off the filters on the caller as they have been completely 
implemented by the ORC layer.</li>
+  <li><code class="highlighter-rouge">orc.filter.use.selected</code> is an 
important setting that, if incorrectly enabled, results in wrong output. It is a 
boolean flag
+that indicates whether the selected vector is supported by the reading application. 
If false, the output of the ORC reader
+must have the filter reapplied to avoid using unset values in the unselected 
rows. If unsure, please leave this as
+false.</li>
+</ul>
+
+<h2 id="tests-">Tests <a id="Tests"></a></h2>
+
+<p>We evaluated this patch against a search job with the following stats:</p>
+
+<ul>
+  <li>Table
+    <ul>
+      <li>Size: ~<strong>420 TB</strong></li>
+      <li>Data fields: ~<strong>120</strong></li>
+      <li>Partition fields: <strong>3</strong></li>
+    </ul>
+  </li>
+  <li>Scan
+    <ul>
+      <li>Search fields: 3 data fields with large (~ 1000 value) IN clauses 
compounded by <strong>OR</strong>.</li>
+      <li>Select fields: 16 data fields (includes the 3 search fields), 1 
partition field</li>
+      <li>Search:
+        <ul>
+          <li>Size: ~<strong>180 TB</strong></li>
+          <li>Records: <strong>3.99 T</strong></li>
+        </ul>
+      </li>
+      <li>Selected:
+        <ul>
+          <li>Size: ~<strong>100 MB</strong></li>
+          <li>Records: <strong>1 M</strong></li>
+        </ul>
+      </li>
+    </ul>
+  </li>
+</ul>
+
+<p>We have observed the following reductions:</p>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: left">Test</th>
+      <th style="text-align: right">IO Reduction %</th>
+      <th style="text-align: right">CPU Reduction %</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: left">SELECT 16 cols</td>
+      <td style="text-align: right">45</td>
+      <td style="text-align: right">47</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SELECT *</td>
+      <td style="text-align: right">70</td>
+      <td style="text-align: right">87</td>
+    </tr>
+  </tbody>
+</table>
+
+<ul>
+  <li>The savings are more significant as you increase the number of select 
columns with respect to the search columns</li>
+  <li>When the filter selects most data, no significant penalty was observed as a 
result of 2 IO compared with a single IO
+    <ul>
+      <li>We do have a penalty as a result of the double filter application 
both in ORC and in the calling engine.</li>
+    </ul>
+  </li>
+</ul>
+
+<h2 id="appendix-">Appendix <a id="Appendix"></a></h2>
+
+<h3 id="benchmarks-">Benchmarks <a id="Benchmarks"></a></h3>
+
+<h4 id="row-vs-vector-">Row vs Vector <a id="RowvsVector"></a></h4>
+
+<p>We start with a decision of using a Row filter vs a Vector filter. The Row 
filter has the advantage of simpler code when
+compared with the Vector filter.</p>
+
+<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code>java <span class="nt">-jar</span> java/bench/core/target/orc-benchmarks-core-<span class="k">*</span><span class="nt">-uber</span>.jar filter simple
+</code></pre></div></div>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: left">Benchmark</th>
+      <th style="text-align: right">(fInSize)</th>
+      <th style="text-align: left">(fType)</th>
+      <th style="text-align: left">Mode</th>
+      <th style="text-align: right">Cnt</th>
+      <th style="text-align: right">Score</th>
+      <th style="text-align: left">Error</th>
+      <th style="text-align: left">Units</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">4</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">38.207</td>
+      <td style="text-align: left">± 0.178</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">4</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">18.663</td>
+      <td style="text-align: left">± 0.117</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">8</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">50.694</td>
+      <td style="text-align: left">± 0.313</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">8</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">35.532</td>
+      <td style="text-align: left">± 0.190</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">16</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">52.443</td>
+      <td style="text-align: left">± 0.268</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">16</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">33.966</td>
+      <td style="text-align: left">± 0.204</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">32</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">68.504</td>
+      <td style="text-align: left">± 0.318</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">32</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">51.707</td>
+      <td style="text-align: left">± 0.302</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">256</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">88.348</td>
+      <td style="text-align: left">± 0.793</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">SimpleFilter</td>
+      <td style="text-align: right">256</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">72.602</td>
+      <td style="text-align: left">± 0.282</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Explanation:</p>
+
+<ul>
+  <li><strong>fInSize</strong> calls out the number of values in the IN 
clause.</li>
+  <li><strong>fType</strong> calls out whether the filter is a row based 
filter, or a vector based filter.</li>
+</ul>
+
+<p>Observations:</p>
+
+<ul>
+  <li>The vector based filter is significantly faster than the row based 
filter.
+    <ul>
+      <li>At best, vector was faster by <strong>51.15%</strong></li>
+      <li>At worst, vector was faster by <strong>17.82%</strong></li>
+    </ul>
+  </li>
+  <li>The performance of the filters deteriorates with the increase of the 
IN values, however even in this case the
+vector filter is much better than the row filter. The current <code 
class="highlighter-rouge">IN</code> filter employs a binary search on an array 
instead
+of a hash lookup.</li>
+</ul>
+
+<h4 id="normalization-vs-compact-">Normalization vs Compact <a 
id="NormalizationvsCompact"></a></h4>
+
+<p>In this test we use a complex filter with both AND, and OR to understand 
the impact of Conjunctive Normal Form on the
+filter performance. The Search Argument builder by default performs a CNF. The 
advantage of the CNF would again be a
+simpler code base.</p>
+
+<div class="language-bash highlighter-rouge"><div class="highlight"><pre class="highlight"><code>java <span class="nt">-jar</span> java/bench/core/target/orc-benchmarks-core-<span class="k">*</span><span class="nt">-uber</span>.jar filter complex
+</code></pre></div></div>
+
+<table>
+  <thead>
+    <tr>
+      <th style="text-align: left">Benchmark</th>
+      <th style="text-align: right">(fSize)</th>
+      <th style="text-align: left">(fType)</th>
+      <th style="text-align: left">(normalize)</th>
+      <th style="text-align: left">Mode</th>
+      <th style="text-align: right">Cnt</th>
+      <th style="text-align: right">Score</th>
+      <th style="text-align: left">Error</th>
+      <th style="text-align: left">Units</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">2</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">true</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">91.922</td>
+      <td style="text-align: left">± 0.301</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">2</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">false</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">90.741</td>
+      <td style="text-align: left">± 0.556</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">2</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">true</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">61.137</td>
+      <td style="text-align: left">± 0.398</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">2</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">false</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">54.829</td>
+      <td style="text-align: left">± 0.431</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">4</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">true</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">284.956</td>
+      <td style="text-align: left">± 1.237</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">4</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">false</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">130.526</td>
+      <td style="text-align: left">± 0.767</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">4</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">true</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">242.387</td>
+      <td style="text-align: left">± 1.053</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">4</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">false</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">98.530</td>
+      <td style="text-align: left">± 0.423</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">8</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">true</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">8007.101</td>
+      <td style="text-align: left">± 54.912</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">8</td>
+      <td style="text-align: left">row</td>
+      <td style="text-align: left">false</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">234.943</td>
+      <td style="text-align: left">± 4.713</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">8</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">true</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">7013.758</td>
+      <td style="text-align: left">± 33.701</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+    <tr>
+      <td style="text-align: left">ComplexFilter</td>
+      <td style="text-align: right">8</td>
+      <td style="text-align: left">vector</td>
+      <td style="text-align: left">false</td>
+      <td style="text-align: left">avgt</td>
+      <td style="text-align: right">20</td>
+      <td style="text-align: right">190.442</td>
+      <td style="text-align: left">± 0.881</td>
+      <td style="text-align: left">us/op</td>
+    </tr>
+  </tbody>
+</table>
+
+<p>Explanation:</p>
+
+<ul>
+  <li><strong>fSize</strong> identifies the size of the children in the OR clause that will be normalized.</li>
+  <li><strong>normalize</strong> identifies whether normalize was carried out on the Search Argument.</li>
+</ul>
+
+<p>Observations:</p>
+
+<ul>
+  <li>Vector filter is better than the row filter as demonstrated by the <a href="#RowvsVector">Row vs Vector Test</a>.</li>
+  <li>Normalizing the search argument results in a significant performance penalty given the explosion of the operator tree
+    <ul>
+      <li>In the case where an AND includes 8 ORs, the compact version is faster by <strong>97.29%</strong></li>
+    </ul>
+  </li>
+</ul>
+
+<h4 id="summary-">Summary <a id="Summary"></a></h4>
+
+<p>Based on the benchmarks we have the following conclusions:</p>
+
+<ul>
+  <li>Vector based filter is significantly better than a row based filter and justifies the more complex code.</li>
+  <li>Compact filter is significantly faster than a normalized filter.</li>
+</ul>
+
+
+      </article>
+    </div>
+
+    <div class="clear"></div>
+
+  </div>
+</section>
+
+
+  <footer role="contentinfo">
+  <p>The contents of this website are &copy;&nbsp;2022
+     <a href="https://www.apache.org/">Apache Software Foundation</a>
+     under the terms of the <a
+      href="https://www.apache.org/licenses/LICENSE-2.0.html">
+      Apache&nbsp;License&nbsp;v2</a>. Apache ORC and its logo are trademarks
+      of the Apache Software Foundation.</p>
+</footer>
+
+  <script>
+  var anchorForId = function (id) {
+    var anchor = document.createElement("a");
+    anchor.className = "header-link";
+    anchor.href      = "#" + id;
+    anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa 
fa-link\"></i>";
+    anchor.title = "Permalink";
+    return anchor;
+  };
+
+  var linkifyAnchors = function (level, containingElement) { // append a permalink anchor to every heading of the given level inside containingElement
+    var headers = containingElement.getElementsByTagName("h" + level); // tag name is "h" followed by the level number
+    for (var h = 0; h < headers.length; h++) {
+      var header = headers[h];
+
+      if (typeof header.id !== "undefined" && header.id !== "") { // skip headings with no id: there is no fragment to link to
+        header.appendChild(anchorForId(header.id)); // anchorForId builds the link element for this id
+      }
+    }
+  };
+
+  document.onreadystatechange = function () {
+    if (this.readyState === "complete") {
+      var contentBlock = document.getElementsByClassName("docs")[0] || 
document.getElementsByClassName("news")[0];
+      if (!contentBlock) {
+        return;
+      }
+      for (var level = 1; level <= 6; level++) {
+        linkifyAnchors(level, contentBlock);
+      }
+    }
+  };
+</script>
+
+
+</body>
+</html>
diff --git a/develop/index.html b/develop/index.html
index 8edf7f005..6db99a372 100644
--- a/develop/index.html
+++ b/develop/index.html
@@ -122,6 +122,9 @@ with archive <a href="https://mail-archives.apache.org/mod_mbox/orc-commits/">he
 <p>Each code change requires a <a href="https://issues.apache.org/jira/browse/ORC">jira</a> to track the
 discussion of the change.</p>
 
+<h2 id="design">Design</h2>
+<p>Some code changes provide <a href="design">design/additional documentation</a>.</p>
+
 <h2 id="source-code">Source code</h2>
 
 <p>ORC uses git for version control. Get the source code and configure it

Reply via email to