http://git-wip-us.apache.org/repos/asf/orc/blob/6a400548/docs/compression.html ---------------------------------------------------------------------- diff --git a/docs/compression.html b/docs/compression.html new file mode 100644 index 0000000..f714469 --- /dev/null +++ b/docs/compression.html @@ -0,0 +1,1005 @@ +<!DOCTYPE HTML> +<html lang="en-US"> +<head> + <meta charset="UTF-8"> + <title>Compression</title> + <meta name="viewport" content="width=device-width,initial-scale=1"> + <meta name="generator" content="Jekyll v2.4.0"> + <link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900"> + <link rel="stylesheet" href="/css/screen.css"> + <link rel="icon" type="image/x-icon" href="/favicon.ico"> + <!--[if lt IE 9]> + <script src="/js/html5shiv.min.js"></script> + <script src="/js/respond.min.js"></script> + <![endif]--> +</head> + + +<body class="wrap"> + <header role="banner"> + <nav class="mobile-nav show-on-mobiles"> + <ul> + <li class=""> + <a href="/">Home</a> + </li> + <li class="current"> + <a href="/docs/">Documentation</a> + </li> + <li class=""> + <a href="/talks/">Talks</a> + </li> + <li class=""> + <a href="/news/">News</a> + </li> + <li class=""> + <a href="/help/">Help</a> + </li> + <li class=""> + <a href="/develop/">Develop</a> + </li> +</ul> + + </nav> + <div class="grid"> + <div class="unit one-third center-on-mobiles"> + <h1> + <a href="/"> + <span class="sr-only">Apache ORC</span> + <img src="/img/logo.png" width="249" height="115" alt="ORC Logo"> + </a> + </h1> + </div> + <nav class="main-nav unit two-thirds hide-on-mobiles"> + <ul> + <li class=""> + <a href="/">Home</a> + </li> + <li class="current"> + <a href="/docs/">Documentation</a> + </li> + <li class=""> + <a href="/talks/">Talks</a> + </li> + <li class=""> + <a href="/news/">News</a> + </li> + <li class=""> + <a href="/help/">Help</a> + </li> + <li class=""> + <a href="/develop/">Develop</a> + </li> +</ul> + + </nav> + </div> 
+</header> + + + <section class="docs"> + <div class="grid"> + + <div class="docs-nav-mobile unit whole show-on-mobiles"> + <select onchange="if (this.value) window.location.href=this.value"> + <option value="">Navigate the docsâ¦</option> + + <optgroup label="Overview"> + + + + + + + + + + + + + + + + + + + + <option value="/docs/index.html">Background</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/types.html">Types</option> + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/indexes.html">Indexes</option> + + + + + + + + + + + + + + + + + + <option value="/docs/acid.html">ACID support</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + </optgroup> + + <optgroup label="Hive Usage"> + + + + + + + + + + + + + + + + + + <option value="/docs/hive-ddl.html">Hive DDL</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/hive-config.html">Hive Configuration</option> + + + + + + + + + + + + + + + + + + + + + </optgroup> + + <optgroup label="Format Specification"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/spec-intro.html">Introduction</option> + + + + + + + + + + + + + + + + + + <option value="/docs/file-tail.html">File Tail</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/compression.html">Compression</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/run-length.html">Run Length Encoding</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/stripes.html">Stripes</option> + + + + + + + + + + + + + + <option value="/docs/encodings.html">Column Encodings</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/spec-index.html">Indexes</option> + + + + + + + + + + + </optgroup> + + 
</select> +</div> + + + <div class="unit four-fifths"> + <article> + <h1>Compression</h1> + <p>If the ORC file writer selects a generic compression codec (zlib or +snappy), every part of the ORC file except for the Postscript is +compressed with that codec. However, one of the requirements for ORC +is that the reader be able to skip over compressed bytes without +decompressing the entire stream. To manage this, ORC writes compressed +streams in chunks with headers as in the figure below. +To handle incompressible data, if the compressed data is larger than +the original, the original is stored and the isOriginal flag is +set. Each header is 3 bytes long with (compressedLength * 2 + +isOriginal) stored as a little endian value. For example, the header +for a chunk that compressed to 100,000 bytes would be [0x40, 0x0d, +0x03]. The header for 5 bytes that did not compress would be [0x0b, +0x00, 0x00]. Each compression chunk is compressed independently so +that as long as a decompressor starts at the top of a header, it can +start decompressing without the previous bytes.</p> + +<p><img src="/img/CompressionStream.png" alt="compression streams" /></p> + +<p>The default compression chunk size is 256K, but writers can choose +their own value less than 2<sup>23</sup>. Larger chunks lead to better +compression, but require more memory. 
The chunk size is recorded in +the Postscript so that readers can allocate appropriately sized +buffers.</p> + +<p>ORC files without generic compression write each stream directly +with no headers.</p> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <div class="section-nav"> + <div class="left align-right"> + + + + <a href="/docs/file-tail.html" class="prev">Back</a> + + </div> + <div class="right align-left"> + + + + <a href="/docs/run-length.html" class="next">Next</a> + + </div> + </div> + <div class="clear"></div> + + + </article> + </div> + + <div class="unit one-fifth hide-on-mobiles"> + <aside> + + <h4>Overview</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/index.html">Background</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/types.html">Types</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/indexes.html">Indexes</a></li> + + + + + + + + + + + + <li class=""><a href="/docs/acid.html">ACID support</a></li> + + + +</ul> + + + <h4>Hive Usage</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li> + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/hive-config.html">Hive Configuration</a></li> + + + +</ul> + + + <h4>Format Specification</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/spec-intro.html">Introduction</a></li> + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/file-tail.html">File Tail</a></li> + + + + + + + + + + + + + + <li class="current"><a href="/docs/compression.html">Compression</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/run-length.html">Run Length Encoding</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/stripes.html">Stripes</a></li> + + 
+ + + + + + + + + + + + + + <li class=""><a href="/docs/encodings.html">Column Encodings</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/spec-index.html">Indexes</a></li> + + + +</ul> + + + </aside> +</div> + + + <div class="clear"></div> + + </div> + </section> + + + <footer role="contentinfo"> + <p>The contents of this website are © 2015 + <a href="https://www.apache.org/">Apache Software Foundation</a> + under the terms of the <a + href="https://www.apache.org/licenses/LICENSE-2.0.html"> + Apache License v2</a>. Apache ORC and its logo are trademarks + of the Apache Software Foundation.</p> +</footer> + + <script> + var anchorForId = function (id) { + var anchor = document.createElement("a"); + anchor.className = "header-link"; + anchor.href = "#" + id; + anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\"></i>"; + anchor.title = "Permalink"; + return anchor; + }; + + var linkifyAnchors = function (level, containingElement) { + var headers = containingElement.getElementsByTagName("h" + level); + for (var h = 0; h < headers.length; h++) { + var header = headers[h]; + + if (typeof header.id !== "undefined" && header.id !== "") { + header.appendChild(anchorForId(header.id)); + } + } + }; + + document.onreadystatechange = function () { + if (this.readyState === "complete") { + var contentBlock = document.getElementsByClassName("docs")[0] || document.getElementsByClassName("news")[0]; + if (!contentBlock) { + return; + } + for (var level = 1; level <= 6; level++) { + linkifyAnchors(level, contentBlock); + } + } + }; +</script> + + +</body> +</html>
http://git-wip-us.apache.org/repos/asf/orc/blob/6a400548/docs/encodings.html ---------------------------------------------------------------------- diff --git a/docs/encodings.html b/docs/encodings.html new file mode 100644 index 0000000..d7a0c73 --- /dev/null +++ b/docs/encodings.html @@ -0,0 +1,1600 @@ +<!DOCTYPE HTML> +<html lang="en-US"> +<head> + <meta charset="UTF-8"> + <title>Column Encodings</title> + <meta name="viewport" content="width=device-width,initial-scale=1"> + <meta name="generator" content="Jekyll v2.4.0"> + <link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900"> + <link rel="stylesheet" href="/css/screen.css"> + <link rel="icon" type="image/x-icon" href="/favicon.ico"> + <!--[if lt IE 9]> + <script src="/js/html5shiv.min.js"></script> + <script src="/js/respond.min.js"></script> + <![endif]--> +</head> + + +<body class="wrap"> + <header role="banner"> + <nav class="mobile-nav show-on-mobiles"> + <ul> + <li class=""> + <a href="/">Home</a> + </li> + <li class="current"> + <a href="/docs/">Documentation</a> + </li> + <li class=""> + <a href="/talks/">Talks</a> + </li> + <li class=""> + <a href="/news/">News</a> + </li> + <li class=""> + <a href="/help/">Help</a> + </li> + <li class=""> + <a href="/develop/">Develop</a> + </li> +</ul> + + </nav> + <div class="grid"> + <div class="unit one-third center-on-mobiles"> + <h1> + <a href="/"> + <span class="sr-only">Apache ORC</span> + <img src="/img/logo.png" width="249" height="115" alt="ORC Logo"> + </a> + </h1> + </div> + <nav class="main-nav unit two-thirds hide-on-mobiles"> + <ul> + <li class=""> + <a href="/">Home</a> + </li> + <li class="current"> + <a href="/docs/">Documentation</a> + </li> + <li class=""> + <a href="/talks/">Talks</a> + </li> + <li class=""> + <a href="/news/">News</a> + </li> + <li class=""> + <a href="/help/">Help</a> + </li> + <li class=""> + <a href="/develop/">Develop</a> + </li> +</ul> + + </nav> + </div> 
+</header> + + + <section class="docs"> + <div class="grid"> + + <div class="docs-nav-mobile unit whole show-on-mobiles"> + <select onchange="if (this.value) window.location.href=this.value"> + <option value="">Navigate the docsâ¦</option> + + <optgroup label="Overview"> + + + + + + + + + + + + + + + + + + + + <option value="/docs/index.html">Background</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/types.html">Types</option> + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/indexes.html">Indexes</option> + + + + + + + + + + + + + + + + + + <option value="/docs/acid.html">ACID support</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + </optgroup> + + <optgroup label="Hive Usage"> + + + + + + + + + + + + + + + + + + <option value="/docs/hive-ddl.html">Hive DDL</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/hive-config.html">Hive Configuration</option> + + + + + + + + + + + + + + + + + + + + + </optgroup> + + <optgroup label="Format Specification"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/spec-intro.html">Introduction</option> + + + + + + + + + + + + + + + + + + <option value="/docs/file-tail.html">File Tail</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/compression.html">Compression</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/run-length.html">Run Length Encoding</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/stripes.html">Stripes</option> + + + + + + + + + + + + + + <option value="/docs/encodings.html">Column Encodings</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/spec-index.html">Indexes</option> + + + + + + + + + + + </optgroup> + + 
</select> +</div> + + + <div class="unit four-fifths"> + <article> + <h1>Column Encodings</h1> + <h2 id="smallint-int-and-bigint-columns">SmallInt, Int, and BigInt Columns</h2> + +<p>All of the 16, 32, and 64 bit integer column types use the same set of +potential encodings, which is basically whether they use RLE v1 or +v2. If the PRESENT stream is not included, all of the values are +present. For values that have false bits in the present stream, no +values are included in the data stream.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Signed Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DIRECT_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Signed Integer RLE v2</td> + </tr> + </tbody> +</table> + +<h2 id="float-and-double-columns">Float and Double Columns</h2> + +<p>Floating point types are stored using IEEE 754 floating point bit +layout. 
Float columns use 4 bytes per value and double columns use 8 +bytes.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">IEEE 754 floating point representation</td> + </tr> + </tbody> +</table> + +<h2 id="string-char-and-varchar-columns">String, Char, and VarChar Columns</h2> + +<p>String columns are adaptively encoded based on whether the first +10,000 values are sufficiently distinct. In all of the encodings, the +PRESENT stream encodes whether the value is null.</p> + +<p>For direct encoding the UTF-8 bytes are saved in the DATA stream and +the length of each value is written into the LENGTH stream. In direct +encoding, if the values were [“Nevada”, “California”]; the DATA +would be “NevadaCalifornia” and the LENGTH would be [6, 10].</p> + +<p>For dictionary encodings the dictionary is sorted and UTF-8 bytes of +each unique value are placed into DICTIONARY_DATA. The length of each +item in the dictionary is put into the LENGTH stream. The DATA stream +consists of the sequence of references to the dictionary elements.</p> + +<p>In dictionary encoding, if the values were [“Nevada”, +“California”, “Nevada”, “California”, and “Florida”]; the +DICTIONARY_DATA would be “CaliforniaFloridaNevada” and LENGTH would +be [10, 7, 6]. 
The DATA would be [2, 0, 2, 0, 1].</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">String contents</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DICTIONARY</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DICTIONARY_DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">String contents</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DIRECT_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">String contents</td> + 
</tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v2</td> + </tr> + <tr> + <td style="text-align: left">DICTIONARY_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v2</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DICTIONARY_DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">String contents</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v2</td> + </tr> + </tbody> +</table> + +<h2 id="boolean-columns">Boolean Columns</h2> + +<p>Boolean columns are rare, but have a simple encoding.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + </tbody> +</table> + +<h2 id="tinyint-columns">TinyInt Columns</h2> + +<p>TinyInt (byte) columns use byte run length encoding.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th 
style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Byte RLE</td> + </tr> + </tbody> +</table> + +<h2 id="binary-columns">Binary Columns</h2> + +<p>Binary data is encoded with a PRESENT stream, a DATA stream that records +the contents, and a LENGTH stream that records the number of bytes per +value.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">String contents</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DIRECT_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">String contents</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td 
style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v2</td> + </tr> + </tbody> +</table> + +<h2 id="decimal-columns">Decimal Columns</h2> + +<p>Decimal was introduced in Hive 0.11 with infinite precision (the total +number of digits). In Hive 0.13, the definition was changed to limit +the precision to a maximum of 38 digits, which conveniently uses 127 +bits plus a sign bit. The current encoding of decimal columns stores +the integer representation of the value as an unbounded length zigzag +encoded base 128 varint. The scale is stored in the SECONDARY stream +as an unsigned integer.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unbounded base 128 varints</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">SECONDARY</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DIRECT_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unbounded base 128 varints</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">SECONDARY</td> + <td style="text-align: left">No</td> + <td style="text-align: 
left">Unsigned Integer RLE v2</td> + </tr> + </tbody> +</table> + +<h2 id="date-columns">Date Columns</h2> + +<p>Date data is encoded with a PRESENT stream, a DATA stream that records +the number of days after January 1, 1970 in UTC.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Signed Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DIRECT_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Signed Integer RLE v2</td> + </tr> + </tbody> +</table> + +<h2 id="timestamp-columns">Timestamp Columns</h2> + +<p>Timestamp records times down to nanoseconds as a PRESENT stream that +records non-null values, a DATA stream that records the number of +seconds after 1 January 2015, and a SECONDARY stream that records the +number of nanoseconds.</p> + +<p>Because the number of nanoseconds often has a large number of trailing +zeros, the number has trailing decimal zero digits removed and the +last three bits are used to record how many zeros were removed. 
Thus +1000 nanoseconds would be serialized as 0x0b and 100000 would be +serialized as 0x0d.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Signed Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">SECONDARY</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DIRECT_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DATA</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Signed Integer RLE v2</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">SECONDARY</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v2</td> + </tr> + </tbody> +</table> + +<h2 id="struct-columns">Struct Columns</h2> + +<p>Structs have no data themselves and delegate everything to their child +columns except for their PRESENT stream. 
They have a child column +for each of the fields.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + </tbody> +</table> + +<h2 id="list-columns">List Columns</h2> + +<p>Lists are encoded as the PRESENT stream and a length stream with +number of items in each list. They have a single child column for the +element values.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DIRECT_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v2</td> + </tr> + </tbody> +</table> + +<h2 id="map-columns">Map Columns</h2> + +<p>Maps are encoded as the PRESENT stream and a length stream with number +of items in each map. 
They have a child column for the key and +another child column for the value.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v1</td> + </tr> + <tr> + <td style="text-align: left">DIRECT_V2</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">LENGTH</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Unsigned Integer RLE v2</td> + </tr> + </tbody> +</table> + +<h2 id="union-columns">Union Columns</h2> + +<p>Unions are encoded as the PRESENT stream and a tag stream that controls which +potential variant is used. They have a child column for each variant of the +union. 
Currently ORC union types are limited to 256 variants, which matches +the Hive type model.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Encoding</th> + <th style="text-align: left">Stream Kind</th> + <th style="text-align: left">Optional</th> + <th style="text-align: left">Contents</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">PRESENT</td> + <td style="text-align: left">Yes</td> + <td style="text-align: left">Boolean RLE</td> + </tr> + <tr> + <td style="text-align: left"> </td> + <td style="text-align: left">DIRECT</td> + <td style="text-align: left">No</td> + <td style="text-align: left">Byte RLE</td> + </tr> + </tbody> +</table> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <div class="section-nav"> + <div class="left align-right"> + + + + <a href="/docs/stripes.html" class="prev">Back</a> + + </div> + <div class="right align-left"> + + + + <a href="/docs/spec-index.html" class="next">Next</a> + + </div> + </div> + <div class="clear"></div> + + + </article> + </div> + + <div class="unit one-fifth hide-on-mobiles"> + <aside> + + <h4>Overview</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/index.html">Background</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/types.html">Types</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/indexes.html">Indexes</a></li> + + + + + + + + + + + + <li class=""><a href="/docs/acid.html">ACID support</a></li> + + + +</ul> + + + <h4>Hive Usage</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li> + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/hive-config.html">Hive Configuration</a></li> + + + +</ul> + + + <h4>Format Specification</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + <li class=""><a href="/docs/spec-intro.html">Introduction</a></li> + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/file-tail.html">File Tail</a></li> + + + + + + + + + + + + + + <li class=""><a href="/docs/compression.html">Compression</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/run-length.html">Run Length Encoding</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/stripes.html">Stripes</a></li> + + + + + + + + + + + + + + + + <li class="current"><a href="/docs/encodings.html">Column Encodings</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/spec-index.html">Indexes</a></li> + + + +</ul> + + + </aside> +</div> + + + <div class="clear"></div> + + </div> + </section> + + + <footer role="contentinfo"> + <p>The contents of this website are © 2015 + <a href="https://www.apache.org/">Apache Software Foundation</a> + under the terms of the <a + href="https://www.apache.org/licenses/LICENSE-2.0.html"> + Apache License v2</a>. 
Apache ORC and its logo are trademarks + of the Apache Software Foundation.</p> +</footer> + + <script> + var anchorForId = function (id) { + var anchor = document.createElement("a"); + anchor.className = "header-link"; + anchor.href = "#" + id; + anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\"></i>"; + anchor.title = "Permalink"; + return anchor; + }; + + var linkifyAnchors = function (level, containingElement) { + var headers = containingElement.getElementsByTagName("h" + level); + for (var h = 0; h < headers.length; h++) { + var header = headers[h]; + + if (typeof header.id !== "undefined" && header.id !== "") { + header.appendChild(anchorForId(header.id)); + } + } + }; + + document.onreadystatechange = function () { + if (this.readyState === "complete") { + var contentBlock = document.getElementsByClassName("docs")[0] || document.getElementsByClassName("news")[0]; + if (!contentBlock) { + return; + } + for (var level = 1; level <= 6; level++) { + linkifyAnchors(level, contentBlock); + } + } + }; +</script> + + +</body> +</html> http://git-wip-us.apache.org/repos/asf/orc/blob/6a400548/docs/file-tail.html ---------------------------------------------------------------------- diff --git a/docs/file-tail.html b/docs/file-tail.html new file mode 100644 index 0000000..dc29a0c --- /dev/null +++ b/docs/file-tail.html @@ -0,0 +1,1288 @@ +<!DOCTYPE HTML> +<html lang="en-US"> +<head> + <meta charset="UTF-8"> + <title>File Tail</title> + <meta name="viewport" content="width=device-width,initial-scale=1"> + <meta name="generator" content="Jekyll v2.4.0"> + <link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900"> + <link rel="stylesheet" href="/css/screen.css"> + <link rel="icon" type="image/x-icon" href="/favicon.ico"> + <!--[if lt IE 9]> + <script src="/js/html5shiv.min.js"></script> + <script src="/js/respond.min.js"></script> + <![endif]--> +</head> + + +<body class="wrap"> + 
<header role="banner"> + <nav class="mobile-nav show-on-mobiles"> + <ul> + <li class=""> + <a href="/">Home</a> + </li> + <li class="current"> + <a href="/docs/">Documentation</a> + </li> + <li class=""> + <a href="/talks/">Talks</a> + </li> + <li class=""> + <a href="/news/">News</a> + </li> + <li class=""> + <a href="/help/">Help</a> + </li> + <li class=""> + <a href="/develop/">Develop</a> + </li> +</ul> + + </nav> + <div class="grid"> + <div class="unit one-third center-on-mobiles"> + <h1> + <a href="/"> + <span class="sr-only">Apache ORC</span> + <img src="/img/logo.png" width="249" height="115" alt="ORC Logo"> + </a> + </h1> + </div> + <nav class="main-nav unit two-thirds hide-on-mobiles"> + <ul> + <li class=""> + <a href="/">Home</a> + </li> + <li class="current"> + <a href="/docs/">Documentation</a> + </li> + <li class=""> + <a href="/talks/">Talks</a> + </li> + <li class=""> + <a href="/news/">News</a> + </li> + <li class=""> + <a href="/help/">Help</a> + </li> + <li class=""> + <a href="/develop/">Develop</a> + </li> +</ul> + + </nav> + </div> +</header> + + + <section class="docs"> + <div class="grid"> + + <div class="docs-nav-mobile unit whole show-on-mobiles"> + <select onchange="if (this.value) window.location.href=this.value"> + <option value="">Navigate the docsâ¦</option> + + <optgroup label="Overview"> + + + + + + + + + + + + + + + + + + + + <option value="/docs/index.html">Background</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/types.html">Types</option> + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/indexes.html">Indexes</option> + + + + + + + + + + + + + + + + + + <option value="/docs/acid.html">ACID support</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + </optgroup> + + <optgroup label="Hive Usage"> + + + + + + + + + + + + + + + + + + <option value="/docs/hive-ddl.html">Hive DDL</option> + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + <option value="/docs/hive-config.html">Hive Configuration</option> + + + + + + + + + + + + + + + + + + + + + </optgroup> + + <optgroup label="Format Specification"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/spec-intro.html">Introduction</option> + + + + + + + + + + + + + + + + + + <option value="/docs/file-tail.html">File Tail</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/compression.html">Compression</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/run-length.html">Run Length Encoding</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/stripes.html">Stripes</option> + + + + + + + + + + + + + + <option value="/docs/encodings.html">Column Encodings</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/spec-index.html">Indexes</option> + + + + + + + + + + + </optgroup> + + </select> +</div> + + + <div class="unit four-fifths"> + <article> + <h1>File Tail</h1> + <p>Since HDFS does not support changing the data in a file after it is +written, ORC stores the top level index at the end of the file. The +overall structure of the file is given in the figure above. The +fileâs tail consists of 3 parts; the file metadata, file footer and +postscript.</p> + +<p>The metadata for ORC is stored using +<a href="http://s.apache.org/protobuf_encoding">Protocol Buffers</a>, which provides +the ability to add new fields without breaking readers. 
This document +incorporates the Protobuf definition from the +<a href="http://s.apache.org/orc_proto">ORC source code</a> and the +reader is encouraged to review the Protobuf encoding if they need to +understand the byte-level encoding.</p> + +<h1 id="postscript">Postscript</h1> + +<p>The Postscript section provides the necessary information to interpret +the rest of the file including the length of the file’s Footer and +Metadata sections, the version of the file, and the kind of general +compression used (e.g. none, zlib, or snappy). The Postscript is never +compressed and ends one byte before the end of the file. The version +stored in the Postscript is the lowest version of Hive that is +guaranteed to be able to read the file and it is stored as a sequence of +the major and minor version. There are currently two versions that are +used: [0,11] for Hive 0.11, and [0,12] for Hive 0.12 or later.</p> + +<p>The process of reading an ORC file works backwards through the +file. Rather than making multiple short reads, the ORC reader reads +the last 16k bytes of the file with the hope that it will contain both +the Footer and Postscript sections. The final byte of the file +contains the serialized length of the Postscript, which must be less +than 256 bytes. 
Once the Postscript is parsed, the compressed +serialized length of the Footer is known and it can be decompressed +and parsed.</p> + +<p><code>message PostScript { + // the length of the footer section in bytes + optional uint64 footerLength = 1; + // the kind of generic compression used + optional CompressionKind compression = 2; + // the maximum size of each compression chunk + optional uint64 compressionBlockSize = 3; + // the version of the writer + repeated uint32 version = 4 [packed = true]; + // the length of the metadata section in bytes + optional uint64 metadataLength = 5; + // the fixed string "ORC" + optional string magic = 8000; +} +</code></p> + +<p><code>enum CompressionKind { + NONE = 0; + ZLIB = 1; + SNAPPY = 2; + LZO = 3; +} +</code></p> + +<h1 id="footer">Footer</h1> + +<p>The Footer section contains the layout of the body of the file, the +type schema information, the number of rows, and the statistics about +each of the columns.</p> + +<p>The file is broken into three parts: Header, Body, and Tail. The +Header consists of the bytes “ORC” to support tools that want to +scan the front of the file to determine the type of the file. 
The Body +contains the rows and indexes, and the Tail gives the file level +information as described in this section.</p> + +<p><code>message Footer { + // the length of the file header in bytes (always 3) + optional uint64 headerLength = 1; + // the length of the file header and body in bytes + optional uint64 contentLength = 2; + // the information about the stripes + repeated StripeInformation stripes = 3; + // the schema information + repeated Type types = 4; + // the user metadata that was added + repeated UserMetadataItem metadata = 5; + // the total number of rows in the file + optional uint64 numberOfRows = 6; + // the statistics of each column across the file + repeated ColumnStatistics statistics = 7; + // the maximum number of rows in each index entry + optional uint32 rowIndexStride = 8; +} +</code></p> + +<h2 id="stripe-information">Stripe Information</h2> + +<p>The body of the file is divided into stripes. Each stripe is self +contained and may be read using only its own bytes combined with the +file’s Footer and Postscript. Each stripe contains only entire rows so +that rows never straddle stripe boundaries. Stripes have three +sections: a set of indexes for the rows within the stripe, the data +itself, and a stripe footer. Both the indexes and the data sections +are divided by columns so that only the data for the required columns +needs to be read.</p> + +<p><code>message StripeInformation { + // the start of the stripe within the file + optional uint64 offset = 1; + // the length of the indexes in bytes + optional uint64 indexLength = 2; + // the length of the data in bytes + optional uint64 dataLength = 3; + // the length of the footer in bytes + optional uint64 footerLength = 4; + // the number of rows in the stripe + optional uint64 numberOfRows = 5; +} +</code></p> + +<h2 id="type-information">Type Information</h2> + +<p>All of the rows in an ORC file must have the same schema. 
Logically +the schema is expressed as a tree as in the figure below, where +the compound types have subcolumns under them.</p> + +<p><img src="/img/TreeWriters.png" alt="ORC column structure" /></p> + +<p>The equivalent Hive DDL would be:</p> + +<p><code>create table Foobar ( + myInt int, + myMap map&lt;string, + struct&lt;myString : string, + myDouble: double&gt;&gt;, + myTime timestamp +); +</code></p> + +<p>The type tree is flattened into a list via a pre-order traversal +where each type is assigned the next id. Clearly the root of the type +tree is always type id 0. Compound types have a field named subtypes +that contains the list of their children’s type ids.</p> + +<p><code>message Type { + enum Kind { + BOOLEAN = 0; + BYTE = 1; + SHORT = 2; + INT = 3; + LONG = 4; + FLOAT = 5; + DOUBLE = 6; + STRING = 7; + BINARY = 8; + TIMESTAMP = 9; + LIST = 10; + MAP = 11; + STRUCT = 12; + UNION = 13; + DECIMAL = 14; + DATE = 15; + VARCHAR = 16; + CHAR = 17; + } + // the kind of this type + required Kind kind = 1; + // the type ids of any subcolumns for list, map, struct, or union + repeated uint32 subtypes = 2 [packed=true]; + // the list of field names for struct + repeated string fieldNames = 3; + // the maximum length of the type for varchar or char + optional uint32 maximumLength = 4; + // the precision and scale for decimal + optional uint32 precision = 5; + optional uint32 scale = 6; +} +</code></p> + +<h2 id="column-statistics">Column Statistics</h2> + +<p>The goal of the column statistics is that for each column, the writer +records the count and depending on the type other useful fields. For +most of the primitive types, it records the minimum and maximum +values; and for numeric types it additionally stores the sum. +From Hive 1.1.0 onwards, the column statistics will also record if +there are any null values within the row group by setting the hasNull flag. 
+The hasNull flag is used by ORC’s predicate pushdown to better answer +“IS NULL” queries.</p> + +<p><code>message ColumnStatistics { + // the number of values + optional uint64 numberOfValues = 1; + // At most one of these has a value for any column + optional IntegerStatistics intStatistics = 2; + optional DoubleStatistics doubleStatistics = 3; + optional StringStatistics stringStatistics = 4; + optional BucketStatistics bucketStatistics = 5; + optional DecimalStatistics decimalStatistics = 6; + optional DateStatistics dateStatistics = 7; + optional BinaryStatistics binaryStatistics = 8; + optional TimestampStatistics timestampStatistics = 9; + optional bool hasNull = 10; +} +</code></p> + +<p>For integer types (tinyint, smallint, int, bigint), the column +statistics include the minimum, maximum, and sum. If the sum +overflows long at any point during the calculation, no sum is +recorded.</p> + +<p><code>message IntegerStatistics { + optional sint64 minimum = 1; + optional sint64 maximum = 2; + optional sint64 sum = 3; +} +</code></p> + +<p>For floating point types (float, double), the column statistics +include the minimum, maximum, and sum. 
If the sum overflows a double, +no sum is recorded.</p> + +<p><code>message DoubleStatistics { + optional double minimum = 1; + optional double maximum = 2; + optional double sum = 3; +} +</code></p> + +<p>For strings, the minimum value, maximum value, and the sum of the +lengths of the values are recorded.</p> + +<p><code>message StringStatistics { + optional string minimum = 1; + optional string maximum = 2; + // sum will store the total length of all strings + optional sint64 sum = 3; +} +</code></p> + +<p>For booleans, the statistics include the count of false and true values.</p> + +<p><code>message BucketStatistics { + repeated uint64 count = 1 [packed=true]; +} +</code></p> + +<p>For decimals, the minimum, maximum, and sum are stored.</p> + +<p><code>message DecimalStatistics { + optional string minimum = 1; + optional string maximum = 2; + optional string sum = 3; +} +</code></p> + +<p>Date columns record the minimum and maximum values as the number of +days since the UNIX epoch (1/1/1970).</p> + +<p><code>message DateStatistics { + // min,max values saved as days since epoch + optional sint32 minimum = 1; + optional sint32 maximum = 2; +} +</code></p> + +<p>Timestamp columns record the minimum and maximum values as the number of +milliseconds since the UNIX epoch (1/1/1970).</p> + +<p><code>message TimestampStatistics { + // min,max values saved as milliseconds since epoch + optional sint64 minimum = 1; + optional sint64 maximum = 2; +} +</code></p> + +<p>Binary columns store the aggregate number of bytes across all of the values.</p> + +<p><code>message BinaryStatistics { + // sum will store the total binary blob length + optional sint64 sum = 1; +} +</code></p> + +<h2 id="user-metadata">User Metadata</h2> + +<p>The user can add arbitrary key/value pairs to an ORC file as it is +written. The contents of the keys and values are completely +application defined, but the key is a string and the value is +binary. 
Care should be taken by applications to make sure that their +keys are unique and in general should be prefixed with an organization +code.</p> + +<p><code>message UserMetadataItem { + // the user defined key + required string name = 1; + // the user defined binary value + required bytes value = 2; +} +</code></p> + +<h2 id="file-metadata">File Metadata</h2> + +<p>The file Metadata section contains column statistics at the stripe +level granularity. These statistics enable input split elimination +based on the predicate push-down evaluated per a stripe.</p> + +<p><code>message StripeStatistics { + repeated ColumnStatistics colStats = 1; +} +</code></p> + +<p><code>message Metadata { + repeated StripeStatistics stripeStats = 1; +} +</code></p> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <div class="section-nav"> + <div class="left align-right"> + + + + <a href="/docs/spec-intro.html" class="prev">Back</a> + + </div> + <div class="right align-left"> + + + + <a href="/docs/compression.html" class="next">Next</a> + + </div> + </div> + <div class="clear"></div> + + + </article> + </div> + + <div class="unit one-fifth hide-on-mobiles"> + <aside> + + <h4>Overview</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/index.html">Background</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/types.html">Types</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/indexes.html">Indexes</a></li> + + + + + + + + + + + + <li class=""><a href="/docs/acid.html">ACID support</a></li> + + + +</ul> + + + <h4>Hive Usage</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li> + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/hive-config.html">Hive Configuration</a></li> + + + +</ul> + + + <h4>Format Specification</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + <li class=""><a href="/docs/spec-intro.html">Introduction</a></li> + + + + + + + + + + + + + + + + + + <li class="current"><a href="/docs/file-tail.html">File Tail</a></li> + + + + + + + + + + + + + + <li class=""><a href="/docs/compression.html">Compression</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/run-length.html">Run Length Encoding</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/stripes.html">Stripes</a></li> + + + + + + + + + + + + + + + + <li class=""><a href="/docs/encodings.html">Column Encodings</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/spec-index.html">Indexes</a></li> + + + +</ul> + + + </aside> +</div> + + + <div class="clear"></div> + + </div> + </section> + + + <footer role="contentinfo"> + <p>The contents of this website are © 2015 + <a href="https://www.apache.org/">Apache Software Foundation</a> + under the terms of the <a + href="https://www.apache.org/licenses/LICENSE-2.0.html"> + Apache License v2</a>. 
Apache ORC and its logo are trademarks + of the Apache Software Foundation.</p> +</footer> + + <script> + var anchorForId = function (id) { + var anchor = document.createElement("a"); + anchor.className = "header-link"; + anchor.href = "#" + id; + anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\"></i>"; + anchor.title = "Permalink"; + return anchor; + }; + + var linkifyAnchors = function (level, containingElement) { + var headers = containingElement.getElementsByTagName("h" + level); + for (var h = 0; h < headers.length; h++) { + var header = headers[h]; + + if (typeof header.id !== "undefined" && header.id !== "") { + header.appendChild(anchorForId(header.id)); + } + } + }; + + document.onreadystatechange = function () { + if (this.readyState === "complete") { + var contentBlock = document.getElementsByClassName("docs")[0] || document.getElementsByClassName("news")[0]; + if (!contentBlock) { + return; + } + for (var level = 1; level <= 6; level++) { + linkifyAnchors(level, contentBlock); + } + } + }; +</script> + + +</body> +</html> http://git-wip-us.apache.org/repos/asf/orc/blob/6a400548/docs/hive-config.html ---------------------------------------------------------------------- diff --git a/docs/hive-config.html b/docs/hive-config.html new file mode 100644 index 0000000..3bd72e4 --- /dev/null +++ b/docs/hive-config.html @@ -0,0 +1,1184 @@ +<!DOCTYPE HTML> +<html lang="en-US"> +<head> + <meta charset="UTF-8"> + <title>Hive Configuration</title> + <meta name="viewport" content="width=device-width,initial-scale=1"> + <meta name="generator" content="Jekyll v2.4.0"> + <link rel="stylesheet" href="//fonts.googleapis.com/css?family=Lato:300,300italic,400,400italic,700,700italic,900"> + <link rel="stylesheet" href="/css/screen.css"> + <link rel="icon" type="image/x-icon" href="/favicon.ico"> + <!--[if lt IE 9]> + <script src="/js/html5shiv.min.js"></script> + <script src="/js/respond.min.js"></script> + <![endif]--> +</head> + + +<body 
class="wrap"> + <header role="banner"> + <nav class="mobile-nav show-on-mobiles"> + <ul> + <li class=""> + <a href="/">Home</a> + </li> + <li class="current"> + <a href="/docs/">Documentation</a> + </li> + <li class=""> + <a href="/talks/">Talks</a> + </li> + <li class=""> + <a href="/news/">News</a> + </li> + <li class=""> + <a href="/help/">Help</a> + </li> + <li class=""> + <a href="/develop/">Develop</a> + </li> +</ul> + + </nav> + <div class="grid"> + <div class="unit one-third center-on-mobiles"> + <h1> + <a href="/"> + <span class="sr-only">Apache ORC</span> + <img src="/img/logo.png" width="249" height="115" alt="ORC Logo"> + </a> + </h1> + </div> + <nav class="main-nav unit two-thirds hide-on-mobiles"> + <ul> + <li class=""> + <a href="/">Home</a> + </li> + <li class="current"> + <a href="/docs/">Documentation</a> + </li> + <li class=""> + <a href="/talks/">Talks</a> + </li> + <li class=""> + <a href="/news/">News</a> + </li> + <li class=""> + <a href="/help/">Help</a> + </li> + <li class=""> + <a href="/develop/">Develop</a> + </li> +</ul> + + </nav> + </div> +</header> + + + <section class="docs"> + <div class="grid"> + + <div class="docs-nav-mobile unit whole show-on-mobiles"> + <select onchange="if (this.value) window.location.href=this.value"> + <option value="">Navigate the docsâ¦</option> + + <optgroup label="Overview"> + + + + + + + + + + + + + + + + + + + + <option value="/docs/index.html">Background</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/types.html">Types</option> + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/indexes.html">Indexes</option> + + + + + + + + + + + + + + + + + + <option value="/docs/acid.html">ACID support</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + </optgroup> + + <optgroup label="Hive Usage"> + + + + + + + + + + + + + + + + + + <option value="/docs/hive-ddl.html">Hive DDL</option> + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + <option value="/docs/hive-config.html">Hive Configuration</option> + + + + + + + + + + + + + + + + + + + + + </optgroup> + + <optgroup label="Format Specification"> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/spec-intro.html">Introduction</option> + + + + + + + + + + + + + + + + + + <option value="/docs/file-tail.html">File Tail</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/compression.html">Compression</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/run-length.html">Run Length Encoding</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/stripes.html">Stripes</option> + + + + + + + + + + + + + + <option value="/docs/encodings.html">Column Encodings</option> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <option value="/docs/spec-index.html">Indexes</option> + + + + + + + + + + + </optgroup> + + </select> +</div> + + + <div class="unit four-fifths"> + <article> + <h1>Hive Configuration</h1> + <h2 id="table-properties">Table properties</h2> + +<p>Tables stored as ORC files use table properties to control their behavior. 
By +using table properties, the table owner ensures that all clients store data +with the same options.</p> + +<table> + <thead> + <tr> + <th style="text-align: left">Key</th> + <th style="text-align: left">Default</th> + <th style="text-align: left">Notes</th> + </tr> + </thead> + <tbody> + <tr> + <td style="text-align: left">orc.compress</td> + <td style="text-align: left">ZLIB</td> + <td style="text-align: left">high level compression = {NONE, ZLIB, SNAPPY}</td> + </tr> + <tr> + <td style="text-align: left">orc.compress.size</td> + <td style="text-align: left">262,144</td> + <td style="text-align: left">compression chunk size</td> + </tr> + <tr> + <td style="text-align: left">orc.stripe.size</td> + <td style="text-align: left">268,435,456</td> + <td style="text-align: left">memory buffer size in bytes for writing</td> + </tr> + <tr> + <td style="text-align: left">orc.row.index.stride</td> + <td style="text-align: left">10,000</td> + <td style="text-align: left">number of rows between index entries</td> + </tr> + <tr> + <td style="text-align: left">orc.create.index</td> + <td style="text-align: left">true</td> + <td style="text-align: left">create indexes?</td> + </tr> + <tr> + <td style="text-align: left">orc.bloom.filter.columns</td> + <td style="text-align: left">“”</td> + <td style="text-align: left">comma separated list of column names</td> + </tr> + <tr> + <td style="text-align: left">orc.bloom.filter.fpp</td> + <td style="text-align: left">0.05</td> + <td style="text-align: left">bloom filter false positive rate</td> + </tr> + </tbody> +</table> + +<p>For example, to create an ORC table without high level compression:</p> + +<p><code>CREATE TABLE istari ( + name STRING, + color STRING +) STORED AS ORC TBLPROPERTIES ("orc.compress"="NONE"); +</code></p> + +<h2 id="configuration-properties">Configuration properties</h2> + +<p>There are many Hive configuration properties related to ORC files:</p> + +<table class="configtable"> +<tr> + <th>Key</th> + 
<th>Default</th> + <th>Notes</th> +</tr> +<tr> + <td>hive.default.fileformat</td> + <td>TextFile</td> + <td>This is the default file format for new tables. If it is set to ORC, + new tables will default to ORC.</td> +</tr> +<tr> + <td>hive.stats.gather.num.threads</td> + <td>10</td> + <td>Number of threads used by partialscan/noscan analyze command for + partitioned tables. This is applicable only for file formats that + implement the StatsProvidingRecordReader interface (like ORC).</td> +</tr> +<tr> + <td>hive.exec.orc.memory.pool</td> + <td>0.5</td> + <td>Maximum fraction of heap that can be used by ORC file writers.</td> +</tr> +<tr> + <td>hive.exec.orc.write.format</td> + <td>NULL</td> + <td>Define the version of the file to write. Possible values are 0.11 and + 0.12. If this parameter is not defined, ORC will use the latest + version.</td> +</tr> +<tr> + <td>hive.exec.orc.default.stripe.size</td> + <td>67,108,864</td> + <td>Define the default size of ORC writer buffers in bytes.</td> +</tr> +<tr> + <td>hive.exec.orc.default.block.size</td> + <td>268,435,456</td> + <td>Define the default file system block size for ORC files.</td> +</tr> +<tr> + <td>hive.exec.orc.dictionary.key.size.threshold</td> + <td>0.8</td> + <td>If the number of keys in a dictionary is greater than this + fraction of the total number of non-null rows, turn off + dictionary encoding. 
Use 1.0 to always use dictionary encoding.</td> +</tr> +<tr> + <td>hive.exec.orc.default.row.index.stride</td> + <td>10,000</td> + <td>Define the default number of rows between row index entries.</td> +</tr> +<tr> + <td>hive.exec.orc.default.buffer.size</td> + <td>262,144</td> + <td>Define the default ORC buffer size, in bytes.</td> +</tr> +<tr> + <td>hive.exec.orc.default.block.padding</td> + <td>true</td> + <td>Should ORC file writers pad stripes to minimize stripes that cross HDFS + block boundaries.</td> +</tr> +<tr> + <td>hive.exec.orc.block.padding.tolerance</td> + <td>0.05</td> + <td>Define the tolerance for block padding as a decimal fraction of + stripe size (for example, the default value 0.05 is 5% of the + stripe size). For the defaults of 64MB ORC stripe and 256MB HDFS + blocks, a maximum of 3.2MB will be reserved for padding within + the 256MB block with the default + hive.exec.orc.block.padding.tolerance. In that case, if the + available size within the block is more than 3.2MB, a new + smaller stripe will be inserted to fit within that space. This + will make sure that no stripe written will cross block + boundaries and cause remote reads within a node local task.</td> +</tr> +<tr> + <td>hive.exec.orc.default.compress</td> + <td>ZLIB</td> + <td>Define the default compression codec for ORC file.</td> +</tr> +<tr> + <td>hive.exec.orc.encoding.strategy</td> + <td>SPEED</td> + <td>Define the encoding strategy to use while writing data. Changing + this will only affect the light weight encoding for + integers. This flag will not change the compression level of + higher level compression codec (like ZLIB). Possible options are + SPEED and COMPRESSION.</td> +</tr> +<tr> + <td>hive.orc.splits.include.file.footer</td> + <td>false</td> + <td>If turned on, splits generated by ORC will include metadata + about the stripes in the file. 
This data is read remotely (from + the client or HiveServer2 machine) and sent to all the tasks.</td> +</tr> +<tr> + <td>hive.orc.cache.stripe.details.size</td> + <td>10,000</td> + <td>Cache size for keeping meta information about ORC splits cached in the + client.</td> +</tr> +<tr> + <td>hive.orc.compute.splits.num.threads</td> + <td>10</td> + <td>How many threads ORC should use to create splits in parallel.</td> +</tr> +<tr> + <td>hive.exec.orc.skip.corrupt.data</td> + <td>false</td> + <td>If ORC reader encounters corrupt data, this value will be used + to determine whether to skip the corrupt data or throw an + exception. The default behavior is to throw an exception.</td> +</tr> +<tr> + <td>hive.exec.orc.zerocopy</td> + <td>false</td> + <td>Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)</td> +</tr> +<tr> + <td>hive.merge.orcfile.stripe.level</td> + <td>true</td> + <td>When hive.merge.mapfiles, hive.merge.mapredfiles or + hive.merge.tezfiles is enabled while writing a table with ORC + file format, enabling this configuration property will do + stripe-level fast merge for small ORC files. Note that enabling + this configuration property will not honor the padding tolerance + configuration (hive.exec.orc.block.padding.tolerance).</td> +</tr> +<tr> + <td>hive.orc.row.index.stride.dictionary.check</td> + <td>true</td> + <td>If enabled dictionary check will happen after first row index stride + (default 10000 rows) else dictionary check will happen before writing + first stripe. In both cases, the decision to use dictionary or not will + be retained thereafter.</td> +</tr> +<tr> + <td>hive.exec.orc.compression.strategy</td> + <td>SPEED</td> + <td>Define the compression strategy to use while writing data. This changes + the compression level of higher level compression codec. 
Value can be + SPEED or COMPRESSION.</td> +</tr> + +</td></tr></table> + + + + + + + + + + + + + + + + + + + + + + + + + <div class="section-nav"> + <div class="left align-right"> + + + + <a href="/docs/hive-ddl.html" class="prev">Back</a> + + </div> + <div class="right align-left"> + + + + <a href="/docs/spec-intro.html" class="next">Next</a> + + </div> + </div> + <div class="clear"></div> + + + </article> + </div> + + <div class="unit one-fifth hide-on-mobiles"> + <aside> + + <h4>Overview</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/index.html">Background</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/types.html">Types</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/indexes.html">Indexes</a></li> + + + + + + + + + + + + <li class=""><a href="/docs/acid.html">ACID support</a></li> + + + +</ul> + + + <h4>Hive Usage</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/hive-ddl.html">Hive DDL</a></li> + + + + + + + + + + + + + + + + + + + + <li class="current"><a href="/docs/hive-config.html">Hive Configuration</a></li> + + + +</ul> + + + <h4>Format Specification</h4> + + +<ul> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/spec-intro.html">Introduction</a></li> + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/file-tail.html">File Tail</a></li> + + + + + + + + + + + + + + <li class=""><a href="/docs/compression.html">Compression</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/run-length.html">Run Length Encoding</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + <li class=""><a href="/docs/stripes.html">Stripes</a></li> + + + + + + + + + + + + + + + + <li class=""><a href="/docs/encodings.html">Column Encodings</a></li> + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + <li class=""><a href="/docs/spec-index.html">Indexes</a></li> + + + +</ul> + + + </aside> +</div> + + + <div class="clear"></div> + + </div> + </section> + + + <footer role="contentinfo"> + <p>The contents of this website are © 2015 + <a href="https://www.apache.org/">Apache Software Foundation</a> + under the terms of the <a + href="https://www.apache.org/licenses/LICENSE-2.0.html"> + Apache License v2</a>. Apache ORC and its logo are trademarks + of the Apache Software Foundation.</p> +</footer> + + <script> + var anchorForId = function (id) { + var anchor = document.createElement("a"); + anchor.className = "header-link"; + anchor.href = "#" + id; + anchor.innerHTML = "<span class=\"sr-only\">Permalink</span><i class=\"fa fa-link\"></i>"; + anchor.title = "Permalink"; + return anchor; + }; + + var linkifyAnchors = function (level, containingElement) { + var headers = containingElement.getElementsByTagName("h" + level); + for (var h = 0; h < headers.length; h++) { + var header = headers[h]; + + if (typeof header.id !== "undefined" && header.id !== "") { + header.appendChild(anchorForId(header.id)); + } + } + }; + + document.onreadystatechange = function () { + if (this.readyState === "complete") { + var contentBlock = document.getElementsByClassName("docs")[0] || document.getElementsByClassName("news")[0]; + if (!contentBlock) { + return; + } + for (var level = 1; level <= 6; level++) { + linkifyAnchors(level, contentBlock); + } + } + }; +</script> + + +</body> +</html>
