[incubator-datasketches-website] branch asf-site updated: Automatic Site Publish by Buildbot

git-site-role Sun, 15 Mar 2020 14:31:27 -0700

This is an automated email from the ASF dual-hosted git repository.

git-site-role pushed a commit to branch asf-site
in repository 
https://gitbox.apache.org/repos/asf/incubator-datasketches-website.git



The following commit(s) were added to refs/heads/asf-site by this push:
     new 53e668f  Automatic Site Publish by Buildbot
53e668f is described below

commit 53e668fa78e5753511adfa1b8f3dcd292339a985
Author: buildbot <[email protected]>
AuthorDate: Sun Mar 15 21:31:01 2020 +0000

    Automatic Site Publish by Buildbot
---
 .../ThetaHiveUDFs.html => CPC/CpcCppExample.html}  | 111 ++++++++--------
 .../ThetaHiveUDFs.html => CPC/CpcHiveExample.html} |  47 ++++---
 .../ThetaHiveUDFs.html => CPC/CpcJavaExample.html} | 143 +++++++++++----------
 .../ThetaPigUDFs.html => CPC/CpcPigExample.html}   |  72 +++--------
 output/docs/HLL/HllHiveUDFs.html                   |   2 -
 output/docs/Theta/ThetaHiveUDFs.html               |   2 -
 output/docs/Theta/ThetaPigUDFs.html                |   3 +-
 7 files changed, 173 insertions(+), 207 deletions(-)

diff --git a/output/docs/Theta/ThetaHiveUDFs.html 
b/output/docs/CPC/CpcCppExample.html
similarity index 88%
copy from output/docs/Theta/ThetaHiveUDFs.html
copy to output/docs/CPC/CpcCppExample.html
index 321fd4d..e7471cf 100644
--- a/output/docs/Theta/ThetaHiveUDFs.html
+++ b/output/docs/CPC/CpcCppExample.html
@@ -476,77 +476,72 @@
     specific language governing permissions and limitations
     under the License.
 -->
-<h2 id="hadoop-hive-udfs">Hadoop Hive UDFs</h2>
+<h1 id="cpc-sketch-c-example">CPC Sketch C++ Example</h1>
 
-<p>Depends on sketches-core.</p>
+<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>#include &lt;iostream&gt;
+#include &lt;fstream&gt;
 
-<h3 id="building-sketches-merging-sketches-and-getting-estimates">Building 
sketches, merging sketches and getting estimates</h3>
+#include &lt;cpc_sketch.hpp&gt;
+#include &lt;cpc_union.hpp&gt;
 
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
-add jar datasketches-java-1.2.0-incubating.jar;
-add jar datasketches-hive-1.0.0-incubating.jar;
+//simplified file operations and no error handling for clarity
+int main(int argc, char **argv) {
+  const int lg_k = 10;
 
-create temporary function data2sketch as 
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function unionSketches as 
'org.apache.datasketches.hive.theta.UnionSketchUDAF';
-create temporary function estimate as 
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
+  // this section generates two sketches with some overlap and serializes them 
into files
+  {
+    // 100000 distinct keys
+    datasketches::cpc_sketch sketch1(lg_k);
+    for (int key = 0; key &lt; 100000; key++) sketch1.update(key);
+    std::ofstream os1("cpc_sketch1.bin");
+    sketch1.serialize(os1);
 
-use &lt;your-db-name-here&gt;;
+    // 100000 distinct keys
+    datasketches::cpc_sketch sketch2(lg_k);
+    for (int key = 50000; key &lt; 150000; key++) sketch2.update(key);
+    std::ofstream os2("cpc_sketch2.bin");
+    sketch2.serialize(os2);
+  }
 
-create temporary table theta_input (id int, category char(1));
-insert into table theta_input values
-  (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 
'a'), (9, 'a'), (10, 'a'),
-  (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), 
(13, 'b'), (14, 'b'), (15, 'b');
+  // this section deserializes the sketches, produces a union and prints the 
result
+  {
+    std::ifstream is1("cpc_sketch1.bin");
+    datasketches::cpc_sketch sketch1 = 
datasketches::cpc_sketch::deserialize(is1);
 
-create temporary table sketch_intermediate (category char(1), sketch binary);
-insert into sketch_intermediate select category, data2sketch(id) from 
theta_input group by category;
+    std::ifstream is2("cpc_sketch2.bin");
+    datasketches::cpc_sketch sketch2 = 
datasketches::cpc_sketch::deserialize(is2);
 
-select category, estimate(sketch) from sketch_intermediate;
+    datasketches::cpc_union u(lg_k);
+    u.update(sketch1);
+    u.update(sketch2);
+    datasketches::cpc_sketch sketch = u.get_result();
 
-Output:
-a      10.0
-b      10.0
-
-select estimate(unionSketches(sketch)) from sketch_intermediate;
-
-Output:
-15.0
-</code></pre></div></div>
-
-<h3 id="set-operations">Set operations</h3>
-
-<p>Notice the difference between UnionUDF in this example, which takes two 
sketches, and UnionUDAF in the previous example, which is an aggregate function 
taking a collection of sketches as one parameter. The same is true about 
IntersectSketchUDF and IntersectSketchUDAF.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
-add jar datasketches-java-1.2.0-incubating.jar;
-add jar datasketches-hive-1.0.0-incubating.jar;
-
-create temporary function data2sketch as 
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function estimate as 
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
-create temporary function union2 as 
'org.apache.datasketches.hive.theta.UnionSketchUDF';
-create temporary function intersect as 
'org.apache.datasketches.hive.theta.IntersectSketchUDF';
-create temporary function anotb as 
'org.apache.datasketches.hive.theta.ExcludeSketchUDF';
-
-use &lt;your-db-nasme-here&gt;;
-
-create temporary table sketch_input (id1 int, id2 int);
-insert into table sketch_input values
-  (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), 
(10, 20);
-
-create temporary table sketch_intermediate (sketch1 binary, sketch2 binary);
+    // debug summary of the union result sketch
+    sketch.to_stream(std::cout);
 
-insert into sketch_intermediate select data2sketch(id1), data2sketch(id2) from 
sketch_input;
+    std::cout &lt;&lt; "Distinct count estimate: " &lt;&lt; 
sketch.get_estimate() &lt;&lt; std::endl;
+    std::cout &lt;&lt; "Distinct count lower bound 95% confidence: " &lt;&lt; 
sketch.get_lower_bound(2) &lt;&lt; std::endl;
+    std::cout &lt;&lt; "Distinct count upper bound 95% confidence: " &lt;&lt; 
sketch.get_upper_bound(2) &lt;&lt; std::endl;
+  }
 
-select
-  estimate(sketch1),
-  estimate(sketch2),
-  estimate(union2(sketch1, sketch2)),
-  estimate((intersect(sketch1, sketch2))),
-  estimate(anotb(sketch1, sketch2)),
-  estimate(anotb(sketch2, sketch1))
-from sketch_intermediate;
+  return 0;
+}
 
 Output:
-10.0   10.0    15.0    5.0     5.0     5.0
+### CPC sketch summary:
+   lg_k           : 10
+   seed hash      : 93cc
+   C              : 7706
+   flavor         : 4
+   merged         : true
+   intresting col : 4
+   table entries  : 27
+   window         : allocated
+   window offset  : 5
+### End sketch summary
+Distinct count estimate: 149797
+Distinct count lower bound 95% confidence: 143416
+Distinct count upper bound 95% confidence: 156397
 </code></pre></div></div>
 
       </div> <!-- End content -->
diff --git a/output/docs/Theta/ThetaHiveUDFs.html 
b/output/docs/CPC/CpcHiveExample.html
similarity index 96%
copy from output/docs/Theta/ThetaHiveUDFs.html
copy to output/docs/CPC/CpcHiveExample.html
index 321fd4d..7219070 100644
--- a/output/docs/Theta/ThetaHiveUDFs.html
+++ b/output/docs/CPC/CpcHiveExample.html
@@ -476,9 +476,7 @@
     specific language governing permissions and limitations
     under the License.
 -->
-<h2 id="hadoop-hive-udfs">Hadoop Hive UDFs</h2>
-
-<p>Depends on sketches-core.</p>
+<h2 id="cpc-sketch-hive-udfs">CPC sketch Hive UDFs</h2>
 
 <h3 id="building-sketches-merging-sketches-and-getting-estimates">Building 
sketches, merging sketches and getting estimates</h3>
 
@@ -486,45 +484,49 @@
 add jar datasketches-java-1.2.0-incubating.jar;
 add jar datasketches-hive-1.0.0-incubating.jar;
 
-create temporary function data2sketch as 
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function unionSketches as 
'org.apache.datasketches.hive.theta.UnionSketchUDAF';
-create temporary function estimate as 
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
+create temporary function data2sketch as 
'org.apache.datasketches.hive.cpc.DataToSketchUDAF';
+create temporary function unionSketches as 
'org.apache.datasketches.hive.cpc.UnionSketchUDAF';
+create temporary function estimate as 
'org.apache.datasketches.hive.cpc.GetEstimateUDF';
+create temporary function estimateAndBounds as 
'org.apache.datasketches.hive.cpc.GetEstimateAndErrorBoundsUDF';
 
 use &lt;your-db-name-here&gt;;
 
-create temporary table theta_input (id int, category char(1));
-insert into table theta_input values
+create temporary table sketch_input (id int, category char(1));
+insert into table sketch_input values
   (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 
'a'), (9, 'a'), (10, 'a'),
   (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), 
(13, 'b'), (14, 'b'), (15, 'b');
 
 create temporary table sketch_intermediate (category char(1), sketch binary);
-insert into sketch_intermediate select category, data2sketch(id) from 
theta_input group by category;
+insert into sketch_intermediate select category, data2sketch(id) from 
sketch_input group by category;
 
 select category, estimate(sketch) from sketch_intermediate;
 
 Output:
-a      10.0
-b      10.0
+a      10.007331400971685
+b      10.007331400971685
 
 select estimate(unionSketches(sketch)) from sketch_intermediate;
 
 Output:
-15.0
+15.017114660336853
+
+select estimateAndBounds(unionSketches(sketch)) from sketch_intermediate;
+
+Output:
+[15.017114660336853,15.0,16.0]
 </code></pre></div></div>
 
-<h3 id="set-operations">Set operations</h3>
+<h3 id="union-of-two-sketches">Union of two sketches</h3>
 
-<p>Notice the difference between UnionUDF in this example, which takes two 
sketches, and UnionUDAF in the previous example, which is an aggregate function 
taking a collection of sketches as one parameter. The same is true about 
IntersectSketchUDF and IntersectSketchUDAF.</p>
+<p>Notice the difference between UnionUDF in this example, which takes two 
sketches, and UnionUDAF in the previous example, which is an aggregate function 
taking a collection of sketches as one parameter.</p>
 
 <div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
 add jar datasketches-java-1.2.0-incubating.jar;
 add jar datasketches-hive-1.0.0-incubating.jar;
 
-create temporary function data2sketch as 
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function estimate as 
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
-create temporary function union2 as 
'org.apache.datasketches.hive.theta.UnionSketchUDF';
-create temporary function intersect as 
'org.apache.datasketches.hive.theta.IntersectSketchUDF';
-create temporary function anotb as 
'org.apache.datasketches.hive.theta.ExcludeSketchUDF';
+create temporary function data2sketch as 
'org.apache.datasketches.hive.cpc.DataToSketchUDAF';
+create temporary function union2 as 
'org.apache.datasketches.hive.cpc.UnionSketchUDF';
+create temporary function estimate as 
'org.apache.datasketches.hive.cpc.GetEstimateUDF';
 
 use &lt;your-db-name-here&gt;;
 
@@ -539,14 +541,11 @@ insert into sketch_intermediate select data2sketch(id1), 
data2sketch(id2) from s
 select
   estimate(sketch1),
   estimate(sketch2),
-  estimate(union2(sketch1, sketch2)),
-  estimate((intersect(sketch1, sketch2))),
-  estimate(anotb(sketch1, sketch2)),
-  estimate(anotb(sketch2, sketch1))
+  estimate(union2(sketch1, sketch2))
 from sketch_intermediate;
 
 Output:
-10.0   10.0    15.0    5.0     5.0     5.0
+10.007331400971685     10.007331400971685      15.017114660336853
 </code></pre></div></div>
 
       </div> <!-- End content -->
diff --git a/output/docs/Theta/ThetaHiveUDFs.html 
b/output/docs/CPC/CpcJavaExample.html
similarity index 88%
copy from output/docs/Theta/ThetaHiveUDFs.html
copy to output/docs/CPC/CpcJavaExample.html
index 321fd4d..82c4e20 100644
--- a/output/docs/Theta/ThetaHiveUDFs.html
+++ b/output/docs/CPC/CpcJavaExample.html
@@ -476,77 +476,86 @@
     specific language governing permissions and limitations
     under the License.
 -->
-<h2 id="hadoop-hive-udfs">Hadoop Hive UDFs</h2>
-
-<p>Depends on sketches-core.</p>
-
-<h3 id="building-sketches-merging-sketches-and-getting-estimates">Building 
sketches, merging sketches and getting estimates</h3>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
-add jar datasketches-java-1.2.0-incubating.jar;
-add jar datasketches-hive-1.0.0-incubating.jar;
-
-create temporary function data2sketch as 
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function unionSketches as 
'org.apache.datasketches.hive.theta.UnionSketchUDAF';
-create temporary function estimate as 
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
-
-use &lt;your-db-name-here&gt;;
-
-create temporary table theta_input (id int, category char(1));
-insert into table theta_input values
-  (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 
'a'), (9, 'a'), (10, 'a'),
-  (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), 
(13, 'b'), (14, 'b'), (15, 'b');
-
-create temporary table sketch_intermediate (category char(1), sketch binary);
-insert into sketch_intermediate select category, data2sketch(id) from 
theta_input group by category;
-
-select category, estimate(sketch) from sketch_intermediate;
-
-Output:
-a      10.0
-b      10.0
-
-select estimate(unionSketches(sketch)) from sketch_intermediate;
-
-Output:
-15.0
-</code></pre></div></div>
-
-<h3 id="set-operations">Set operations</h3>
-
-<p>Notice the difference between UnionUDF in this example, which takes two 
sketches, and UnionUDAF in the previous example, which is an aggregate function 
taking a collection of sketches as one parameter. The same is true about 
IntersectSketchUDF and IntersectSketchUDAF.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
-add jar datasketches-java-1.2.0-incubating.jar;
-add jar datasketches-hive-1.0.0-incubating.jar;
-
-create temporary function data2sketch as 
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function estimate as 
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
-create temporary function union2 as 
'org.apache.datasketches.hive.theta.UnionSketchUDF';
-create temporary function intersect as 
'org.apache.datasketches.hive.theta.IntersectSketchUDF';
-create temporary function anotb as 
'org.apache.datasketches.hive.theta.ExcludeSketchUDF';
-
-use &lt;your-db-nasme-here&gt;;
-
-create temporary table sketch_input (id1 int, id2 int);
-insert into table sketch_input values
-  (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), 
(10, 20);
-
-create temporary table sketch_intermediate (sketch1 binary, sketch2 binary);
+<h1 id="cpc-sketch-java-example">CPC Sketch Java Example</h1>
+
+<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import com.yahoo.memory.Memory;
+import com.yahoo.sketches.cpc.CpcSketch;
+import com.yahoo.sketches.cpc.CpcUnion;
+
+//simplified file operations and no error handling for clarity
+public class CpcExample {
+
+  public static void main(String[] args) throws Exception {
+    final int lgK = 10;
+    // this section generates two sketches with some overlap and serializes 
them into files
+    {
+      // 100000 distinct keys
+      CpcSketch sketch1 = new CpcSketch(lgK);
+      for (int key = 0; key &lt; 100000; key++) sketch1.update(key);
+      FileOutputStream out1 = new FileOutputStream("CpcSketch1.bin");
+      out1.write(sketch1.toByteArray());
+      out1.close();
+
+      // 100000 distinct keys
+      CpcSketch sketch2 = new CpcSketch(lgK);
+      for (int key = 50000; key &lt; 150000; key++) sketch2.update(key);
+      FileOutputStream out2 = new FileOutputStream("CpcSketch2.bin");
+      out2.write(sketch2.toByteArray());
+      out2.close();
+    }
 
-insert into sketch_intermediate select data2sketch(id1), data2sketch(id2) from 
sketch_input;
+    // this section deserializes the sketches, produces a union and prints the 
result
+    {
+      FileInputStream in1 = new FileInputStream("CpcSketch1.bin");
+      byte[] bytes1 = new byte[in1.available()];
+      in1.read(bytes1);
+      in1.close();
+      CpcSketch sketch1 = CpcSketch.heapify(Memory.wrap(bytes1));
+
+      FileInputStream in2 = new FileInputStream("CpcSketch2.bin");
+      byte[] bytes2 = new byte[in2.available()];
+      in2.read(bytes2);
+      in2.close();
+      CpcSketch sketch2 = CpcSketch.heapify(Memory.wrap(bytes2));
+
+      CpcUnion union = new CpcUnion(lgK);
+      union.update(sketch1);
+      union.update(sketch2);
+      CpcSketch result = union.getResult();
+
+      // debug summary of the union result sketch
+      System.out.println(result.toString());
+
+      System.out.println("Distinct count estimate: " + result.getEstimate());
+      System.out.println("Distinct count lower bound 95% confidence: " + 
result.getLowerBound(2));
+      System.out.println("Distinct count upper bound 95% confidence: " + 
result.getUpperBound(2));
+    }
+  }
 
-select
-  estimate(sketch1),
-  estimate(sketch2),
-  estimate(union2(sketch1, sketch2)),
-  estimate((intersect(sketch1, sketch2))),
-  estimate(anotb(sketch1, sketch2)),
-  estimate(anotb(sketch2, sketch1))
-from sketch_intermediate;
+}
 
 Output:
-10.0   10.0    15.0    5.0     5.0     5.0
+### CPD SKETCH - PREAMBLE:
+  Flavor         : SLIDING
+  LgK            : 10
+  Merge Flag     : true
+  Error Const    : 0.6931471805599453
+  RSE            : 0.02166084939249829
+  Seed Hash      : 93cc | 37836
+  Num Coupons    : 7706
+  Num Pairs (SV) : 27
+  First Inter Col: 4
+  Valid Window   : true
+  Valid PairTable: true
+  Window Offset  : 5
+  KxP            : 1024.0
+  HIP Accum      : 0.0
+### END CPC SKETCH
+Distinct count estimate: 149796.50599220005
+Distinct count lower bound 95% confidence: 143416.2744812169
+Distinct count upper bound 95% confidence: 156397.0
 </code></pre></div></div>
 
       </div> <!-- End content -->
diff --git a/output/docs/Theta/ThetaPigUDFs.html 
b/output/docs/CPC/CpcPigExample.html
similarity index 90%
copy from output/docs/Theta/ThetaPigUDFs.html
copy to output/docs/CPC/CpcPigExample.html
index 3e8ac49..a9bf523 100644
--- a/output/docs/Theta/ThetaPigUDFs.html
+++ b/output/docs/CPC/CpcPigExample.html
@@ -476,32 +476,34 @@
     specific language governing permissions and limitations
     under the License.
 -->
-<h2 id="theta-sketch-pig-udfs">Theta Sketch Pig UDFs</h2>
+<h2 id="cpc-sketch-pig-udfs">CPC Sketch Pig UDFs</h2>
 
 <h3 id="instructions">Instructions</h3>
 
 <ul>
   <li>get jars</li>
-  <li>save the following script as theta.pig</li>
+  <li>save the following script as cpc.pig</li>
   <li>adjust jar versions and paths if necessary</li>
   <li>save the below data into a file called “data.txt”</li>
   <li>copy data to hdfs: “hdfs dfs -copyFromLocal data.txt”</li>
-  <li>run pig script: “pig theta.pig”</li>
+  <li>run pig script: “pig cpc.pig”</li>
 </ul>
 
-<h3 
id="thetapig-script-building-sketches-merging-sketches-and-getting-estimates">theta.pig
 script: building sketches, merging sketches and getting estimates</h3>
+<h3 
id="cpcpig-script-building-sketches-merging-sketches-and-getting-estimates">cpc.pig
 script: building sketches, merging sketches and getting estimates</h3>
 
 <div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>register datasketches-memory-1.2.0-incubating.jar;
 register datasketches-java-1.2.0-incubating.jar;
 register datasketches-pig-1.0.0-incubating.jar;
 
-define dataToSketch org.apache.datasketches.pig.theta.DataToSketch('32');
-define unionSketch org.apache.datasketches.pig.theta.Union('32');
-define getEstimate org.apache.datasketches.pig.theta.Estimate();
+define dataToSketch org.apache.datasketches.pig.cpc.DataToSketch('12');       
+define unionSketch org.apache.datasketches.pig.cpc.UnionSketch('12');       
+define getEstimate org.apache.datasketches.pig.cpc.GetEstimate();     
+define getEstimateAndBounds 
org.apache.datasketches.pig.cpc.GetEstimateAndErrorBounds('3');       
+define toString org.apache.datasketches.pig.cpc.SketchToString();              
 
 a = load 'data.txt' as (id, category);
 b = group a by category;
-c = foreach b generate flatten(group) as (category), 
flatten(dataToSketch(a.id)) as (sketch);
+c = foreach b generate flatten(group) as (category), dataToSketch(a.id) as 
sketch;
 -- Sketches can be stored at this point in binary format to be used later:
 -- store c into 'intermediate/$date' using BinStorage();
 -- The next two lines print the results in human readable form for the purpose 
of this example
@@ -511,11 +513,13 @@ dump d;
 -- This can be a separate query
 -- For example, the first part can produce a daily intermediate feed and store 
it,
 -- and this part can load several instances of this daily intermediate feed 
and merge them
--- c = load 'intermediate/$date1,intermediate/$date2' using BinStorage() as 
(category, sketch);
 e = group c all;
-f = foreach e generate flatten(unionSketch(c.sketch)) as (sketch);
+f = foreach e generate unionSketch(c.sketch) as sketch;
 g = foreach f generate getEstimate(sketch);
 dump g;
+
+h = foreach f generate flatten(getEstimateAndBounds(sketch)) as (estimate, lb, 
ub);
+dump h;
 </code></pre></div></div>
 
 <h3 id="datatxt-tab-separated"><a href="/docs/Theta/data.txt">data.txt</a> 
(tab separated)</h3>
@@ -527,56 +531,18 @@ Most of the IDs in these categories overlap, so that 
there are 60 unique IDs in
 <p>Results:
 From ‘dump d’:</p>
 
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>(a,46.91487058420659)
-(b,46.23988568048073)
+<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>(a,50.09992602861082)
+(b,50.09992602861082)
 </code></pre></div></div>
 
 <p>From ‘dump g’ (merged across categories):</p>
 
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>(50.415577215639736)
-</code></pre></div></div>
-
-<p>The expected exact result would be (60.0). The estimate has high relative 
error because the sketch was configured with only 32 nominal entries.</p>
-
-<h3 id="theta_setopspig-script-set-operations-on-sketches">theta_setops.pig 
script: set operations on sketches</h3>
-
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>register datasketches-memory-1.2.0-incubating.jar;
-register datasketches-java-1.2.0-incubating.jar;
-register datasketches-pig-1.0.0-incubating.jar;
-
-define dataToSketch org.apache.datasketches.pig.theta.DataToSketch('32');
-define unionSketch org.apache.datasketches.pig.theta.Union();
-define intersect org.apache.datasketches.pig.theta.Intersect();
-define anotb org.apache.datasketches.pig.theta.AexcludeB();
-define estimate org.apache.datasketches.pig.theta.Estimate();
-
-a = load 'setops_data.txt' as (id1, id2);
-b = group a all;
-c = foreach b generate
-  flatten(dataToSketch(a.id1)) as (sketch1),
-  flatten(dataToSketch(a.id2)) as (sketch2);
-d = foreach c generate
-  sketch1, -- pass sketches through to have all estimates in one place 
-  sketch2,
-  flatten(unionSketch(TOBAG(sketch1, sketch2))) as (a_union_b),
-  flatten(intersect(TOBAG(sketch1, sketch2))) as (a_intersect_b),
-  flatten(anotb(sketch1, sketch2)) as (a_not_b),
-  flatten(anotb(sketch2, sketch1)) as (b_not_a);
-e = foreach d generate
-  estimate(sketch1),
-  estimate(sketch2),
-  estimate(a_union_b),
-  estimate(a_intersect_b),
-  estimate(a_not_b),
-  estimate(b_not_a);
-dump e;
+<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>(60.14445031168714)
 </code></pre></div></div>
 
-<h3 id="setops_datatxt-tab-separated"><a 
href="/docs/Theta/setops_data.txt">setops_data.txt</a> (tab separated)</h3>
-
-<p>Result:</p>
+<p>From ‘dump h’ (with error bounds, 99% confidence interval):</p>
 
-<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>(10.0,12.0,18.0,4.0,6.0,8.0)
+<div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>(60.14445031168714,60.0,63.0)
 </code></pre></div></div>
 
       </div> <!-- End content -->
diff --git a/output/docs/HLL/HllHiveUDFs.html b/output/docs/HLL/HllHiveUDFs.html
index 5d71a49..1fe64d6 100644
--- a/output/docs/HLL/HllHiveUDFs.html
+++ b/output/docs/HLL/HllHiveUDFs.html
@@ -478,8 +478,6 @@
 -->
 <h2 id="hyper-log-log-sketch-hive-udfs">Hyper Log Log sketch Hive UDFs</h2>
 
-<p>This functionality appeared in sketches-hive-0.10.1. Depends on 
sketches-core-0.10.0 and memory-0.10.2.</p>
-
 <h3 id="building-sketches-computing-unions-and-getting-estimates">Building 
sketches, computing unions and getting estimates</h3>
 
 <div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
diff --git a/output/docs/Theta/ThetaHiveUDFs.html 
b/output/docs/Theta/ThetaHiveUDFs.html
index 321fd4d..aaaa82a 100644
--- a/output/docs/Theta/ThetaHiveUDFs.html
+++ b/output/docs/Theta/ThetaHiveUDFs.html
@@ -478,8 +478,6 @@
 -->
 <h2 id="hadoop-hive-udfs">Hadoop Hive UDFs</h2>
 
-<p>Depends on sketches-core.</p>
-
 <h3 id="building-sketches-merging-sketches-and-getting-estimates">Building 
sketches, merging sketches and getting estimates</h3>
 
 <div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
diff --git a/output/docs/Theta/ThetaPigUDFs.html 
b/output/docs/Theta/ThetaPigUDFs.html
index 3e8ac49..7c3980d 100644
--- a/output/docs/Theta/ThetaPigUDFs.html
+++ b/output/docs/Theta/ThetaPigUDFs.html
@@ -536,7 +536,8 @@ From ‘dump d’:</p>
 <div class="highlighter-rouge"><div class="highlight"><pre 
class="highlight"><code>(50.415577215639736)
 </code></pre></div></div>
 
-<p>The expected exact result would be (60.0). The estimate has high relative 
error because the sketch was configured with only 32 nominal entries.</p>
+<p>The expected exact result would be (60.0). The estimate has high relative 
error because the sketch was configured with only 32 nominal entries
+to show the estimation mode for the purposes of this example.</p>
 
 <h3 id="theta_setopspig-script-set-operations-on-sketches">theta_setops.pig 
script: set operations on sketches</h3>
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[incubator-datasketches-website] branch asf-site updated: Automatic Site Publish by Buildbot

Reply via email to