This is an automated email from the ASF dual-hosted git repository.
git-site-role pushed a commit to branch asf-site
in repository
https://gitbox.apache.org/repos/asf/incubator-datasketches-website.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 53e668f Automatic Site Publish by Buildbot
53e668f is described below
commit 53e668fa78e5753511adfa1b8f3dcd292339a985
Author: buildbot <[email protected]>
AuthorDate: Sun Mar 15 21:31:01 2020 +0000
Automatic Site Publish by Buildbot
---
.../ThetaHiveUDFs.html => CPC/CpcCppExample.html} | 111 ++++++++--------
.../ThetaHiveUDFs.html => CPC/CpcHiveExample.html} | 47 ++++---
.../ThetaHiveUDFs.html => CPC/CpcJavaExample.html} | 143 +++++++++++----------
.../ThetaPigUDFs.html => CPC/CpcPigExample.html} | 72 +++--------
output/docs/HLL/HllHiveUDFs.html | 2 -
output/docs/Theta/ThetaHiveUDFs.html | 2 -
output/docs/Theta/ThetaPigUDFs.html | 3 +-
7 files changed, 173 insertions(+), 207 deletions(-)
diff --git a/output/docs/Theta/ThetaHiveUDFs.html
b/output/docs/CPC/CpcCppExample.html
similarity index 88%
copy from output/docs/Theta/ThetaHiveUDFs.html
copy to output/docs/CPC/CpcCppExample.html
index 321fd4d..e7471cf 100644
--- a/output/docs/Theta/ThetaHiveUDFs.html
+++ b/output/docs/CPC/CpcCppExample.html
@@ -476,77 +476,72 @@
specific language governing permissions and limitations
under the License.
-->
-<h2 id="hadoop-hive-udfs">Hadoop Hive UDFs</h2>
+<h1 id="cpc-sketch-c-example">CPC Sketch C++ Example</h1>
-<p>Depends on sketches-core.</p>
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>#include <iostream>
+#include <fstream>
-<h3 id="building-sketches-merging-sketches-and-getting-estimates">Building
sketches, merging sketches and getting estimates</h3>
+#include <cpc_sketch.hpp>
+#include <cpc_union.hpp>
-<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
-add jar datasketches-java-1.2.0-incubating.jar;
-add jar datasketches-hive-1.0.0-incubating.jar;
+//simplified file operations and no error handling for clarity
+int main(int argc, char **argv) {
+ const int lg_k = 10;
-create temporary function data2sketch as
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function unionSketches as
'org.apache.datasketches.hive.theta.UnionSketchUDAF';
-create temporary function estimate as
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
+ // this section generates two sketches with some overlap and serializes them
into files
+ {
+ // 100000 distinct keys
+ datasketches::cpc_sketch sketch1(lg_k);
+ for (int key = 0; key < 100000; key++) sketch1.update(key);
+ std::ofstream os1("cpc_sketch1.bin");
+ sketch1.serialize(os1);
-use <your-db-name-here>;
+ // 100000 distinct keys
+ datasketches::cpc_sketch sketch2(lg_k);
+ for (int key = 50000; key < 150000; key++) sketch2.update(key);
+ std::ofstream os2("cpc_sketch2.bin");
+ sketch2.serialize(os2);
+ }
-create temporary table theta_input (id int, category char(1));
-insert into table theta_input values
- (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8,
'a'), (9, 'a'), (10, 'a'),
- (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'),
(13, 'b'), (14, 'b'), (15, 'b');
+ // this section deserializes the sketches, produces a union and prints the
result
+ {
+ std::ifstream is1("cpc_sketch1.bin");
+ datasketches::cpc_sketch sketch1 =
datasketches::cpc_sketch::deserialize(is1);
-create temporary table sketch_intermediate (category char(1), sketch binary);
-insert into sketch_intermediate select category, data2sketch(id) from
theta_input group by category;
+ std::ifstream is2("cpc_sketch2.bin");
+ datasketches::cpc_sketch sketch2 =
datasketches::cpc_sketch::deserialize(is2);
-select category, estimate(sketch) from sketch_intermediate;
+ datasketches::cpc_union u(lg_k);
+ u.update(sketch1);
+ u.update(sketch2);
+ datasketches::cpc_sketch sketch = u.get_result();
-Output:
-a 10.0
-b 10.0
-
-select estimate(unionSketches(sketch)) from sketch_intermediate;
-
-Output:
-15.0
-</code></pre></div></div>
-
-<h3 id="set-operations">Set operations</h3>
-
-<p>Notice the difference between UnionUDF in this example, which takes two
sketches, and UnionUDAF in the previous example, which is an aggregate function
taking a collection of sketches as one parameter. The same is true about
IntersectSketchUDF and IntersectSketchUDAF.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
-add jar datasketches-java-1.2.0-incubating.jar;
-add jar datasketches-hive-1.0.0-incubating.jar;
-
-create temporary function data2sketch as
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function estimate as
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
-create temporary function union2 as
'org.apache.datasketches.hive.theta.UnionSketchUDF';
-create temporary function intersect as
'org.apache.datasketches.hive.theta.IntersectSketchUDF';
-create temporary function anotb as
'org.apache.datasketches.hive.theta.ExcludeSketchUDF';
-
-use <your-db-nasme-here>;
-
-create temporary table sketch_input (id1 int, id2 int);
-insert into table sketch_input values
- (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18),
(10, 20);
-
-create temporary table sketch_intermediate (sketch1 binary, sketch2 binary);
+ // debug summary of the union result sketch
+ sketch.to_stream(std::cout);
-insert into sketch_intermediate select data2sketch(id1), data2sketch(id2) from
sketch_input;
+ std::cout << "Distinct count estimate: " <<
sketch.get_estimate() << std::endl;
+ std::cout << "Distinct count lower bound 95% confidence: " <<
sketch.get_lower_bound(2) << std::endl;
+ std::cout << "Distinct count upper bound 95% confidence: " <<
sketch.get_upper_bound(2) << std::endl;
+ }
-select
- estimate(sketch1),
- estimate(sketch2),
- estimate(union2(sketch1, sketch2)),
- estimate((intersect(sketch1, sketch2))),
- estimate(anotb(sketch1, sketch2)),
- estimate(anotb(sketch2, sketch1))
-from sketch_intermediate;
+ return 0;
+}
Output:
-10.0 10.0 15.0 5.0 5.0 5.0
+### CPC sketch summary:
+ lg_k : 10
+ seed hash : 93cc
+ C : 7706
+ flavor : 4
+ merged : true
+ intresting col : 4
+ table entries : 27
+ window : allocated
+ window offset : 5
+### End sketch summary
+Distinct count estimate: 149797
+Distinct count lower bound 95% confidence: 143416
+Distinct count upper bound 95% confidence: 156397
</code></pre></div></div>
</div> <!-- End content -->
diff --git a/output/docs/Theta/ThetaHiveUDFs.html
b/output/docs/CPC/CpcHiveExample.html
similarity index 96%
copy from output/docs/Theta/ThetaHiveUDFs.html
copy to output/docs/CPC/CpcHiveExample.html
index 321fd4d..7219070 100644
--- a/output/docs/Theta/ThetaHiveUDFs.html
+++ b/output/docs/CPC/CpcHiveExample.html
@@ -476,9 +476,7 @@
specific language governing permissions and limitations
under the License.
-->
-<h2 id="hadoop-hive-udfs">Hadoop Hive UDFs</h2>
-
-<p>Depends on sketches-core.</p>
+<h2 id="cpc-sketch-hive-udfs">CPC sketch Hive UDFs</h2>
<h3 id="building-sketches-merging-sketches-and-getting-estimates">Building
sketches, merging sketches and getting estimates</h3>
@@ -486,45 +484,49 @@
add jar datasketches-java-1.2.0-incubating.jar;
add jar datasketches-hive-1.0.0-incubating.jar;
-create temporary function data2sketch as
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function unionSketches as
'org.apache.datasketches.hive.theta.UnionSketchUDAF';
-create temporary function estimate as
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
+create temporary function data2sketch as
'org.apache.datasketches.hive.cpc.DataToSketchUDAF';
+create temporary function unionSketches as
'org.apache.datasketches.hive.cpc.UnionSketchUDAF';
+create temporary function estimate as
'org.apache.datasketches.hive.cpc.GetEstimateUDF';
+create temporary function estimateAndBounds as
'org.apache.datasketches.hive.cpc.GetEstimateAndErrorBoundsUDF';
use <your-db-name-here>;
-create temporary table theta_input (id int, category char(1));
-insert into table theta_input values
+create temporary table sketch_input (id int, category char(1));
+insert into table sketch_input values
(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8,
'a'), (9, 'a'), (10, 'a'),
(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'),
(13, 'b'), (14, 'b'), (15, 'b');
create temporary table sketch_intermediate (category char(1), sketch binary);
-insert into sketch_intermediate select category, data2sketch(id) from
theta_input group by category;
+insert into sketch_intermediate select category, data2sketch(id) from
sketch_input group by category;
select category, estimate(sketch) from sketch_intermediate;
Output:
-a 10.0
-b 10.0
+a 10.007331400971685
+b 10.007331400971685
select estimate(unionSketches(sketch)) from sketch_intermediate;
Output:
-15.0
+15.017114660336853
+
+select estimateAndBounds(unionSketches(sketch)) from sketch_intermediate;
+
+Output:
+[15.017114660336853,15.0,16.0]
</code></pre></div></div>
-<h3 id="set-operations">Set operations</h3>
+<h3 id="union-of-two-sketches">Union of two sketches</h3>
-<p>Notice the difference between UnionUDF in this example, which takes two
sketches, and UnionUDAF in the previous example, which is an aggregate function
taking a collection of sketches as one parameter. The same is true about
IntersectSketchUDF and IntersectSketchUDAF.</p>
+<p>Notice the difference between UnionUDF in this example, which takes two
sketches, and UnionUDAF in the previous example, which is an aggregate function
taking a collection of sketches as one parameter.</p>
<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
add jar datasketches-java-1.2.0-incubating.jar;
add jar datasketches-hive-1.0.0-incubating.jar;
-create temporary function data2sketch as
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function estimate as
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
-create temporary function union2 as
'org.apache.datasketches.hive.theta.UnionSketchUDF';
-create temporary function intersect as
'org.apache.datasketches.hive.theta.IntersectSketchUDF';
-create temporary function anotb as
'org.apache.datasketches.hive.theta.ExcludeSketchUDF';
+create temporary function data2sketch as
'org.apache.datasketches.hive.cpc.DataToSketchUDAF';
+create temporary function union2 as
'org.apache.datasketches.hive.cpc.UnionSketchUDF';
+create temporary function estimate as
'org.apache.datasketches.hive.cpc.GetEstimateUDF';
use <your-db-name-here>;
@@ -539,14 +541,11 @@ insert into sketch_intermediate select data2sketch(id1),
data2sketch(id2) from s
select
estimate(sketch1),
estimate(sketch2),
- estimate(union2(sketch1, sketch2)),
- estimate((intersect(sketch1, sketch2))),
- estimate(anotb(sketch1, sketch2)),
- estimate(anotb(sketch2, sketch1))
+ estimate(union2(sketch1, sketch2))
from sketch_intermediate;
Output:
-10.0 10.0 15.0 5.0 5.0 5.0
+10.007331400971685 10.007331400971685 15.017114660336853
</code></pre></div></div>
</div> <!-- End content -->
diff --git a/output/docs/Theta/ThetaHiveUDFs.html
b/output/docs/CPC/CpcJavaExample.html
similarity index 88%
copy from output/docs/Theta/ThetaHiveUDFs.html
copy to output/docs/CPC/CpcJavaExample.html
index 321fd4d..82c4e20 100644
--- a/output/docs/Theta/ThetaHiveUDFs.html
+++ b/output/docs/CPC/CpcJavaExample.html
@@ -476,77 +476,86 @@
specific language governing permissions and limitations
under the License.
-->
-<h2 id="hadoop-hive-udfs">Hadoop Hive UDFs</h2>
-
-<p>Depends on sketches-core.</p>
-
-<h3 id="building-sketches-merging-sketches-and-getting-estimates">Building
sketches, merging sketches and getting estimates</h3>
-
-<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
-add jar datasketches-java-1.2.0-incubating.jar;
-add jar datasketches-hive-1.0.0-incubating.jar;
-
-create temporary function data2sketch as
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function unionSketches as
'org.apache.datasketches.hive.theta.UnionSketchUDAF';
-create temporary function estimate as
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
-
-use <your-db-name-here>;
-
-create temporary table theta_input (id int, category char(1));
-insert into table theta_input values
- (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8,
'a'), (9, 'a'), (10, 'a'),
- (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'),
(13, 'b'), (14, 'b'), (15, 'b');
-
-create temporary table sketch_intermediate (category char(1), sketch binary);
-insert into sketch_intermediate select category, data2sketch(id) from
theta_input group by category;
-
-select category, estimate(sketch) from sketch_intermediate;
-
-Output:
-a 10.0
-b 10.0
-
-select estimate(unionSketches(sketch)) from sketch_intermediate;
-
-Output:
-15.0
-</code></pre></div></div>
-
-<h3 id="set-operations">Set operations</h3>
-
-<p>Notice the difference between UnionUDF in this example, which takes two
sketches, and UnionUDAF in the previous example, which is an aggregate function
taking a collection of sketches as one parameter. The same is true about
IntersectSketchUDF and IntersectSketchUDAF.</p>
-
-<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
-add jar datasketches-java-1.2.0-incubating.jar;
-add jar datasketches-hive-1.0.0-incubating.jar;
-
-create temporary function data2sketch as
'org.apache.datasketches.hive.theta.DataToSketchUDAF';
-create temporary function estimate as
'org.apache.datasketches.hive.theta.EstimateSketchUDF';
-create temporary function union2 as
'org.apache.datasketches.hive.theta.UnionSketchUDF';
-create temporary function intersect as
'org.apache.datasketches.hive.theta.IntersectSketchUDF';
-create temporary function anotb as
'org.apache.datasketches.hive.theta.ExcludeSketchUDF';
-
-use <your-db-nasme-here>;
-
-create temporary table sketch_input (id1 int, id2 int);
-insert into table sketch_input values
- (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18),
(10, 20);
-
-create temporary table sketch_intermediate (sketch1 binary, sketch2 binary);
+<h1 id="cpc-sketch-java-example">CPC Sketch Java Example</h1>
+
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import com.yahoo.memory.Memory;
+import com.yahoo.sketches.cpc.CpcSketch;
+import com.yahoo.sketches.cpc.CpcUnion;
+
+//simplified file operations and no error handling for clarity
+public class CpcExample {
+
+ public static void main(String[] args) throws Exception {
+ final int lgK = 10;
+ // this section generates two sketches with some overlap and serializes
them into files
+ {
+ // 100000 distinct keys
+ CpcSketch sketch1 = new CpcSketch(lgK);
+ for (int key = 0; key < 100000; key++) sketch1.update(key);
+ FileOutputStream out1 = new FileOutputStream("CpcSketch1.bin");
+ out1.write(sketch1.toByteArray());
+ out1.close();
+
+ // 100000 distinct keys
+ CpcSketch sketch2 = new CpcSketch(lgK);
+ for (int key = 50000; key < 150000; key++) sketch2.update(key);
+ FileOutputStream out2 = new FileOutputStream("CpcSketch2.bin");
+ out2.write(sketch2.toByteArray());
+ out2.close();
+ }
-insert into sketch_intermediate select data2sketch(id1), data2sketch(id2) from
sketch_input;
+ // this section deserializes the sketches, produces a union and prints the
result
+ {
+ FileInputStream in1 = new FileInputStream("CpcSketch1.bin");
+ byte[] bytes1 = new byte[in1.available()];
+ in1.read(bytes1);
+ in1.close();
+ CpcSketch sketch1 = CpcSketch.heapify(Memory.wrap(bytes1));
+
+ FileInputStream in2 = new FileInputStream("CpcSketch2.bin");
+ byte[] bytes2 = new byte[in2.available()];
+ in2.read(bytes2);
+ in2.close();
+ CpcSketch sketch2 = CpcSketch.heapify(Memory.wrap(bytes2));
+
+ CpcUnion union = new CpcUnion(lgK);
+ union.update(sketch1);
+ union.update(sketch2);
+ CpcSketch result = union.getResult();
+
+ // debug summary of the union result sketch
+ System.out.println(result.toString());
+
+ System.out.println("Distinct count estimate: " + result.getEstimate());
+ System.out.println("Distinct count lower bound 95% confidence: " +
result.getLowerBound(2));
+ System.out.println("Distinct count upper bound 95% confidence: " +
result.getUpperBound(2));
+ }
+ }
-select
- estimate(sketch1),
- estimate(sketch2),
- estimate(union2(sketch1, sketch2)),
- estimate((intersect(sketch1, sketch2))),
- estimate(anotb(sketch1, sketch2)),
- estimate(anotb(sketch2, sketch1))
-from sketch_intermediate;
+}
Output:
-10.0 10.0 15.0 5.0 5.0 5.0
+### CPD SKETCH - PREAMBLE:
+ Flavor : SLIDING
+ LgK : 10
+ Merge Flag : true
+ Error Const : 0.6931471805599453
+ RSE : 0.02166084939249829
+ Seed Hash : 93cc | 37836
+ Num Coupons : 7706
+ Num Pairs (SV) : 27
+ First Inter Col: 4
+ Valid Window : true
+ Valid PairTable: true
+ Window Offset : 5
+ KxP : 1024.0
+ HIP Accum : 0.0
+### END CPC SKETCH
+Distinct count estimate: 149796.50599220005
+Distinct count lower bound 95% confidence: 143416.2744812169
+Distinct count upper bound 95% confidence: 156397.0
</code></pre></div></div>
</div> <!-- End content -->
diff --git a/output/docs/Theta/ThetaPigUDFs.html
b/output/docs/CPC/CpcPigExample.html
similarity index 90%
copy from output/docs/Theta/ThetaPigUDFs.html
copy to output/docs/CPC/CpcPigExample.html
index 3e8ac49..a9bf523 100644
--- a/output/docs/Theta/ThetaPigUDFs.html
+++ b/output/docs/CPC/CpcPigExample.html
@@ -476,32 +476,34 @@
specific language governing permissions and limitations
under the License.
-->
-<h2 id="theta-sketch-pig-udfs">Theta Sketch Pig UDFs</h2>
+<h2 id="cpc-sketch-pig-udfs">CPC Sketch Pig UDFs</h2>
<h3 id="instructions">Instructions</h3>
<ul>
<li>get jars</li>
- <li>save the following script as theta.pig</li>
+ <li>save the following script as cpc.pig</li>
<li>adjust jar versions and paths if necessary</li>
<li>save the below data into a file called “data.txt”</li>
<li>copy data to hdfs: “hdfs dfs -copyFromLocal data.txt”</li>
- <li>run pig script: “pig theta.pig”</li>
+ <li>run pig script: “pig cpc.pig”</li>
</ul>
-<h3
id="thetapig-script-building-sketches-merging-sketches-and-getting-estimates">theta.pig
script: building sketches, merging sketches and getting estimates</h3>
+<h3
id="cpcpig-script-building-sketches-merging-sketches-and-getting-estimates">cpc.pig
script: building sketches, merging sketches and getting estimates</h3>
<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>register datasketches-memory-1.2.0-incubating.jar;
register datasketches-java-1.2.0-incubating.jar;
register datasketches-pig-1.0.0-incubating.jar;
-define dataToSketch org.apache.datasketches.pig.theta.DataToSketch('32');
-define unionSketch org.apache.datasketches.pig.theta.Union('32');
-define getEstimate org.apache.datasketches.pig.theta.Estimate();
+define dataToSketch org.apache.datasketches.pig.cpc.DataToSketch('12');
+define unionSketch org.apache.datasketches.pig.cpc.UnionSketch('12');
+define getEstimate org.apache.datasketches.pig.cpc.GetEstimate();
+define getEstimateAndBounds
org.apache.datasketches.pig.cpc.GetEstimateAndErrorBounds('3');
+define toString org.apache.datasketches.pig.cpc.SketchToString();
a = load 'data.txt' as (id, category);
b = group a by category;
-c = foreach b generate flatten(group) as (category),
flatten(dataToSketch(a.id)) as (sketch);
+c = foreach b generate flatten(group) as (category), dataToSketch(a.id) as
sketch;
-- Sketches can be stored at this point in binary format to be used later:
-- store c into 'intermediate/$date' using BinStorage();
-- The next two lines print the results in human readable form for the purpose
of this example
@@ -511,11 +513,13 @@ dump d;
-- This can be a separate query
-- For example, the first part can produce a daily intermediate feed and store
it,
-- and this part can load several instances of this daily intermediate feed
and merge them
--- c = load 'intermediate/$date1,intermediate/$date2' using BinStorage() as
(category, sketch);
e = group c all;
-f = foreach e generate flatten(unionSketch(c.sketch)) as (sketch);
+f = foreach e generate unionSketch(c.sketch) as sketch;
g = foreach f generate getEstimate(sketch);
dump g;
+
+h = foreach f generate flatten(getEstimateAndBounds(sketch)) as (estimate, lb,
ub);
+dump h;
</code></pre></div></div>
<h3 id="datatxt-tab-separated"><a href="/docs/Theta/data.txt">data.txt</a>
(tab separated)</h3>
@@ -527,56 +531,18 @@ Most of the IDs in these categories overlap, so that
there are 60 unique IDs in
<p>Results:
From ‘dump d’:</p>
-<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>(a,46.91487058420659)
-(b,46.23988568048073)
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>(a,50.09992602861082)
+(b,50.09992602861082)
</code></pre></div></div>
<p>From ‘dump g’ (merged across categories):</p>
-<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>(50.415577215639736)
-</code></pre></div></div>
-
-<p>The expected exact result would be (60.0). The estimate has high relative
error because the sketch was configured with only 32 nominal entries.</p>
-
-<h3 id="theta_setopspig-script-set-operations-on-sketches">theta_setops.pig
script: set operations on sketches</h3>
-
-<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>register datasketches-memory-1.2.0-incubating.jar;
-register datasketches-java-1.2.0-incubating.jar;
-register datasketches-pig-1.0.0-incubating.jar;
-
-define dataToSketch org.apache.datasketches.pig.theta.DataToSketch('32');
-define unionSketch org.apache.datasketches.pig.theta.Union();
-define intersect org.apache.datasketches.pig.theta.Intersect();
-define anotb org.apache.datasketches.pig.theta.AexcludeB();
-define estimate org.apache.datasketches.pig.theta.Estimate();
-
-a = load 'setops_data.txt' as (id1, id2);
-b = group a all;
-c = foreach b generate
- flatten(dataToSketch(a.id1)) as (sketch1),
- flatten(dataToSketch(a.id2)) as (sketch2);
-d = foreach c generate
- sketch1, -- pass sketches through to have all estimates in one place
- sketch2,
- flatten(unionSketch(TOBAG(sketch1, sketch2))) as (a_union_b),
- flatten(intersect(TOBAG(sketch1, sketch2))) as (a_intersect_b),
- flatten(anotb(sketch1, sketch2)) as (a_not_b),
- flatten(anotb(sketch2, sketch1)) as (b_not_a);
-e = foreach d generate
- estimate(sketch1),
- estimate(sketch2),
- estimate(a_union_b),
- estimate(a_intersect_b),
- estimate(a_not_b),
- estimate(b_not_a);
-dump e;
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>(60.14445031168714)
</code></pre></div></div>
-<h3 id="setops_datatxt-tab-separated"><a
href="/docs/Theta/setops_data.txt">setops_data.txt</a> (tab separated)</h3>
-
-<p>Result:</p>
+<p>From ‘dump h’ (with error bounds, 99% confidence interval):</p>
-<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>(10.0,12.0,18.0,4.0,6.0,8.0)
+<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>(60.14445031168714,60.0,63.0)
</code></pre></div></div>
</div> <!-- End content -->
diff --git a/output/docs/HLL/HllHiveUDFs.html b/output/docs/HLL/HllHiveUDFs.html
index 5d71a49..1fe64d6 100644
--- a/output/docs/HLL/HllHiveUDFs.html
+++ b/output/docs/HLL/HllHiveUDFs.html
@@ -478,8 +478,6 @@
-->
<h2 id="hyper-log-log-sketch-hive-udfs">Hyper Log Log sketch Hive UDFs</h2>
-<p>This functionality appeared in sketches-hive-0.10.1. Depends on
sketches-core-0.10.0 and memory-0.10.2.</p>
-
<h3 id="building-sketches-computing-unions-and-getting-estimates">Building
sketches, computing unions and getting estimates</h3>
<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
diff --git a/output/docs/Theta/ThetaHiveUDFs.html
b/output/docs/Theta/ThetaHiveUDFs.html
index 321fd4d..aaaa82a 100644
--- a/output/docs/Theta/ThetaHiveUDFs.html
+++ b/output/docs/Theta/ThetaHiveUDFs.html
@@ -478,8 +478,6 @@
-->
<h2 id="hadoop-hive-udfs">Hadoop Hive UDFs</h2>
-<p>Depends on sketches-core.</p>
-
<h3 id="building-sketches-merging-sketches-and-getting-estimates">Building
sketches, merging sketches and getting estimates</h3>
<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>add jar datasketches-memory-1.2.0-incubating.jar;
diff --git a/output/docs/Theta/ThetaPigUDFs.html
b/output/docs/Theta/ThetaPigUDFs.html
index 3e8ac49..7c3980d 100644
--- a/output/docs/Theta/ThetaPigUDFs.html
+++ b/output/docs/Theta/ThetaPigUDFs.html
@@ -536,7 +536,8 @@ From ‘dump d’:</p>
<div class="highlighter-rouge"><div class="highlight"><pre
class="highlight"><code>(50.415577215639736)
</code></pre></div></div>
-<p>The expected exact result would be (60.0). The estimate has high relative
error because the sketch was configured with only 32 nominal entries.</p>
+<p>The expected exact result would be (60.0). The estimate has high relative
error because the sketch was configured with only 32 nominal entries
+to show the estimation mode for the purposes of this example.</p>
<h3 id="theta_setopspig-script-set-operations-on-sketches">theta_setops.pig
script: set operations on sketches</h3>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]