Repository: incubator-madlib
Updated Branches:
  refs/heads/master 80410635a -> e3c98a094


Sketch: Remove per-tuple checks

Some of the sketch functions have checks running for each tuple in their
aggregate. These checks include invalid transition state and invalid
types for input data. The checks are important for the functions if run
outside an aggregate context, but are a waste of cycles when called as
an agg. The checks include caql calls that were estimated to eat a large
chunk of the runtime. This work removes these checks - the average time
saved is estimated to be around 35% for datasets ranging in size from 10
million to 1 billion tuples.

Closes #135


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/e3c98a09
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/e3c98a09
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/e3c98a09

Branch: refs/heads/master
Commit: e3c98a0942419684d83e8b1a0f7bc777d5e1ddb3
Parents: 8041063
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue May 23 11:44:37 2017 -0700
Committer: Orhan Kislal <okis...@pivotal.io>
Committed: Tue May 23 11:46:31 2017 -0700

----------------------------------------------------------------------
 methods/sketch/src/pg_gp/fm.c           | 15 +++++++--------
 methods/sketch/src/pg_gp/mfvsketch.c    | 17 ++++++++++-------
 methods/sketch/src/pg_gp/sql/fm.sql_in  |  2 +-
 methods/sketch/src/pg_gp/sql/mfv.sql_in |  9 +++++----
 4 files changed, 23 insertions(+), 20 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e3c98a09/methods/sketch/src/pg_gp/fm.c
----------------------------------------------------------------------
diff --git a/methods/sketch/src/pg_gp/fm.c b/methods/sketch/src/pg_gp/fm.c
index 7dcb47d..3160933 100644
--- a/methods/sketch/src/pg_gp/fm.c
+++ b/methods/sketch/src/pg_gp/fm.c
@@ -191,9 +191,6 @@ Datum __fmsketch_trans(PG_FUNCTION_ARGS)
     Datum       retval;
     Datum       inval;
 
-    if (!OidIsValid(element_type))
-        elog(ERROR, "could not determine data type of input");
-
     /*
      * This is Postgres boilerplate for UDFs that modify the data in their own 
context.
      * Such UDFs can only be correctly called in an agg context since regular 
scalar
@@ -212,8 +209,10 @@ Datum __fmsketch_trans(PG_FUNCTION_ARGS)
 
     /* get the provided element, being careful in case it's NULL */
     if (!PG_ARGISNULL(1)) {
-        inval = PG_GETARG_DATUM(1);
+        if (!OidIsValid(element_type))
+            elog(ERROR, "could not determine data type of input");
 
+        inval = PG_GETARG_DATUM(1);
         /*
          * if this is the first call, initialize transval to hold a sortasort
          * on the first call, we should have the empty string (if the agg was 
declared properly!)
@@ -238,12 +237,12 @@ Datum __fmsketch_trans(PG_FUNCTION_ARGS)
                            transval->typByVal);
         }
         else {
-            check_fmtransval(transblob);
+            // check_fmtransval(transblob);
             /* extract the existing transval from the transblob */
             transval = (fmtransval *)VARDATA(transblob);
-            if (transval->typOid != element_type) {
-                elog(ERROR, "cannot aggregate on elements with different 
types");
-            }
+            // if (transval->typOid != element_type) {
+            //     elog(ERROR, "cannot aggregate on elements with different 
types");
+            // }
         }
 
         /*

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e3c98a09/methods/sketch/src/pg_gp/mfvsketch.c
----------------------------------------------------------------------
diff --git a/methods/sketch/src/pg_gp/mfvsketch.c 
b/methods/sketch/src/pg_gp/mfvsketch.c
index 28387ff..0b954cc 100644
--- a/methods/sketch/src/pg_gp/mfvsketch.c
+++ b/methods/sketch/src/pg_gp/mfvsketch.c
@@ -110,24 +110,24 @@ Datum __mfvsketch_trans(PG_FUNCTION_ARGS)
           )))
         elog(ERROR,
              "destructive pass by reference outside agg");
-
     /* initialize if this is first call */
     if (VARSIZE(transblob) <= sizeof(MFV_TRANSVAL_SZ(0))) {
         Oid typOid = get_fn_expr_argtype(fcinfo->flinfo, 1);
         transblob = mfv_init_transval(max_mfvs, typOid);
     }
-    else {
-        check_mfvtransval(transblob);
-    }
+    // else {
+    //     check_mfvtransval(transblob);
+        // }
 
     /* ignore NULL inputs */
     if (PG_ARGISNULL(1) || PG_ARGISNULL(2))
         PG_RETURN_DATUM(PointerGetDatum(transblob));
 
     transval = (mfvtransval *)VARDATA(transblob);
-    if (transval->typOid != get_fn_expr_argtype(fcinfo->flinfo, 1)) {
-        elog(ERROR, "cannot aggregate on elements with different types");
-    }
+    // if (transval->typOid != get_fn_expr_argtype(fcinfo->flinfo, 1)) {
+    //     elog(ERROR, "cannot aggregate on elements with different types");
+    // }
+
     /* insert into the countmin sketch */
     md5_datum = countmin_trans_c(transval->sketch,
                                 newdatum,
@@ -212,6 +212,9 @@ bytea *mfv_init_transval(int max_mfvs, Oid typOid)
     bytea *      transblob;
     mfvtransval *transval;
 
+    if (max_mfvs <= 0) {
+        elog(ERROR, "Invalid entry for number of MFV values");
+    }
     /*
      * initialize mfvtransval, using palloc0 to zero it out.
      * if typlen is positive (fixed), size chosen accurately.

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e3c98a09/methods/sketch/src/pg_gp/sql/fm.sql_in
----------------------------------------------------------------------
diff --git a/methods/sketch/src/pg_gp/sql/fm.sql_in 
b/methods/sketch/src/pg_gp/sql/fm.sql_in
index 7735ff6..16e5405 100644
--- a/methods/sketch/src/pg_gp/sql/fm.sql_in
+++ b/methods/sketch/src/pg_gp/sql/fm.sql_in
@@ -27,7 +27,7 @@ begin
 
        -- DROP TABLE IF EXISTS fm_result_table;
        CREATE TABLE fm_result_table AS
-       SELECT (MADLIB_SCHEMA.fmsketch_dcount(a1)) as val FROM fm_data GROUP BY 
class ORDER BY class;
+       SELECT (fmsketch_dcount(a1)) as val FROM fm_data GROUP BY class ORDER 
BY class;
 
        SELECT array( SELECT val FROM fm_result_table) INTO result;
        IF ((result[1] + result[2]) != 5) THEN

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/e3c98a09/methods/sketch/src/pg_gp/sql/mfv.sql_in
----------------------------------------------------------------------
diff --git a/methods/sketch/src/pg_gp/sql/mfv.sql_in 
b/methods/sketch/src/pg_gp/sql/mfv.sql_in
index d3a0344..9294f0f 100644
--- a/methods/sketch/src/pg_gp/sql/mfv.sql_in
+++ b/methods/sketch/src/pg_gp/sql/mfv.sql_in
@@ -12,12 +12,13 @@
 ---------------------------------------------------------------------------
 
 -- Basic methods
-select mfvsketch_top_histogram(i,5)
-from (select * from generate_series(1,100) union all select * from 
generate_series(10,15)) as T(i);
+create table mfv_test as
+    (select generate_series(1,10000) i union all select generate_series(10,15) 
i);
+
+select mfvsketch_top_histogram(i,5) from mfv_test;
 select mfvsketch_top_histogram(utc_offset,5) from pg_timezone_names;
 select mfvsketch_top_histogram(NULL::bytea,5) from generate_series(1,100);
 
-select mfvsketch_quick_histogram(i,5)
-from (select * from generate_series(1,100) union all select * from 
generate_series(10,15)) as T(i);
+select mfvsketch_quick_histogram(i,5) from mfv_test;
 select mfvsketch_quick_histogram(utc_offset,5) from pg_timezone_names;
 select mfvsketch_quick_histogram(NULL::bytea,5) from generate_series(1,100);

Reply via email to