orhankislal commented on a change in pull request #484: MFVSketch: Remove 
duplicate results on GPDB
URL: https://github.com/apache/madlib/pull/484#discussion_r387366763
 
 

 ##########
 File path: methods/sketch/src/pg_gp/mfvsketch.c
 ##########
 @@ -568,35 +572,77 @@ bytea *mfvsketch_merge_c(bytea *transblob1, bytea 
*transblob2)
     qsort(transval1->mfvs, transval1->next_mfv, sizeof(offsetcnt), 
cnt_cmp_desc);
     qsort(transval2->mfvs, transval2->next_mfv, sizeof(offsetcnt), 
cnt_cmp_desc);
 
+    Datum *values_seen = palloc0((2 * newval->max_mfvs) * sizeof(Datum));
+    unsigned int k;
+
     /* choose top k from transval1 and transval2 */
     for (i = j = cnt = 0;
          cnt < newval->max_mfvs
-         && (j < transval2->next_mfv || i < transval1->next_mfv);
-         cnt++) {
+         && (j < transval2->next_mfv || i < transval1->next_mfv);) {
         Datum iDatum, jDatum;
 
-    if (i < transval1->next_mfv &&
+        if (i < transval1->next_mfv &&
             (j == transval2->next_mfv
              || transval1->mfvs[i].cnt >= transval2->mfvs[j].cnt)) {
-          /* next item comes from transval1 */
-          iDatum = PointerExtractDatum(mfv_transval_getval(transblob1, i),
-                                       transval1->typByVal);
-          newblob = mfv_transval_append(newblob, iDatum);
-          newval = (mfvtransval *)VARDATA(newblob);
-          newval->mfvs[cnt].cnt = transval1->mfvs[i].cnt;
-          i++;
+            /* next item comes from transval1 */
+            iDatum = PointerExtractDatum(mfv_transval_getval(transblob1, i),
+                                         transval1->typByVal);
+            for (k = 0; k < cnt; k++) {
+                if (datumIsEqual(iDatum, values_seen[k], transval1->typByVal, 
transval1->typLen)) {
+                    break;
+                }
+            }
+
+            if (k == cnt) {
+                newblob = mfv_transval_append(newblob, iDatum);
+                newval = (mfvtransval *)VARDATA(newblob);
+                newval->mfvs[cnt].cnt = transval1->mfvs[i].cnt;
+                if (cnt < 2 * transval1->max_mfvs) {
+                    values_seen[cnt] = datumCopy(iDatum, transval1->typByVal,
+                        transval1->typLen);
+                    cnt++;
+                } else {
+                    elog(WARNING, "Merge received more than %d values from 
transition functions", 2*transval1->max_mfvs);
+                }
+            }
+            i++;
         }
         else if (j < transval2->next_mfv &&
                  (i == transval1->next_mfv
                   || transval1->mfvs[i].cnt < transval2->mfvs[j].cnt)) {
-          /* next item comes from transval2 */
-          jDatum = PointerExtractDatum(mfv_transval_getval(transblob2, j),
+            /* next item comes from transval2 */
+            jDatum = PointerExtractDatum(mfv_transval_getval(transblob2, j),
                                        transval2->typByVal);
-          newblob = mfv_transval_append(newblob, jDatum);
-          newval = (mfvtransval *)VARDATA(newblob);
-          newval->mfvs[cnt].cnt = transval2->mfvs[j].cnt;
-          j++;
+
+            for (k = 0; k < cnt; k++) {
+                if (datumIsEqual(jDatum, values_seen[k], transval2->typByVal,
+                                 transval2->typLen)) {
+                    break;
+                }
+            }
+
+            if (k == cnt) {
+                newblob = mfv_transval_append(newblob, jDatum);
+                newval = (mfvtransval *)VARDATA(newblob);
+                newval->mfvs[cnt].cnt = transval2->mfvs[j].cnt;
+                if (cnt < 2*transval1->max_mfvs) {
+                    values_seen[cnt] = datumCopy(jDatum, transval2->typByVal, 
transval2->typLen);
+                    cnt++;
+                } else {
+                    elog(WARNING, "Merge received more than %d values from 
transition functions", 2*transval2->max_mfvs);
+                }
+            }
+            j++;
         }
     }
+
+    if (!transval1->typByVal){
 
 Review comment:
   `transval1->typByVal` is a boolean and set to `true` or `false` during 
initialization (`mfv_init_transval`) by a db function (`get_typbyval`). We have 
this check here because we want to free the Datums if they were passed by 
reference. Pass by value Datums are like primitive types and they are taken 
care of once the array is freed at line 645

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

Reply via email to