kaknikhil commented on a change in pull request #484: MFVSketch: Remove
duplicate results on GPDB
URL: https://github.com/apache/madlib/pull/484#discussion_r386713689
##########
File path: methods/sketch/src/pg_gp/mfvsketch.c
##########
@@ -568,35 +572,77 @@ bytea *mfvsketch_merge_c(bytea *transblob1, bytea
*transblob2)
qsort(transval1->mfvs, transval1->next_mfv, sizeof(offsetcnt),
cnt_cmp_desc);
qsort(transval2->mfvs, transval2->next_mfv, sizeof(offsetcnt),
cnt_cmp_desc);
+ Datum *values_seen = palloc0((2 * newval->max_mfvs) * sizeof(Datum));
+ unsigned int k;
+
/* choose top k from transval1 and transval2 */
for (i = j = cnt = 0;
cnt < newval->max_mfvs
- && (j < transval2->next_mfv || i < transval1->next_mfv);
- cnt++) {
+ && (j < transval2->next_mfv || i < transval1->next_mfv);) {
Datum iDatum, jDatum;
- if (i < transval1->next_mfv &&
+ if (i < transval1->next_mfv &&
(j == transval2->next_mfv
|| transval1->mfvs[i].cnt >= transval2->mfvs[j].cnt)) {
- /* next item comes from transval1 */
- iDatum = PointerExtractDatum(mfv_transval_getval(transblob1, i),
- transval1->typByVal);
- newblob = mfv_transval_append(newblob, iDatum);
- newval = (mfvtransval *)VARDATA(newblob);
- newval->mfvs[cnt].cnt = transval1->mfvs[i].cnt;
- i++;
+ /* next item comes from transval1 */
+ iDatum = PointerExtractDatum(mfv_transval_getval(transblob1, i),
+ transval1->typByVal);
+ for (k = 0; k < cnt; k++) {
+ if (datumIsEqual(iDatum, values_seen[k], transval1->typByVal,
transval1->typLen)) {
+ break;
+ }
+ }
+
+ if (k == cnt) {
+ newblob = mfv_transval_append(newblob, iDatum);
+ newval = (mfvtransval *)VARDATA(newblob);
+ newval->mfvs[cnt].cnt = transval1->mfvs[i].cnt;
+ if (cnt < 2 * transval1->max_mfvs) {
+ values_seen[cnt] = datumCopy(iDatum, transval1->typByVal,
+ transval1->typLen);
+ cnt++;
+ } else {
+ elog(WARNING, "Merge received more than %d values from
transition functions", 2*transval1->max_mfvs);
+ }
+ }
+ i++;
}
else if (j < transval2->next_mfv &&
(i == transval1->next_mfv
|| transval1->mfvs[i].cnt < transval2->mfvs[j].cnt)) {
- /* next item comes from transval2 */
- jDatum = PointerExtractDatum(mfv_transval_getval(transblob2, j),
+ /* next item comes from transval2 */
+ jDatum = PointerExtractDatum(mfv_transval_getval(transblob2, j),
transval2->typByVal);
- newblob = mfv_transval_append(newblob, jDatum);
- newval = (mfvtransval *)VARDATA(newblob);
- newval->mfvs[cnt].cnt = transval2->mfvs[j].cnt;
- j++;
+
+ for (k = 0; k < cnt; k++) {
+ if (datumIsEqual(jDatum, values_seen[k], transval2->typByVal,
+ transval2->typLen)) {
+ break;
+ }
+ }
+
+ if (k == cnt) {
+ newblob = mfv_transval_append(newblob, jDatum);
+ newval = (mfvtransval *)VARDATA(newblob);
+ newval->mfvs[cnt].cnt = transval2->mfvs[j].cnt;
+ if (cnt < 2*transval1->max_mfvs) {
+ values_seen[cnt] = datumCopy(jDatum, transval2->typByVal,
transval2->typLen);
+ cnt++;
+ } else {
+ elog(WARNING, "Merge received more than %d values from
transition functions", 2*transval2->max_mfvs);
+ }
+ }
+ j++;
}
}
+
+ if (!transval1->typByVal){
Review comment:
Why do we have this check here ? Is it possible that `transval1->typByVal`
is null but values_seen has also been allocated ?
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services