[I] Inaccurate aggregation results when using `doubleSum` aggregator when doing a topN query for versions 31.0.0 and above (druid)

via GitHub Tue, 18 Mar 2025 09:59:07 -0700


climbablebug44 opened a new issue, #17813:
URL: https://github.com/apache/druid/issues/17813


   Inaccurate aggregation results when using `doubleSum` aggregator when doing 
a topN query
   
   ### Affected Version
   31.0.0 and above
   
   ### Description
   - when using rollup metric aggregations on the data(longSum/doubleSum) 
during ingestion, topN queries with granularities smaller than 15 mins provide 
inflated results when using `doubleSum` aggregator alone. 
   - when the metric aggregators have a `longSum` or `floatSum` along with the 
`doubleSum`, it provides correct results.
   
   - Cluster size 
       - mid-size cluster with 3 master services, 6 historicals, 2 brokers
   - Steps to reproduce the problem
       - input data
   ```csv
   2025-02-19T00:00:00.000Z,a, abcd, 1
   2025-02-19T00:00:00.000Z,b, abcd, 1
   2025-02-19T00:00:00.000Z,c, abcd, 1
   2025-02-19T00:01:00.000Z,d, abcd, 2
   2025-02-19T00:01:00.000Z,e, abcd, 2
   2025-02-19T00:01:00.000Z,f, abcd, 2
   2025-02-19T00:02:00.000Z,g, abcd, 3
   2025-02-19T00:02:00.000Z,h, abcd, 3
   2025-02-19T00:02:00.000Z,i, abcd, 3
   ```
   - ingestion payload for the data
   
   ```json
   {
     "type": "index_parallel",
     "id": "index_parallel_topN-bug-check-4_djegegba_2025-02-20T15:54:55.042Z",
     "groupId": 
"index_parallel_topN-bug-check-4_djegegba_2025-02-20T15:54:55.042Z",
     "resource": {
       "availabilityGroup": 
"index_parallel_topN-bug-check-4_djegegba_2025-02-20T15:54:55.042Z",
       "requiredCapacity": 1
     },
     "spec": {
       "dataSchema": {
         "dataSource": "topN-bug-check-4",
         "timestampSpec": {
           "column": "time",
           "format": "iso",
           "missingValue": null
         },
         "dimensionsSpec": {
           "dimensions": [],
           "dimensionExclusions": [
             "val",
             "__time",
             "count",
             "time",
             "sum_val"
           ],
           "includeAllDimensions": false,
           "useSchemaDiscovery": false
         },
         "metricsSpec": [
           {
             "type": "count",
             "name": "count"
           },
           {
             "type": "longSum",
             "name": "sum_val",
             "fieldName": "val"
           }
         ],
         "granularitySpec": {
           "type": "uniform",
           "segmentGranularity": "HOUR",
           "queryGranularity": "MINUTE",
           "rollup": true,
           "intervals": []
         },
         "transformSpec": {
           "filter": null,
           "transforms": []
         }
       },
       "ioConfig": {
         "type": "index_parallel",
         "inputSource": {
           "type": "inline",
           "data": "2025-02-19T00:00:00.000Z,a, east_sc, 
1\n2025-02-19T00:00:00.000Z,b, east_sc, 1\n2025-02-19T00:00:00.000Z,c, east_sc, 
1\n2025-02-19T00:01:00.000Z,d, east_sc, 2\n2025-02-19T00:01:00.000Z,e, east_sc, 
2\n2025-02-19T00:01:00.000Z,f, east_sc, 2\n2025-02-19T00:02:00.000Z,g, east_sc, 
3\n2025-02-19T00:02:00.000Z,h, east_sc, 3\n2025-02-19T00:02:00.000Z,i, east_sc, 
3\n"
         },
         "inputFormat": {
           "type": "csv",
           "columns": [
             "time",
             "tmp",
             "dc",
             "val"
           ]
         },
         "appendToExisting": false,
         "dropExisting": false
       },
       "tuningConfig": {
         "type": "index_parallel",
         "maxRowsPerSegment": 5000000,
         "appendableIndexSpec": {
           "type": "onheap",
           "preserveExistingMetrics": false
         },
         "maxRowsInMemory": 1000000,
         "maxBytesInMemory": 0,
         "skipBytesInMemoryOverheadCheck": false,
         "maxTotalRows": null,
         "numShards": null,
         "splitHintSpec": null,
         "partitionsSpec": {
           "type": "hashed",
           "numShards": null,
           "partitionDimensions": [],
           "partitionFunction": "murmur3_32_abs",
           "maxRowsPerSegment": 5000000
         },
         "indexSpec": {
           "bitmap": {
             "type": "roaring"
           },
           "dimensionCompression": "lz4",
           "stringDictionaryEncoding": {
             "type": "utf8"
           },
           "metricCompression": "lz4",
           "longEncoding": "longs"
         },
         "indexSpecForIntermediatePersists": {
           "bitmap": {
             "type": "roaring"
           },
           "dimensionCompression": "lz4",
           "stringDictionaryEncoding": {
             "type": "utf8"
           },
           "metricCompression": "lz4",
           "longEncoding": "longs"
         },
         "maxPendingPersists": 0,
         "forceGuaranteedRollup": true,
         "reportParseExceptions": false,
         "pushTimeout": 0,
         "segmentWriteOutMediumFactory": null,
         "maxNumConcurrentSubTasks": 1,
         "maxRetry": 3,
         "taskStatusCheckPeriodMs": 1000,
         "chatHandlerTimeout": "PT10S",
         "chatHandlerNumRetries": 5,
         "maxNumSegmentsToMerge": 100,
         "totalNumMergeTasks": 10,
         "logParseExceptions": false,
         "maxParseExceptions": 2147483647,
         "maxSavedParseExceptions": 0,
         "maxColumnsToMerge": -1,
         "awaitSegmentAvailabilityTimeoutMillis": 0,
         "maxAllowedLockCount": -1,
         "numPersistThreads": 1,
         "partitionDimensions": []
       }
     },
     "context": {
       "forceTimeChunkLock": true,
       "useLineageBasedSegmentAllocation": true
     },
     "dataSource": "topN-bug-check-4"
   }
   ```
   
   - query payload
   ```json
   {
       "dimension": {
           "type": "default",
           "dimension": "dc",
           "outputName": "Data center"
       },
       "metric": "val",
       "threshold": 5,
       "dataSource": "topN-bug-check-4",
       "granularity": "minute",
       "intervals": [
           "2025-02-19T00:00Z/2025-02-19T00:03Z"
       ],
       "aggregations": [
           {
               "fieldName": "sum_val",
               "type": "doubleSum",
               "name": "val"
           }
       ],
       "postAggregations": [],
       "context": {
           "queryId": "tbc",
           "vectorize": true
       },
       "queryType": "topN"
   }
   ```
   
   - wrong aggregation results - there is a mismatch in the original data and 
the aggregated data when doubleSum is used alone.
   
![Image](https://github.com/user-attachments/assets/1ad47145-e30f-46ba-bad3-5ab0e7cea91e)
   
   - this shows the correct aggregation results
   
![Image](https://github.com/user-attachments/assets/5aea3ede-022c-4c59-af36-e5df1031298b)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[I] Inaccurate aggregation results when using `doubleSum` aggregator when doing a topN query for versions 31.0.0 and above (druid)

Reply via email to