prabcs opened a new issue #10693:
URL: https://github.com/apache/druid/issues/10693


   Please provide a detailed title (e.g. "Broker crashes when using TopN query 
with Bound filter" instead of just "Broker crashes").
   
   ### Affected Version
   
   0.20.0
   
   
   ### Description
   
   <details>
   <summary>
   Click here to see ingest spec with maxRowsPerSegment
   </summary>
   
   
   ```json
   {
     "type": "index_parallel",
     "spec": {
       "dataSchema": {
         "dataSource": "test_s_d_prabh_1",
         "timestampSpec": {
           "column": "timestamp",
           "format": "iso",
           "missingValue": null
         },
         "dimensionsSpec": {
           "dimensions": [
             {
               "type": "string",
               "name": "countryIsoCode",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "countryName",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "flags",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isAnonymous",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isMinor",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isNew",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isRobot",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isUnpatrolled",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "metroCode",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "namespace",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "regionIsoCode",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             }
           ],
           "dimensionExclusions": [
             "sum_commentLength",
             "added",
             "count",
             "delta",
             "sum_deleted",
             "deltaBucket",
             "deleted",
             "sum_deltaBucket",
             "commentLength",
             "sum_added",
             "timestamp",
             "sum_delta"
           ]
         },
         "metricsSpec": [
           {
             "type": "count",
             "name": "count"
           },
           {
             "type": "longSum",
             "name": "sum_added",
             "fieldName": "added",
             "expression": null
           },
           {
             "type": "longSum",
             "name": "sum_commentLength",
             "fieldName": "commentLength",
             "expression": null
           },
           {
             "type": "longSum",
             "name": "sum_deleted",
             "fieldName": "deleted",
             "expression": null
           },
           {
             "type": "longSum",
             "name": "sum_delta",
             "fieldName": "delta",
             "expression": null
           },
           {
             "type": "longSum",
             "name": "sum_deltaBucket",
             "fieldName": "deltaBucket",
             "expression": null
           }
         ],
         "granularitySpec": {
           "type": "uniform",
           "segmentGranularity": "DAY",
           "queryGranularity": "DAY",
           "rollup": true,
           "intervals": [
             "2015-01-01T00:00:00.000Z/2017-01-01T00:00:00.000Z"
           ]
         },
         "transformSpec": {
           "filter": null,
           "transforms": []
         }
       },
       "ioConfig": {
         "type": "index_parallel",
         "inputSource": {
           "type": "http",
           "uris": [
             "https://druid.apache.org/data/wikipedia.json.gz"
           ],
           "httpAuthenticationUsername": null,
           "httpAuthenticationPassword": null
         },
         "inputFormat": {
           "type": "json",
           "flattenSpec": {
             "useFieldDiscovery": true,
             "fields": []
           },
           "featureSpec": {}
         },
         "appendToExisting": false
       },
       "tuningConfig": {
         "type": "index_parallel",
         "maxRowsPerSegment": 10,
         "maxRowsInMemory": 1000000,
         "maxBytesInMemory": 0,
         "maxTotalRows": null,
         "numShards": null,
         "splitHintSpec": null,
         "partitionsSpec": {
           "type": "single_dim",
           "targetRowsPerSegment": null,
           "maxRowsPerSegment": 10,
           "partitionDimension": "namespace",
           "assumeGrouped": false
         },
         "indexSpec": {
           "bitmap": {
             "type": "roaring",
             "compressRunOnSerialization": true
           },
           "dimensionCompression": "lz4",
           "metricCompression": "lz4",
           "longEncoding": "longs",
           "segmentLoader": null
         },
         "indexSpecForIntermediatePersists": {
           "bitmap": {
             "type": "roaring",
             "compressRunOnSerialization": true
           },
           "dimensionCompression": "lz4",
           "metricCompression": "lz4",
           "longEncoding": "longs",
           "segmentLoader": null
         },
         "maxPendingPersists": 0,
         "forceGuaranteedRollup": true,
         "reportParseExceptions": false,
         "pushTimeout": 0,
         "segmentWriteOutMediumFactory": null,
         "maxNumConcurrentSubTasks": 30,
         "maxRetry": 3,
         "taskStatusCheckPeriodMs": 1000,
         "chatHandlerTimeout": "PT10S",
         "chatHandlerNumRetries": 5,
         "maxNumSegmentsToMerge": 100,
         "totalNumMergeTasks": 10,
         "logParseExceptions": false,
         "maxParseExceptions": 2147483647,
         "maxSavedParseExceptions": 0,
         "buildV9Directly": true,
         "partitionDimensions": []
       }
     }
   }
   ```
   </details>
   
   Check the result dataset after ingesting the above in Druid v 0.20.0 using 
this query:
   
    ```sql
   select * from sys.segments 
   where datasource = 'test_s_d_prabh_1'
   order by "num_rows" desc
   ```
   
   You'll notice the skewed segment sizes and number of rows.
   
   <img width="1153" alt="Screen Shot 2020-12-13 at 10 51 13 PM" 
src="https://user-images.githubusercontent.com/4270624/102038907-76a75a80-3d96-11eb-9258-decd98501450.png">
   
   
   Notice that [as per 
docs](https://druid.apache.org/docs/latest/ingestion/native-batch.html#single-dimension-range-partitioning),
 the config is a soft max, not hard:
   
   > maxRowsPerSegment | Soft max for the number of rows to include in a 
partition
   
   -------------------
   
   Now run another ingestion with the following spec, which uses 
`targetRowsPerSegment`:
   
   <details>
   <summary>
   Click here to see the ingest spec with that new config
   </summary>
   
   
   ```json
   {
     "type": "index_parallel",
     "spec": {
       "dataSchema": {
         "dataSource": "test_s_d_prabh_2",
         "timestampSpec": {
           "column": "timestamp",
           "format": "iso",
           "missingValue": null
         },
         "dimensionsSpec": {
           "dimensions": [
             {
               "type": "string",
               "name": "countryIsoCode",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "countryName",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "flags",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isAnonymous",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isMinor",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isNew",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isRobot",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "isUnpatrolled",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "metroCode",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "namespace",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             },
             {
               "type": "string",
               "name": "regionIsoCode",
               "multiValueHandling": "SORTED_ARRAY",
               "createBitmapIndex": true
             }
           ],
           "dimensionExclusions": [
             "sum_commentLength",
             "added",
             "count",
             "delta",
             "sum_deleted",
             "deltaBucket",
             "deleted",
             "sum_deltaBucket",
             "commentLength",
             "sum_added",
             "timestamp",
             "sum_delta"
           ]
         },
         "metricsSpec": [
           {
             "type": "count",
             "name": "count"
           },
           {
             "type": "longSum",
             "name": "sum_added",
             "fieldName": "added",
             "expression": null
           },
           {
             "type": "longSum",
             "name": "sum_commentLength",
             "fieldName": "commentLength",
             "expression": null
           },
           {
             "type": "longSum",
             "name": "sum_deleted",
             "fieldName": "deleted",
             "expression": null
           },
           {
             "type": "longSum",
             "name": "sum_delta",
             "fieldName": "delta",
             "expression": null
           },
           {
             "type": "longSum",
             "name": "sum_deltaBucket",
             "fieldName": "deltaBucket",
             "expression": null
           }
         ],
         "granularitySpec": {
           "type": "uniform",
           "segmentGranularity": "DAY",
           "queryGranularity": "DAY",
           "rollup": true,
           "intervals": [
             "2015-01-01T00:00:00.000Z/2017-01-01T00:00:00.000Z"
           ]
         },
         "transformSpec": {
           "filter": null,
           "transforms": []
         }
       },
       "ioConfig": {
         "type": "index_parallel",
         "inputSource": {
           "type": "http",
           "uris": [
             "https://druid.apache.org/data/wikipedia.json.gz"
           ],
           "httpAuthenticationUsername": null,
           "httpAuthenticationPassword": null
         },
         "inputFormat": {
           "type": "json",
           "flattenSpec": {
             "useFieldDiscovery": true,
             "fields": []
           },
           "featureSpec": {}
         },
         "appendToExisting": false
       },
       "tuningConfig": {
         "type": "index_parallel",
         "maxRowsPerSegment": 30,
         "maxRowsInMemory": 1000000,
         "maxBytesInMemory": 0,
         "maxTotalRows": null,
         "numShards": null,
         "splitHintSpec": null,
         "partitionsSpec": {
           "type": "single_dim",
           "targetRowsPerSegment": 20,
           "maxRowsPerSegment": null,
           "partitionDimension": "namespace",
           "assumeGrouped": false
         },
         "indexSpec": {
           "bitmap": {
             "type": "roaring",
             "compressRunOnSerialization": true
           },
           "dimensionCompression": "lz4",
           "metricCompression": "lz4",
           "longEncoding": "longs",
           "segmentLoader": null
         },
         "indexSpecForIntermediatePersists": {
           "bitmap": {
             "type": "roaring",
             "compressRunOnSerialization": true
           },
           "dimensionCompression": "lz4",
           "metricCompression": "lz4",
           "longEncoding": "longs",
           "segmentLoader": null
         },
         "maxPendingPersists": 0,
         "forceGuaranteedRollup": true,
         "reportParseExceptions": false,
         "pushTimeout": 0,
         "segmentWriteOutMediumFactory": null,
         "maxNumConcurrentSubTasks": 10,
         "maxRetry": 3,
         "taskStatusCheckPeriodMs": 1000,
         "chatHandlerTimeout": "PT10S",
         "chatHandlerNumRetries": 5,
         "maxNumSegmentsToMerge": 100,
         "totalNumMergeTasks": 10,
         "logParseExceptions": false,
         "maxParseExceptions": 2147483647,
         "maxSavedParseExceptions": 0,
         "buildV9Directly": true,
         "partitionDimensions": []
       }
     }
   }
   ```
   
   </details>
   
   Now, run this query:
   
   ```sql
   select * from sys.segments 
   where datasource = 'test_s_d_prabh_2'
   order by "num_rows" desc
   ```
   
   You'll still notice skewed segments in terms of size and number of rows.
   
   <img width="1154" alt="Screen Shot 2020-12-13 at 11 12 36 PM" 
src="https://user-images.githubusercontent.com/4270624/102039729-b3745100-3d98-11eb-8152-b3d6ef42765b.png">
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to