prabcs opened a new issue #10693:
URL: https://github.com/apache/druid/issues/10693
Please provide a detailed title (e.g. "Broker crashes when using TopN query
with Bound filter" instead of just "Broker crashes").
### Affected Version
0.20.0
### Description
<details>
<summary>
Click here to see ingest spec with maxRowsPerSegment
</summary>
```json
{
"type": "index_parallel",
"spec": {
"dataSchema": {
"dataSource": "test_s_d_prabh_1",
"timestampSpec": {
"column": "timestamp",
"format": "iso",
"missingValue": null
},
"dimensionsSpec": {
"dimensions": [
{
"type": "string",
"name": "countryIsoCode",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "countryName",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "flags",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isAnonymous",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isMinor",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isNew",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isRobot",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isUnpatrolled",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "metroCode",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "namespace",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "regionIsoCode",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
}
],
"dimensionExclusions": [
"sum_commentLength",
"added",
"count",
"delta",
"sum_deleted",
"deltaBucket",
"deleted",
"sum_deltaBucket",
"commentLength",
"sum_added",
"timestamp",
"sum_delta"
]
},
"metricsSpec": [
{
"type": "count",
"name": "count"
},
{
"type": "longSum",
"name": "sum_added",
"fieldName": "added",
"expression": null
},
{
"type": "longSum",
"name": "sum_commentLength",
"fieldName": "commentLength",
"expression": null
},
{
"type": "longSum",
"name": "sum_deleted",
"fieldName": "deleted",
"expression": null
},
{
"type": "longSum",
"name": "sum_delta",
"fieldName": "delta",
"expression": null
},
{
"type": "longSum",
"name": "sum_deltaBucket",
"fieldName": "deltaBucket",
"expression": null
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "DAY",
"queryGranularity": "DAY",
"rollup": true,
"intervals": [
"2015-01-01T00:00:00.000Z/2017-01-01T00:00:00.000Z"
]
},
"transformSpec": {
"filter": null,
"transforms": []
}
},
"ioConfig": {
"type": "index_parallel",
"inputSource": {
"type": "http",
"uris": [
"https://druid.apache.org/data/wikipedia.json.gz"
],
"httpAuthenticationUsername": null,
"httpAuthenticationPassword": null
},
"inputFormat": {
"type": "json",
"flattenSpec": {
"useFieldDiscovery": true,
"fields": []
},
"featureSpec": {}
},
"appendToExisting": false
},
"tuningConfig": {
"type": "index_parallel",
"maxRowsPerSegment": 10,
"maxRowsInMemory": 1000000,
"maxBytesInMemory": 0,
"maxTotalRows": null,
"numShards": null,
"splitHintSpec": null,
"partitionsSpec": {
"type": "single_dim",
"targetRowsPerSegment": null,
"maxRowsPerSegment": 10,
"partitionDimension": "namespace",
"assumeGrouped": false
},
"indexSpec": {
"bitmap": {
"type": "roaring",
"compressRunOnSerialization": true
},
"dimensionCompression": "lz4",
"metricCompression": "lz4",
"longEncoding": "longs",
"segmentLoader": null
},
"indexSpecForIntermediatePersists": {
"bitmap": {
"type": "roaring",
"compressRunOnSerialization": true
},
"dimensionCompression": "lz4",
"metricCompression": "lz4",
"longEncoding": "longs",
"segmentLoader": null
},
"maxPendingPersists": 0,
"forceGuaranteedRollup": true,
"reportParseExceptions": false,
"pushTimeout": 0,
"segmentWriteOutMediumFactory": null,
"maxNumConcurrentSubTasks": 30,
"maxRetry": 3,
"taskStatusCheckPeriodMs": 1000,
"chatHandlerTimeout": "PT10S",
"chatHandlerNumRetries": 5,
"maxNumSegmentsToMerge": 100,
"totalNumMergeTasks": 10,
"logParseExceptions": false,
"maxParseExceptions": 2147483647,
"maxSavedParseExceptions": 0,
"buildV9Directly": true,
"partitionDimensions": []
}
}
}
```
</details>
After ingesting the above spec in Druid 0.20.0, inspect the resulting
segments with this query:
```sql
select * from sys.segments
where datasource = 'test_s_d_prabh_1'
order by "num_rows" desc
```
You'll notice that the segments are skewed in both size and number of rows.
<img width="1153" alt="Screen Shot 2020-12-13 at 10 51 13 PM"
src="https://user-images.githubusercontent.com/4270624/102038907-76a75a80-3d96-11eb-9258-decd98501450.png">
Note that, [as per the
docs](https://druid.apache.org/docs/latest/ingestion/native-batch.html#single-dimension-range-partitioning),
`maxRowsPerSegment` is a soft max, not a hard limit:
> maxRowsPerSegment | Soft max for the number of rows to include in a
> partition
-------------------
Now run another ingestion with the following spec, which uses
`targetRowsPerSegment` instead:
<details>
<summary>
Click here to see the ingest spec with targetRowsPerSegment
</summary>
```json
{
"type": "index_parallel",
"spec": {
"dataSchema": {
"dataSource": "test_s_d_prabh_2",
"timestampSpec": {
"column": "timestamp",
"format": "iso",
"missingValue": null
},
"dimensionsSpec": {
"dimensions": [
{
"type": "string",
"name": "countryIsoCode",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "countryName",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "flags",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isAnonymous",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isMinor",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isNew",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isRobot",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "isUnpatrolled",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "metroCode",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "namespace",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
},
{
"type": "string",
"name": "regionIsoCode",
"multiValueHandling": "SORTED_ARRAY",
"createBitmapIndex": true
}
],
"dimensionExclusions": [
"sum_commentLength",
"added",
"count",
"delta",
"sum_deleted",
"deltaBucket",
"deleted",
"sum_deltaBucket",
"commentLength",
"sum_added",
"timestamp",
"sum_delta"
]
},
"metricsSpec": [
{
"type": "count",
"name": "count"
},
{
"type": "longSum",
"name": "sum_added",
"fieldName": "added",
"expression": null
},
{
"type": "longSum",
"name": "sum_commentLength",
"fieldName": "commentLength",
"expression": null
},
{
"type": "longSum",
"name": "sum_deleted",
"fieldName": "deleted",
"expression": null
},
{
"type": "longSum",
"name": "sum_delta",
"fieldName": "delta",
"expression": null
},
{
"type": "longSum",
"name": "sum_deltaBucket",
"fieldName": "deltaBucket",
"expression": null
}
],
"granularitySpec": {
"type": "uniform",
"segmentGranularity": "DAY",
"queryGranularity": "DAY",
"rollup": true,
"intervals": [
"2015-01-01T00:00:00.000Z/2017-01-01T00:00:00.000Z"
]
},
"transformSpec": {
"filter": null,
"transforms": []
}
},
"ioConfig": {
"type": "index_parallel",
"inputSource": {
"type": "http",
"uris": [
"https://druid.apache.org/data/wikipedia.json.gz"
],
"httpAuthenticationUsername": null,
"httpAuthenticationPassword": null
},
"inputFormat": {
"type": "json",
"flattenSpec": {
"useFieldDiscovery": true,
"fields": []
},
"featureSpec": {}
},
"appendToExisting": false
},
"tuningConfig": {
"type": "index_parallel",
"maxRowsPerSegment": 30,
"maxRowsInMemory": 1000000,
"maxBytesInMemory": 0,
"maxTotalRows": null,
"numShards": null,
"splitHintSpec": null,
"partitionsSpec": {
"type": "single_dim",
"targetRowsPerSegment": 20,
"maxRowsPerSegment": null,
"partitionDimension": "namespace",
"assumeGrouped": false
},
"indexSpec": {
"bitmap": {
"type": "roaring",
"compressRunOnSerialization": true
},
"dimensionCompression": "lz4",
"metricCompression": "lz4",
"longEncoding": "longs",
"segmentLoader": null
},
"indexSpecForIntermediatePersists": {
"bitmap": {
"type": "roaring",
"compressRunOnSerialization": true
},
"dimensionCompression": "lz4",
"metricCompression": "lz4",
"longEncoding": "longs",
"segmentLoader": null
},
"maxPendingPersists": 0,
"forceGuaranteedRollup": true,
"reportParseExceptions": false,
"pushTimeout": 0,
"segmentWriteOutMediumFactory": null,
"maxNumConcurrentSubTasks": 10,
"maxRetry": 3,
"taskStatusCheckPeriodMs": 1000,
"chatHandlerTimeout": "PT10S",
"chatHandlerNumRetries": 5,
"maxNumSegmentsToMerge": 100,
"totalNumMergeTasks": 10,
"logParseExceptions": false,
"maxParseExceptions": 2147483647,
"maxSavedParseExceptions": 0,
"buildV9Directly": true,
"partitionDimensions": []
}
}
}
```
</details>
Now, run this query:
```sql
select * from sys.segments
where datasource = 'test_s_d_prabh_2'
order by "num_rows" desc
```
You'll notice that the segments are still skewed in both size and number of rows.
<img width="1154" alt="Screen Shot 2020-12-13 at 11 12 36 PM"
src="https://user-images.githubusercontent.com/4270624/102039729-b3745100-3d98-11eb-8152-b3d6ef42765b.png">
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]