mihai-cazacu-adswizz opened a new issue #7411: [materialized view] Too many Hadoop jobs doing the same thing
URL: https://github.com/apache/incubator-druid/issues/7411
 
 
   ### Affected Version
   
   `0.13.0-incubating`
   
   ### Description
   
   I have created this MV specification:
   
   ```
   {
     "type": "derivativeDataSource",
     "baseDataSource": "the-base-data-source",
     "dimensionsSpec": {
       "dimensions": [
                ...
       ],
       "dimensionExclusions": []
     },
     "metricsSpec": [
       ...
     ],
     "tuningConfig": {
       "type": "hadoop",
       "workingPath": null,
       "version": "2019-03-20T06:44:16.593Z",
       "partitionsSpec": {
         "type": "hashed",
         "targetPartitionSize": -1,
         "maxPartitionSize": -1,
         "assumeGrouped": false,
         "numShards": -1,
         "partitionDimensions": []
       },
       "shardSpecs": {},
       "indexSpec": {
         "bitmap": {
           "type": "concise"
         },
         "dimensionCompression": "lz4",
         "metricCompression": "lz4",
         "longEncoding": "longs"
       },
       "maxRowsInMemory": 1000000,
       "maxBytesInMemory": 0,
       "leaveIntermediate": false,
       "cleanupOnFailure": true,
       "overwriteFiles": false,
       "ignoreInvalidRows": false,
       "jobProperties": {
         "fs.s3.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
         "fs.s3n.impl": "org.apache.hadoop.fs.s3native.NativeS3FileSystem",
         "io.compression.codecs": 
"org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.SnappyCodec",
         "mapreduce.job.classloader": "true",
         "mapreduce.job.queuename": "mv"
       },
       "combineText": false,
       "useCombiner": false,
       "buildV9Directly": true,
       "numBackgroundPersistThreads": 0,
       "forceExtendableShardSpecs": false,
       "useExplicitVersion": false,
       "allowedHadoopPrefix": [],
       "logParseExceptions": false,
       "maxParseExceptions": 0
     },
     "dataSource": "the-data-source",
     "hadoopCoordinates": null,
     "hadoopDependencyCoordinates": [
       "org.apache.hadoop:hadoop-client:2.7.3"
     ],
     "classpathPrefix": null,
     "context": {
       "maxTaskCount": 1,
       "minDataLagMs": 172800000
     },
     "suspended": false
   }
   ```
   for a data source like this one:
   
   ```
   {
     "type": "kafka",
     "dataSchema": {
       "dataSource": "the-data-source",
       "parser": {
         "type": "string",
         "parseSpec": {
           "format": "json",
           "timestampSpec": {
             "column": "timestamp",
             "format": "auto"
           },
           "dimensionsSpec": {
             "dimensions": [],
             "dimensionExclusions": [
               ...
             ],
             "spatialDimensions": []
           }
         }
       },
       "metricsSpec": [
         ...
       ],
       "granularitySpec": {
         "type": "uniform",
         "segmentGranularity": "HOUR",
         "queryGranularity": "HOUR",
         "rollup": true,
         "intervals": null
       },
       "transformSpec": {
         "filter": null,
         "transforms": []
       }
     },
     "tuningConfig": {
       "type": "kafka",
       "maxRowsInMemory": 40000,
       "maxBytesInMemory": 0,
       "maxRowsPerSegment": 5000000,
       "maxTotalRows": null,
       "intermediatePersistPeriod": "PT10M",
       "basePersistDirectory": "/mnt/druid/tmp/1554352333133-0",
       "maxPendingPersists": 0,
       "indexSpec": {
         "bitmap": {
           "type": "concise"
         },
         "dimensionCompression": "lz4",
         "metricCompression": "lz4",
         "longEncoding": "longs"
       },
       "buildV9Directly": true,
       "reportParseExceptions": false,
       "handoffConditionTimeout": 0,
       "resetOffsetAutomatically": true,
       "segmentWriteOutMediumFactory": null,
       "workerThreads": null,
       "chatThreads": null,
       "chatRetries": 20,
       "httpTimeout": "PT60S",
       "shutdownTimeout": "PT180S",
       "offsetFetchPeriod": "PT30S",
       "intermediateHandoffPeriod": "P2147483647D",
       "logParseExceptions": false,
       "maxParseExceptions": 2147483647,
       "maxSavedParseExceptions": 0
     },
     "ioConfig": {
       "topic": "the-topic",
       "replicas": 1,
       "taskCount": 4,
       "taskDuration": "PT3960S",
       "consumerProperties": {
         "bootstrap.servers": "kafka-test-broker.host.com:9092",
         "group.id": "IngestionDruid",
         "auto.offset.reset": "latest",
         "max.partition.fetch.bytes": 4000000
       },
       "startDelay": "PT0S",
       "period": "PT30S",
       "useEarliestOffset": false,
       "completionTimeout": "PT1800S",
       "lateMessageRejectionPeriod": null,
       "earlyMessageRejectionPeriod": null,
       "skipOffsetGaps": false
     },
     "context": null,
     "suspended": false
   }
   ```
    and, once the view was created, Hadoop jobs were being started every ~30 seconds (each running for ~30 seconds), all repeating the same work:
   
    ```
    http://druid-test-master.extc.adswizz.com:8090/druid/indexer/v1/task/index_materialized_view_mv_order-and-campaign_group-uat_2019-04-04T00%3A01%3A56.850Z/log
        Segment 0/1 for dataSource[meru-group-uat-druid] has identifier[meru-group-uat-druid_2019-03-12T11:00:00.000Z_2019-03-12T12:00:00.000Z_2019-03-14T13:06:36.212Z], interval[2019-03-12T11:00:00.000Z/2019-03-12T12:00:00.000Z]

        2019-04-04T00:02:14,093 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Running job: job_1553777733785_5208
        2019-04-04T00:02:22,211 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Job job_1553777733785_5208 running in uber mode : false
        2019-04-04T00:02:22,212 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 0% reduce 0%
        2019-04-04T00:02:37,511 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 23% reduce 0%
        2019-04-04T00:02:40,533 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 50% reduce 0%
        2019-04-04T00:02:42,562 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 0%
        2019-04-04T00:02:55,631 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 71%
        2019-04-04T00:02:58,646 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 90%
        2019-04-04T00:03:01,660 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 100%
        2019-04-04T00:03:02,673 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Job job_1553777733785_5208 completed successfully

    http://druid-test-master.extc.adswizz.com:8090/druid/indexer/v1/task/index_materialized_view_mv_order-and-campaign_group-uat_2019-04-04T00%3A03%3A56.874Z/log
        Segment 0/1 for dataSource[meru-group-uat-druid] has identifier[meru-group-uat-druid_2019-03-12T11:00:00.000Z_2019-03-12T12:00:00.000Z_2019-03-14T13:06:36.212Z], interval[2019-03-12T11:00:00.000Z/2019-03-12T12:00:00.000Z]

        2019-04-04T00:04:14,150 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Running job: job_1553777733785_5211
        2019-04-04T00:04:21,254 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Job job_1553777733785_5211 running in uber mode : false
        2019-04-04T00:04:21,255 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 0% reduce 0%
        2019-04-04T00:04:36,697 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 20% reduce 0%
        2019-04-04T00:04:39,714 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 46% reduce 0%
        2019-04-04T00:04:42,748 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 0%
        2019-04-04T00:04:54,802 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 69%
        2019-04-04T00:04:57,815 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 82%
        2019-04-04T00:05:00,827 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 97%
        2019-04-04T00:05:02,836 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 100%
        2019-04-04T00:05:03,847 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Job job_1553777733785_5211 completed successfully

    http://druid-test-master.extc.adswizz.com:8090/druid/indexer/v1/task/index_materialized_view_mv_order-and-campaign_group-uat_2019-04-04T00%3A05%3A56.898Z/log
        Segment 0/1 for dataSource[meru-group-uat-druid] has identifier[meru-group-uat-druid_2019-03-12T11:00:00.000Z_2019-03-12T12:00:00.000Z_2019-03-14T13:06:36.212Z], interval[2019-03-12T11:00:00.000Z/2019-03-12T12:00:00.000Z]

        2019-04-04T00:06:13,814 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Running job: job_1553777733785_5214
        2019-04-04T00:06:20,947 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Job job_1553777733785_5214 running in uber mode : false
        2019-04-04T00:06:20,948 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 0% reduce 0%
        2019-04-04T00:06:36,844 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 19% reduce 0%
        2019-04-04T00:06:39,866 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 43% reduce 0%
        2019-04-04T00:06:42,904 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 0%
        2019-04-04T00:06:54,967 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 68%
        2019-04-04T00:06:57,984 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 83%
        2019-04-04T00:07:00,998 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 100%
        2019-04-04T00:07:03,016 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Job job_1553777733785_5214 completed successfully

    http://druid-test-master.extc.adswizz.com:8090/druid/indexer/v1/task/index_materialized_view_mv_order-and-campaign_group-uat_2019-04-04T00%3A07%3A56.930Z/log
        Segment 0/1 for dataSource[meru-group-uat-druid] has identifier[meru-group-uat-druid_2019-03-12T11:00:00.000Z_2019-03-12T12:00:00.000Z_2019-03-14T13:06:36.212Z], interval[2019-03-12T11:00:00.000Z/2019-03-12T12:00:00.000Z]

        2019-04-04T00:08:14,401 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Running job: job_1553777733785_5217
        2019-04-04T00:08:22,494 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Job job_1553777733785_5217 running in uber mode : false
        2019-04-04T00:08:22,495 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 0% reduce 0%
        2019-04-04T00:08:37,777 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 18% reduce 0%
        2019-04-04T00:08:40,794 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 41% reduce 0%
        2019-04-04T00:08:43,811 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 67% reduce 0%
        2019-04-04T00:08:44,826 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 0%
        2019-04-04T00:08:56,885 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 74%
        2019-04-04T00:08:59,899 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 93%
        2019-04-04T00:09:02,915 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job -  map 100% reduce 100%
        2019-04-04T00:09:02,924 INFO [task-runner-0-priority-0] org.apache.hadoop.mapreduce.Job - Job job_1553777733785_5217 completed successfully
    ```
   
    Is this the expected behavior of the materialized-view supervisor? I don't remember seeing this kind of job for other materialized views.
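
    To quantify the repetition, here is a minimal sketch of how I could count the completed materialized-view tasks on the Overlord, grouped by task-id prefix (assumptions on my side: the standard Overlord `/druid/indexer/v1/completeTasks` endpoint and the Python `requests` library; the URL below is just my test Overlord):

    ```
    # Minimal sketch: count completed materialized-view tasks per task-id prefix
    # to see how often the same derivative view is being re-indexed.
    # Assumptions: the Overlord's /druid/indexer/v1/completeTasks endpoint and
    # the Python `requests` library; the URL below is my test Overlord host.
    from collections import Counter

    import requests

    OVERLORD = "http://druid-test-master.extc.adswizz.com:8090"

    resp = requests.get(f"{OVERLORD}/druid/indexer/v1/completeTasks")
    resp.raise_for_status()

    counts = Counter()
    for task in resp.json():
        task_id = task["id"]
        if task_id.startswith("index_materialized_view_"):
            # Drop the trailing creation timestamp so repeated runs of the same
            # derivative view are grouped together.
            counts[task_id.rsplit("_", 1)[0]] += 1

    for prefix, n in counts.most_common():
        print(f"{n:4d}  {prefix}")
    ```

    If this shows many tasks for the same `index_materialized_view_mv_order-and-campaign_group-uat` prefix even though the base data source has not received new data for that interval, it matches the repetition visible in the logs above.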
   
    @zhangxinyu1: Could you please help me with this?
   
   Thank you!
