Re: Foreman Parallelizer not working with compressed csv file?

Juergen Kneissl Thu, 23 Jul 2015 09:45:18 -0700

Yes of course:

I add the SQL and the output of EXPLAIN PLAN FOR:


-----------------------------------------

jdbc:drill:schema=dfs> explain plan for SELECT columns[4] stichtag,
columns[10] geschlecht, count(columns[0]) anzahl  FROM
dfs.`/mon_ew_xt_uni_bus_11.csv.gz` where 1 = 1 and columns[23] = 1  and
columns[5] = 'AA' group by columns[4], columns[10];

+------+------+

| text | json |

+------+------+

| 00-00    Screen

00-01      Project(stichtag=[$0], geschlecht=[$1], anzahl=[$2])

00-02        UnionExchange

01-01          HashAgg(group=[{0, 1}], anzahl=[$SUM0($2)])

01-02            Project(stichtag=[$0], geschlecht=[$1], anzahl=[$2])

01-03              HashToRandomExchange(dist0=[[$0]], dist1=[[$1]])

02-01                UnorderedMuxExchange

03-01                  Project(stichtag=[$0], geschlecht=[$1],
anzahl=[$2], E_X_P_R_H_A_S_H_F_I_E_L_D=[castInt(hash64AsDouble($1,
hash64AsDouble($0)))])
03-02                    HashAgg(group=[{0, 1}], anzahl=[COUNT($2)])

03-03                      Project(stichtag=[$0], geschlecht=[$1],
$f2=[$2])
03-04                        SelectionVectorRemover

03-05                          Filter(condition=[AND(true, =($3, 1),
=($4, 'AA'))])

03-06                            Project(ITEM=[ITEM($0, 4)],
ITEM1=[ITEM($0, 10)], ITEM2=[ITEM($0, 0)], ITEM3=[ITEM($0, 23)],
ITEM4=[ITEM($0, 5)])
03-07                              Scan(groupscan=[EasyGroupScan
[selectionRoot=hdfs://master:8020/mon_ew_xt_uni_bus_11.csv.gz,
numFiles=1, columns=[`columns`[4], `columns`[10], `columns`[0],
`columns`[23], `columns`[5]],
files=[hdfs://master:8020/mon_ew_xt_uni_bus_11.csv.gz]]])

 | {

  "head" : {

    "version" : 1,

    "generator" : {

      "type" : "ExplainHandler",

      "info" : ""

    },

    "type" : "APACHE_DRILL_PHYSICAL",

    "options" : [ {

      "name" : "planner.slice_target",

      "kind" : "LONG",

      "type" : "SESSION",

      "num_val" : 10000

    } ],

    "queue" : 0,

    "resultMode" : "EXEC"

  },

  "graph" : [ {

    "pop" : "fs-scan",

    "@id" : 196615,

    "userName" : "drill",

    "files" : [ "hdfs://master:8020/mon_ew_xt_uni_bus_11.csv.gz" ],

    "storage" : {

      "type" : "file",

      "enabled" : true,

      "connection" : "hdfs://master:8020",

      "workspaces" : {

        "root" : {

          "location" : "/srv/hadoop/dfs/data",

          "writable" : true,

          "defaultInputFormat" : null

        },

        "tmp" : {

          "location" : "/tmp",

          "writable" : true,

          "defaultInputFormat" : null

        }

      },

      "formats" : {

        "psv" : {

          "type" : "text",

          "extensions" : [ "tbl" ],

          "delimiter" : "|"

        },

        "csv" : {

          "type" : "text",

          "extensions" : [ "csv" ],

          "delimiter" : ";"

        },

        "tsv" : {

          "type" : "text",

          "extensions" : [ "tsv" ],

          "delimiter" : "\t"

        },

        "parquet" : {

          "type" : "parquet"

        },

        "json" : {

          "type" : "json"

        },

        "avro" : {

          "type" : "avro"

        }

      }

    },

    "format" : {

      "type" : "text",

      "extensions" : [ "csv" ],

      "delimiter" : ";"

    },

    "columns" : [ "`columns`[4]", "`columns`[10]", "`columns`[0]",
"`columns`[23]", "`columns`[5]" ],

    "selectionRoot" : "hdfs://master:8020/mon_ew_xt_uni_bus_11.csv.gz",

    "cost" : 1.4826017E7

  }, {

    "pop" : "project",

    "@id" : 196614,

    "exprs" : [ {

      "ref" : "`ITEM`",

      "expr" : "`columns`[4]"

    }, {

      "ref" : "`ITEM1`",

      "expr" : "`columns`[10]"

    }, {

      "ref" : "`ITEM2`",

      "expr" : "`columns`[0]"

    }, {

      "ref" : "`ITEM3`",

      "expr" : "`columns`[23]"

    }, {

      "ref" : "`ITEM4`",

      "expr" : "`columns`[5]"

    } ],

    "child" : 196615,

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "cost" : 1.4826017E7

  }, {

    "pop" : "filter",

    "@id" : 196613,

    "child" : 196614,

    "expr" : "booleanAnd(true, equal(`ITEM3`, 1) , equal(`ITEM4`, 'AA')
) ",
    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "cost" : 333585.3825

  }, {

    "pop" : "selection-vector-remover",

    "@id" : 196612,

    "child" : 196613,

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "cost" : 333585.3825

  }, {

    "pop" : "project",

    "@id" : 196611,

    "exprs" : [ {

      "ref" : "`stichtag`",

      "expr" : "`ITEM`"

    }, {

      "ref" : "`geschlecht`",

      "expr" : "`ITEM1`"

    }, {

      "ref" : "`$f2`",

      "expr" : "`ITEM2`"

    } ],

    "child" : 196612,

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "cost" : 333585.3825

  }, {

    "pop" : "hash-aggregate",

    "@id" : 196610,

    "child" : 196611,

    "cardinality" : 1.0,

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "groupByExprs" : [ {

      "ref" : "`stichtag`",

      "expr" : "`stichtag`"

    }, {

      "ref" : "`geschlecht`",

      "expr" : "`geschlecht`"

    } ],

    "aggrExprs" : [ {

      "ref" : "`anzahl`",

      "expr" : "count(`$f2`) "

    } ],

    "cost" : 250189.036875

  }, {

    "pop" : "project",

    "@id" : 196609,

    "exprs" : [ {

      "ref" : "`stichtag`",

      "expr" : "`stichtag`"

    }, {

      "ref" : "`geschlecht`",

      "expr" : "`geschlecht`"

    }, {

      "ref" : "`anzahl`",

      "expr" : "`anzahl`"

    }, {

      "ref" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",

      "expr" : "castint(hash64asdouble(`geschlecht`,
hash64asdouble(`stichtag`) ) ) "

    } ],

    "child" : 196610,

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "cost" : 33358.53825

  }, {

    "pop" : "unordered-mux-exchange",

    "@id" : 131073,

    "child" : 196609,

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "cost" : 33358.53825

  }, {

    "pop" : "hash-to-random-exchange",

    "@id" : 65539,

    "child" : 131073,

    "expr" : "`E_X_P_R_H_A_S_H_F_I_E_L_D`",

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "cost" : 33358.53825

  }, {

    "pop" : "project",

    "@id" : 65538,

    "exprs" : [ {

      "ref" : "`stichtag`",

      "expr" : "`stichtag`"

    }, {

      "ref" : "`geschlecht`",

      "expr" : "`geschlecht`"

    }, {

      "ref" : "`anzahl`",

      "expr" : "`anzahl`"

    } ],

    "child" : 65539,

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "cost" : 33358.53825

  }, {

    "pop" : "hash-aggregate",

    "@id" : 65537,

    "child" : 65538,

    "cardinality" : 1.0,

    "initialAllocation" : 1000000,

    "maxAllocation" : 10000000000,

    "groupByExprs" : [ {

      "ref" : "`stichtag`",

      "expr" : "`stichtag`"
    }, {
      "ref" : "`geschlecht`",
      "expr" : "`geschlecht`"
    } ],
    "aggrExprs" : [ {
      "ref" : "`anzahl`",
      "expr" : "$sum0(`anzahl`) "
    } ],
    "cost" : 25018.9036875
  }, {
    "pop" : "union-exchange",
    "@id" : 2,
    "child" : 65537,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 3335.8538249999997
  }, {
    "pop" : "project",
    "@id" : 1,
    "exprs" : [ {
      "ref" : "`stichtag`",
      "expr" : "`stichtag`"
    }, {
      "ref" : "`geschlecht`",
      "expr" : "`geschlecht`"
    }, {
      "ref" : "`anzahl`",
      "expr" : "`anzahl`"
    } ],
    "child" : 2,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 3335.8538249999997
  }, {
    "pop" : "screen",
    "@id" : 0,
    "child" : 1,
    "initialAllocation" : 1000000,
    "maxAllocation" : 10000000000,
    "cost" : 3335.8538249999997
  } ]
} |
+------+------+
1 row selected (0,764 seconds)


On 07/23/15 18:10, Abdel Hakim Deneche wrote:
> Hi Juergen,
> 
> can you share the query you tried to run ?
> 
> Thanks
> 
> On Thu, Jul 23, 2015 at 9:10 AM, Juergen Kneissl <[email protected]> wrote:
> 
>> Hi everybody,
>>
>> I installed and configured a small cluster with two machines (gnu/linux)
>> with the following setup:
>>
>> zookeeper in version 3.4.6 , drill in version 1.1.0 and also using
>> hadoop (version 2.7.1) hdfs as dist. filesystem.
>>
>> So, I am playing around a bit, but what I am still not understanding is
>> why my drill Foreman  bit1 (or whoever that is in the situation) is not
>> "really" parallelizing my request. (or do I expect something from the
>> architecture that is not intended?)
>>
>>
>> I select and aggregate on a 1,4 GB gzipped csv file, and I thought at
>> least part of the query  would be processed on the other drillbit.
>> (bit 2)
>>
>> For instance, in the profiles I see that Major Fragment 01 was divided
>> into four Minor Fragments (of which two were forwarded to bit 2)
>>
>> If I check on the drillbit.log file of the bit2 (in the above
>> configuration) a debug message tells me that the incoming record count
>> is 0?
>>
>> The question is: What am I doing wrong in my configuration? Has it
>> something todo with using a csv file?
>>
>> The query is also set in a way that it is clear the whole file has to be
>> read in memory. That does not concern me that much, now I just wanted to
>> check how the Foreman does the "Parallelization"
>>
>> Best Regards & Thanks for any hint
>>
>>
>> Juergen
>>
> 
> 
>

Re: Foreman Parallelizer not working with compressed csv file?

Reply via email to