I have 400k rows in table A, about 50 bytes each row, now I want to split
all the rows in A and insert into B, which is the same table but
partitioned on the row_id.
I fired off my hive query, but it only generated 1 mapper and 1 reducer,
so it's very slow. what settings can I set to use more mappers and
reducers?
currrently I have
set hive.auto.convert.join.noconditionaltask = false;
set hive.auto.convert.join=false;
set hive.auto.convert.sortmerge.join=false;
set hive.metastore.client.socket.timeout=300;
set hive.exec.parallel=true;
set hive.exec.parallel.thread.number=8;
set hive.exec.reducers.max=20;
-- set mapred.job.queue.name=${hiveconf:QUEUE_NAME};
-- set mapreduce.job.queuename=${hiveconf:QUEUE_NAME};
SET mapred.child.java.opts=-Xmx3G -XX:+UseConcMarkSweepGC;
SET hive.tez.java.opts="-server -Xmx6666m -Djava.net.preferIPv4Stack=true
-XX:NewRatio=8 -XX:+UseNUMA -XX:+UseParallelGC";
SET mapred.child.java.opts=-Xmx2000m;
SET mapreduce.map.java.opts=-server -Xmx5911m
-Djava.net.preferIPv4Stack=true;
SET mapreduce.reduce.java.opts=-server -Xmx3072m
-Djava.net.preferIPv4Stack=true;
SET yarn.app.mapreduce.am.command-opts=-Xmx3058m ;
set mapred.max.split.size=100;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set mapred.map.tasks = 20;
set hive.merge.mapfiles=false;
SET hive.stats.map.parallelism=10;
set mapreduce.map.memory.mb=5678;
the query is
INSERT OVERWRITE TABLE first_level_cluster_mapping PARTITION( cluster_id )
SELECT item_id, title, cluster_id
FROM tmp_cluster_mapping
;