Author: gunther
Date: Tue Oct 1 01:47:33 2013
New Revision: 1527855
URL: http://svn.apache.org/r1527855
Log:
HIVE-5389: custom LogicalIOProcessor - map record processor (Thejas M Nair via
Gunther Hagleitner)
Added:
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
Modified:
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
Modified:
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
URL:
http://svn.apache.org/viewvc/hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java?rev=1527855&r1=1527854&r2=1527855&view=diff
==============================================================================
---
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
(original)
+++
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
Tue Oct 1 01:47:33 2013
@@ -227,7 +227,7 @@ public class DagUtils {
byte[] serializedConf = MRHelpers.createUserPayloadFromConf(conf);
if (inputSplitInfo.getNumTasks() != 0) {
map = new Vertex("Map "+seqNo,
- new ProcessorDescriptor(MapProcessor.class.getName()).
+ new ProcessorDescriptor(TezProcessor.class.getName()).
setUserPayload(serializedConf),
inputSplitInfo.getNumTasks(), MRHelpers.getMapResource(conf));
Map<String, String> environment = new HashMap<String, String>();
Added:
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
URL:
http://svn.apache.org/viewvc/hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java?rev=1527855&view=auto
==============================================================================
---
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
(added)
+++
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
Tue Oct 1 01:47:33 2013
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.MapOperator;
+import org.apache.hadoop.hive.ql.exec.MapredContext;
+import org.apache.hadoop.hive.ql.exec.ObjectCache;
+import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.reportStats;
+import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
+import org.apache.hadoop.hive.ql.plan.MapWork;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.tez.mapreduce.input.MRInput;
+import org.apache.tez.mapreduce.processor.MRTaskReporter;
+import org.apache.tez.runtime.api.LogicalInput;
+import org.apache.tez.runtime.library.api.KeyValueReader;
+
+/**
+ * Process input from tez LogicalInput and write output - for a map plan
+ * Just pump the records through the query plan.
+ */
+public class MapRecordProcessor extends RecordProcessor{
+
+  // key under which the map plan is stored in the object cache
+  private static final String PLAN_KEY = "__MAP_PLAN__";
+  private MapOperator mapOp;
+  public static final Log l4j = LogFactory.getLog(MapRecordProcessor.class);
+  private final ExecMapperContext execContext = new ExecMapperContext();
+  // set on the first failure; handed to the operator tree when it is closed
+  private boolean abort = false;
+
+  /**
+   * Builds and initializes the map operator tree.
+   * The MapWork plan is looked up in the object cache first; on a miss it is
+   * obtained through Utilities.getMapWork and stored in the cache.
+   *
+   * @param jconf job configuration
+   * @param mrReporter reporter handed to the map operator and the MapredContext
+   * @param inputs logical inputs of this task
+   * @param out collector the map operator writes to
+   * @throws RuntimeException wrapping any failure during initialization
+   *         (an OutOfMemoryError is rethrown unwrapped)
+   */
+  @Override
+  void init(JobConf jconf, MRTaskReporter mrReporter, Collection<LogicalInput> inputs,
+      OutputCollector out){
+    super.init(jconf, mrReporter, inputs, out);
+
+    ObjectCache cache = ObjectCacheFactory.getCache(jconf);
+    try {
+
+      execContext.setJc(jconf);
+      // create map and fetch operators
+      MapWork mrwork = (MapWork) cache.retrieve(PLAN_KEY);
+      if (mrwork == null) {
+        mrwork = Utilities.getMapWork(jconf);
+        cache.cache(PLAN_KEY, mrwork);
+      }
+      mapOp = new MapOperator();
+
+      // initialize map operator
+      mapOp.setConf(mrwork);
+      mapOp.setChildren(jconf);
+      l4j.info(mapOp.dump(0));
+
+      MapredContext.init(true, new JobConf(jconf));
+      mapOp.setExecContext(execContext);
+      mapOp.initializeLocalWork(jconf);
+      mapOp.initialize(jconf, null);
+
+      mapOp.setOutputCollector(out);
+      mapOp.setReporter(reporter);
+      MapredContext.get().setReporter(reporter);
+
+    } catch (Throwable e) {
+      abort = true;
+      if (e instanceof OutOfMemoryError) {
+        // will this be true here?
+        // Don't create a new object if we are already out of memory
+        throw (OutOfMemoryError) e;
+      } else {
+        throw new RuntimeException("Map operator initialization failed", e);
+      }
+    }
+  }
+
+  /**
+   * Reads every record from the single MRInput and pushes its value through
+   * the map operator tree, stopping early once the tree reports it is done.
+   *
+   * @throws IOException if reading from the input fails
+   * @throws IllegalArgumentException if the task does not have exactly one input
+   */
+  @Override
+  void run() throws IOException{
+    if (inputs.size() != 1) {
+      throw new IllegalArgumentException("MapRecordProcessor expects single input"
+          + ", inputCount=" + inputs.size());
+    }
+
+    MRInput in = (MRInput)inputs.iterator().next();
+    KeyValueReader reader = in.getReader();
+
+    //process records until done
+    while(reader.next()){
+      //ignore the key for maps - reader.getCurrentKey();
+      Object value = reader.getCurrentValue();
+      boolean needMore = processRow(value);
+      if(!needMore){
+        break;
+      }
+    }
+  }
+
+
+  /**
+   * @param value value to process
+   * @return true if it is not done and can take more inputs
+   */
+  private boolean processRow(Object value) {
+    // reset the execContext for each new row
+    execContext.resetRow();
+
+    try {
+      if (mapOp.getDone()) {
+        return false; //done
+      } else {
+        // Since there is no concept of a group, we don't invoke
+        // startGroup/endGroup for a mapper
+        mapOp.process((Writable)value);
+        if (isLogInfoEnabled) {
+          logProgress();
+        }
+      }
+    } catch (Throwable e) {
+      abort = true;
+      if (e instanceof OutOfMemoryError) {
+        // Don't create a new object if we are already out of memory
+        throw (OutOfMemoryError) e;
+      } else {
+        l4j.fatal(StringUtils.stringifyException(e));
+        throw new RuntimeException(e);
+      }
+    }
+    return true; //give me more
+  }
+
+  /**
+   * Closes the operator tree, reports operator statistics and releases the
+   * MapredContext. An error while closing fails the task unless the task is
+   * already aborting, in which case it is only logged so that it does not
+   * mask the original failure.
+   */
+  @Override
+  void close(){
+    // check if there are IOExceptions
+    if (!abort) {
+      abort = execContext.getIoCxt().getIOExceptions();
+    }
+
+    // detecting failed executions by exceptions thrown by the operator tree
+    try {
+      if (mapOp == null) {
+        // init() never got as far as creating the operator tree, so there is
+        // nothing to close; avoid tripping over a NullPointerException
+        l4j.warn("Map operator was never created - skipping operator close");
+        return;
+      }
+      mapOp.close(abort);
+      if (isLogInfoEnabled) {
+        logCloseInfo();
+      }
+      reportStats rps = new reportStats(reporter);
+      mapOp.preorderMap(rps);
+    } catch (Exception e) {
+      if (!abort) {
+        // signal new failure to map-reduce
+        l4j.error("Hit error while closing operators - failing tree");
+        throw new RuntimeException("Hive Runtime Error while closing operators", e);
+      }
+      // the task has already failed; keep the first failure as the primary
+      // error but leave a trace of this one instead of dropping it silently
+      l4j.warn("Ignoring error while closing operators of an aborted task", e);
+    } finally {
+      MapredContext.close();
+    }
+  }
+
+}
Added:
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
URL:
http://svn.apache.org/viewvc/hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java?rev=1527855&view=auto
==============================================================================
---
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
(added)
+++
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
Tue Oct 1 01:47:33 2013
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+import java.io.IOException;
+import java.lang.management.ManagementFactory;
+import java.lang.management.MemoryMXBean;
+import java.net.URLClassLoader;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.tez.mapreduce.processor.MRTaskReporter;
+import org.apache.tez.runtime.api.LogicalInput;
+
+/**
+ * Process input from tez LogicalInput and write output
+ * It has different subclasses for map and reduce processing
+ */
+public abstract class RecordProcessor {
+
+  protected JobConf jconf;
+  protected Collection<LogicalInput> inputs;
+  protected OutputCollector out;
+
+  public static final Log l4j = LogFactory.getLog(RecordProcessor.class);
+
+
+  // memory bean consulted whenever progress or close information is logged
+  public static MemoryMXBean memoryMXBean;
+  protected boolean isLogInfoEnabled = false;
+  protected MRTaskReporter reporter;
+
+  // rows counted so far by logProgress()
+  private long numRows = 0;
+  // row count at which the next progress line is written
+  private long nextUpdateCntr = 1;
+
+
+  /**
+   * Shared set-up for all record processors: stores the arguments, obtains the
+   * memory bean, and logs the maximum heap size and the classpaths in use.
+   * @param jconf job configuration
+   * @param mrReporter reporter kept for use by subclasses
+   * @param inputs logical inputs of this task
+   * @param out collector that receives the output
+   */
+  void init(JobConf jconf, MRTaskReporter mrReporter, Collection<LogicalInput> inputs,
+      OutputCollector out){
+    this.jconf = jconf;
+    this.reporter = mrReporter;
+    this.inputs = inputs;
+    this.out = out;
+
+    // grab the bean up front so later logging calls can rely on it
+    memoryMXBean = ManagementFactory.getMemoryMXBean();
+    long maxHeap = memoryMXBean.getHeapMemoryUsage().getMax();
+    l4j.info("maximum memory = " + maxHeap);
+
+    isLogInfoEnabled = l4j.isInfoEnabled();
+
+    // classpath logging is best effort: any failure is reduced to an info line
+    try {
+      URLClassLoader confLoader = (URLClassLoader) jconf.getClassLoader();
+      l4j.info("conf classpath = " + Arrays.asList(confLoader.getURLs()));
+
+      URLClassLoader threadLoader =
+          (URLClassLoader) Thread.currentThread().getContextClassLoader();
+      l4j.info("thread classpath = " + Arrays.asList(threadLoader.getURLs()));
+    } catch (Exception e) {
+      l4j.info("cannot get classpath: " + e.getMessage());
+    }
+  }
+
+  /**
+   * Consume the inputs and produce the output.
+   * @throws IOException
+   */
+  abstract void run() throws IOException;
+
+
+  abstract void close();
+
+  /**
+   * Writes the final row count and the heap currently in use to the log.
+   */
+  protected void logCloseInfo() {
+    long heapInUse = memoryMXBean.getHeapMemoryUsage().getUsed();
+    l4j.info("ExecMapper: processed " + numRows + " rows: used memory = "
+        + heapInUse);
+  }
+
+  /**
+   * Counts one more row and, whenever the count reaches the next threshold,
+   * logs the count together with the heap currently in use.
+   */
+  protected void logProgress() {
+    numRows++;
+    if (numRows != nextUpdateCntr) {
+      return;
+    }
+    long heapInUse = memoryMXBean.getHeapMemoryUsage().getUsed();
+    l4j.info("ExecMapper: processing " + numRows
+        + " rows: used memory = " + heapInUse);
+    nextUpdateCntr = getNextUpdateRecordCounter(numRows);
+  }
+
+  /**
+   * Computes the next logging threshold: ten times the current count until one
+   * million is reached, and one million further from then on.
+   */
+  private long getNextUpdateRecordCounter(long cntr) {
+    return cntr >= 1000000 ? cntr + 1000000 : 10 * cntr;
+  }
+
+}
Added:
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
URL:
http://svn.apache.org/viewvc/hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java?rev=1527855&view=auto
==============================================================================
---
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
(added)
+++
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
Tue Oct 1 01:47:33 2013
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.tez.common.TezUtils;
+import org.apache.tez.mapreduce.input.MRInput;
+import org.apache.tez.mapreduce.processor.MRTaskReporter;
+import org.apache.tez.runtime.api.Event;
+import org.apache.tez.runtime.api.LogicalIOProcessor;
+import org.apache.tez.runtime.api.LogicalInput;
+import org.apache.tez.runtime.api.LogicalOutput;
+import org.apache.tez.runtime.api.TezProcessorContext;
+import org.apache.tez.runtime.library.api.KeyValueWriter;
+
+/**
+ * Hive processor for Tez that forms the vertices in Tez and processes the data.
+ * Does what ExecMapper and ExecReducer do for Hive in the MR framework.
+ */
+public class TezProcessor implements LogicalIOProcessor {
+  private static final Log LOG = LogFactory.getLog(TezProcessor.class);
+
+  // always true for now: only the map side is implemented
+  boolean isMap;
+  RecordProcessor rproc = null;
+
+  private JobConf jobConf;
+
+  private TezProcessorContext processorContext;
+
+  public TezProcessor() {
+    this.isMap = true;
+  }
+
+  /**
+   * Closes the record processor, if run() got as far as creating one.
+   */
+  @Override
+  public void close() throws IOException {
+    if(rproc != null){
+      rproc.close();
+    }
+  }
+
+  @Override
+  public void handleEvents(List<Event> arg0) {
+    //this is not called by tez, so nothing to be done here
+  }
+
+  /**
+   * Keeps the processor context and rebuilds the job configuration from the
+   * user payload attached to the vertex.
+   *
+   * @param processorContext context supplied by Tez
+   * @throws IOException if the payload cannot be turned into a configuration
+   */
+  @Override
+  public void initialize(TezProcessorContext processorContext)
+      throws IOException {
+    this.processorContext = processorContext;
+    //get the jobconf
+    byte[] userPayload = processorContext.getUserPayload();
+    Configuration conf = TezUtils.createConfFromUserPayload(userPayload);
+    this.jobConf = new JobConf(conf);
+  }
+
+  /**
+   * Runs the map plan: merges the configuration updates published by the
+   * MRInput into the job configuration, wraps the output writer in an
+   * OutputCollector and drives a MapRecordProcessor over the input.
+   *
+   * @param inputs logical inputs by name; exactly one is supported
+   * @param outputs logical outputs by name; exactly one is supported
+   * @throws IOException if the number of inputs or outputs is not supported
+   * @throws UnsupportedOperationException if this is not a map task
+   */
+  @Override
+  public void run(Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs)
+      throws Exception {
+    // in case of broadcast-join read the broadcast edge inputs
+    // (possibly asynchronously)
+
+    LOG.info("Running map: " + processorContext.getUniqueIdentifier());
+
+    //this will change when TezProcessor has support for shuffle joins and broadcast joins
+    if (inputs.size() != 1){
+      throw new IOException("Cannot handle multiple inputs "
+          + " inputCount=" + inputs.size());
+    }
+
+    if(outputs.size() > 1) {
+      throw new IOException("Cannot handle more than one output"
+          + ", outputCount=" + outputs.size());
+    }
+    // an empty output map used to surface as a bare NoSuchElementException
+    // from the iterator below; fail with a descriptive error instead
+    if (outputs.isEmpty()) {
+      throw new IOException("Cannot run without an output"
+          + ", outputCount=" + outputs.size());
+    }
+    LogicalInput in = inputs.values().iterator().next();
+    LogicalOutput out = outputs.values().iterator().next();
+
+    MRInput input = (MRInput)in;
+
+    //update config
+    Configuration updatedConf = input.getConfigUpdates();
+    if (updatedConf != null) {
+      for (Entry<String, String> entry : updatedConf) {
+        this.jobConf.set(entry.getKey(), entry.getValue());
+      }
+    }
+
+    KeyValueWriter kvWriter = (KeyValueWriter)out.getWriter();
+    OutputCollector collector = new KVOutputCollector(kvWriter);
+
+    if(isMap){
+      rproc = new MapRecordProcessor();
+    }
+    else{
+      throw new UnsupportedOperationException("Reduce is yet to be implemented");
+    }
+
+    MRTaskReporter mrReporter = new MRTaskReporter(processorContext);
+    rproc.init(jobConf, mrReporter, inputs.values(), collector);
+    rproc.run();
+
+    //done - output does not need to be committed as hive does not use outputcommitter
+  }
+
+  /**
+   * KVOutputCollector. OutputCollector that writes using KVWriter
+   *
+   */
+  static class KVOutputCollector implements OutputCollector {
+    private final KeyValueWriter output;
+
+    KVOutputCollector(KeyValueWriter output) {
+      this.output = output;
+    }
+
+    @Override
+    public void collect(Object key, Object value) throws IOException {
+      output.write(key, value);
+    }
+  }
+
+}