Author: gunther
Date: Tue Oct  1 01:47:33 2013
New Revision: 1527855

URL: http://svn.apache.org/r1527855
Log:
HIVE-5389: custom LogicalIOProcessor - map record processor (Thejas M Nair via 
Gunther Hagleitner)

Added:
    
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
    
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
    
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
Modified:
    
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java

Modified: 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java
URL: 
http://svn.apache.org/viewvc/hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java?rev=1527855&r1=1527854&r2=1527855&view=diff
==============================================================================
--- 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java 
(original)
+++ 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/DagUtils.java 
Tue Oct  1 01:47:33 2013
@@ -227,7 +227,7 @@ public class DagUtils {
     byte[] serializedConf = MRHelpers.createUserPayloadFromConf(conf);
     if (inputSplitInfo.getNumTasks() != 0) {
       map = new Vertex("Map "+seqNo,
-          new ProcessorDescriptor(MapProcessor.class.getName()).
+          new ProcessorDescriptor(TezProcessor.class.getName()).
                setUserPayload(serializedConf),
           inputSplitInfo.getNumTasks(), MRHelpers.getMapResource(conf));
       Map<String, String> environment = new HashMap<String, String>();

Added: 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
URL: 
http://svn.apache.org/viewvc/hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java?rev=1527855&view=auto
==============================================================================
--- 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
 (added)
+++ 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/MapRecordProcessor.java
 Tue Oct  1 01:47:33 2013
@@ -0,0 +1,178 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+
+import java.io.IOException;
+import java.util.Collection;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.ql.exec.MapOperator;
+import org.apache.hadoop.hive.ql.exec.MapredContext;
+import org.apache.hadoop.hive.ql.exec.ObjectCache;
+import org.apache.hadoop.hive.ql.exec.ObjectCacheFactory;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.exec.mr.ExecMapper.reportStats;
+import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
+import org.apache.hadoop.hive.ql.plan.MapWork;
+import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.hadoop.util.StringUtils;
+import org.apache.tez.mapreduce.input.MRInput;
+import org.apache.tez.mapreduce.processor.MRTaskReporter;
+import org.apache.tez.runtime.api.LogicalInput;
+import org.apache.tez.runtime.library.api.KeyValueReader;
+
+/**
+ * Process input from tez LogicalInput and write output - for a map plan
+ * Just pump the records through the query plan.
+ */
+public class MapRecordProcessor  extends RecordProcessor{
+
+  private static final String PLAN_KEY = "__MAP_PLAN__";
+  private MapOperator mapOp;
+  public static final Log l4j = LogFactory.getLog(MapRecordProcessor.class);
+  private final ExecMapperContext execContext = new ExecMapperContext();
+  private boolean abort = false;
+
+  @Override
+  void init(JobConf jconf, MRTaskReporter mrReporter, Collection<LogicalInput> 
inputs,
+      OutputCollector out){
+    super.init(jconf, mrReporter, inputs, out);
+
+    ObjectCache cache = ObjectCacheFactory.getCache(jconf);
+    try {
+
+      execContext.setJc(jconf);
+      // create map and fetch operators
+      MapWork mrwork = (MapWork) cache.retrieve(PLAN_KEY);
+      if (mrwork == null) {
+        mrwork = Utilities.getMapWork(jconf);
+        cache.cache(PLAN_KEY, mrwork);
+      }
+      mapOp = new MapOperator();
+
+      // initialize map operator
+      mapOp.setConf(mrwork);
+      mapOp.setChildren(jconf);
+      l4j.info(mapOp.dump(0));
+
+      MapredContext.init(true, new JobConf(jconf));
+      mapOp.setExecContext(execContext);
+      mapOp.initializeLocalWork(jconf);
+      mapOp.initialize(jconf, null);
+
+      mapOp.setOutputCollector(out);
+      mapOp.setReporter(reporter);
+      MapredContext.get().setReporter(reporter);
+
+    } catch (Throwable e) {
+      abort = true;
+      if (e instanceof OutOfMemoryError) {
+        // will this be true here?
+        // Don't create a new object if we are already out of memory
+        throw (OutOfMemoryError) e;
+      } else {
+        throw new RuntimeException("Map operator initialization failed", e);
+      }
+    }
+  }
+
+  @Override
+  void run() throws IOException{
+    if (inputs.size() != 1) {
+      throw new IllegalArgumentException("MapRecordProcessor expects single 
input"
+          + ", inputCount=" + inputs.size());
+    }
+
+    MRInput in = (MRInput)inputs.iterator().next();
+    KeyValueReader reader = in.getReader();
+
+    //process records until done
+    while(reader.next()){
+      //ignore the key for maps -  reader.getCurrentKey();
+      Object value = reader.getCurrentValue();
+      boolean needMore = processRow(value);
+      if(!needMore){
+        break;
+      }
+    }
+  }
+
+
+  /**
+   * @param value  value to process
+   * @return true if it is not done and can take more inputs
+   */
+  private boolean processRow(Object value) {
+    // reset the execContext for each new row
+    execContext.resetRow();
+
+    try {
+      if (mapOp.getDone()) {
+        return false; //done
+      } else {
+        // Since there is no concept of a group, we don't invoke
+        // startGroup/endGroup for a mapper
+        mapOp.process((Writable)value);
+        if (isLogInfoEnabled) {
+          logProgress();
+        }
+      }
+    } catch (Throwable e) {
+      abort = true;
+      if (e instanceof OutOfMemoryError) {
+        // Don't create a new object if we are already out of memory
+        throw (OutOfMemoryError) e;
+      } else {
+        l4j.fatal(StringUtils.stringifyException(e));
+        throw new RuntimeException(e);
+      }
+    }
+    return true; //give me more
+  }
+
+  @Override
+  void close(){
+    // check if there are IOExceptions
+    if (!abort) {
+      abort = execContext.getIoCxt().getIOExceptions();
+    }
+
+    // detecting failed executions by exceptions thrown by the operator tree
+    try {
+      mapOp.close(abort);
+      if (isLogInfoEnabled) {
+        logCloseInfo();
+      }
+      reportStats rps = new reportStats(reporter);
+      mapOp.preorderMap(rps);
+      return;
+    } catch (Exception e) {
+      if (!abort) {
+        // signal new failure to map-reduce
+        l4j.error("Hit error while closing operators - failing tree");
+        throw new RuntimeException("Hive Runtime Error while closing 
operators", e);
+      }
+    } finally {
+      MapredContext.close();
+    }
+  }
+
+}

Added: 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
URL: 
http://svn.apache.org/viewvc/hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java?rev=1527855&view=auto
==============================================================================
--- 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
 (added)
+++ 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/RecordProcessor.java
 Tue Oct  1 01:47:33 2013
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+import java.io.IOException;
+import java.lang.management.ManagementFactory;
+import java.lang.management.MemoryMXBean;
+import java.net.URLClassLoader;
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.tez.mapreduce.processor.MRTaskReporter;
+import org.apache.tez.runtime.api.LogicalInput;
+
+/**
+ * Process input from tez LogicalInput and write output
+ * It has different subclasses for map and reduce processing
+ */
+public abstract class RecordProcessor  {
+
+  protected JobConf jconf;
+  protected Collection<LogicalInput> inputs;
+  protected OutputCollector out;
+
+  public static final Log l4j = LogFactory.getLog(RecordProcessor.class);
+
+
+  // used to log memory usage periodically
+  public static MemoryMXBean memoryMXBean;
+  protected boolean isLogInfoEnabled = false;
+  protected MRTaskReporter reporter;
+
+  private long numRows = 0;
+  private long nextUpdateCntr = 1;
+
+
+  /**
+   * Common initialization code for RecordProcessors
+   * @param jconf
+   * @param mrReporter
+   * @param inputs
+   * @param out
+   */
+  void init(JobConf jconf, MRTaskReporter mrReporter, Collection<LogicalInput> 
inputs,
+      OutputCollector out){
+    this.jconf = jconf;
+    this.reporter = mrReporter;
+    this.inputs = inputs;
+    this.out = out;
+
+    // Allocate the bean at the beginning -
+    memoryMXBean = ManagementFactory.getMemoryMXBean();
+
+
+    l4j.info("maximum memory = " + memoryMXBean.getHeapMemoryUsage().getMax());
+
+    isLogInfoEnabled = l4j.isInfoEnabled();
+
+    //log classpaths
+    try {
+      l4j.info("conf classpath = "
+          + Arrays.asList(((URLClassLoader) 
jconf.getClassLoader()).getURLs()));
+      l4j.info("thread classpath = "
+          + Arrays.asList(((URLClassLoader) Thread.currentThread()
+          .getContextClassLoader()).getURLs()));
+    } catch (Exception e) {
+      l4j.info("cannot get classpath: " + e.getMessage());
+    }
+
+  }
+
+  /**
+   * start processing the inputs and writing output
+   * @throws IOException
+   */
+  abstract void run() throws IOException;
+
+
+  abstract void close();
+
+  /**
+   * Log information to be logged at the end
+   */
+  protected void logCloseInfo() {
+    long used_memory = memoryMXBean.getHeapMemoryUsage().getUsed();
+    l4j.info("ExecMapper: processed " + numRows + " rows: used memory = "
+        + used_memory);
+  }
+
+  /**
+   * Log number of records processed and memory used after processing many 
records
+   */
+  protected void logProgress() {
+    numRows++;
+    if (numRows == nextUpdateCntr) {
+      long used_memory = memoryMXBean.getHeapMemoryUsage().getUsed();
+      l4j.info("ExecMapper: processing " + numRows
+          + " rows: used memory = " + used_memory);
+      nextUpdateCntr = getNextUpdateRecordCounter(numRows);
+    }
+  }
+
+  private long getNextUpdateRecordCounter(long cntr) {
+    // A very simple counter to keep track of number of rows processed by the
+    // reducer. It dumps
+    // every 1 million times, and quickly before that
+    if (cntr >= 1000000) {
+      return cntr + 1000000;
+    }
+
+    return 10 * cntr;
+  }
+
+}

Added: 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
URL: 
http://svn.apache.org/viewvc/hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java?rev=1527855&view=auto
==============================================================================
--- 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
 (added)
+++ 
hive/branches/tez/ql/src/java/org/apache/hadoop/hive/ql/exec/tez/TezProcessor.java
 Tue Oct  1 01:47:33 2013
@@ -0,0 +1,143 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.exec.tez;
+import java.io.IOException;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.mapred.JobConf;
+import org.apache.hadoop.mapred.OutputCollector;
+import org.apache.tez.common.TezUtils;
+import org.apache.tez.mapreduce.input.MRInput;
+import org.apache.tez.mapreduce.processor.MRTaskReporter;
+import org.apache.tez.runtime.api.Event;
+import org.apache.tez.runtime.api.LogicalIOProcessor;
+import org.apache.tez.runtime.api.LogicalInput;
+import org.apache.tez.runtime.api.LogicalOutput;
+import org.apache.tez.runtime.api.TezProcessorContext;
+import org.apache.tez.runtime.library.api.KeyValueWriter;
+
+/**
+ * Hive processor for Tez that forms the vertices in Tez and processes the 
data.
+ * Does what ExecMapper and ExecReducer does for hive in MR framework.
+ */
+public class TezProcessor implements LogicalIOProcessor {
+  private static final Log LOG = LogFactory.getLog(TezProcessor.class);
+
+  boolean isMap;
+  RecordProcessor rproc = null;
+
+  private JobConf jobConf;
+
+  private TezProcessorContext processorContext;
+
+  public TezProcessor() {
+    this.isMap = true;
+  }
+
+  @Override
+  public void close() throws IOException {
+    if(rproc != null){
+      rproc.close();
+    }
+  }
+
+  @Override
+  public void handleEvents(List<Event> arg0) {
+    //this is not called by tez, so nothing to be done here
+  }
+
+  @Override
+  public void initialize(TezProcessorContext processorContext)
+      throws IOException {
+    this.processorContext = processorContext;
+    //get the jobconf
+    byte[] userPayload = processorContext.getUserPayload();
+    Configuration conf = TezUtils.createConfFromUserPayload(userPayload);
+    this.jobConf = new JobConf(conf);
+  }
+
+  @Override
+  public void run(Map<String, LogicalInput> inputs, Map<String, LogicalOutput> 
outputs)
+      throws Exception {
+    // in case of broadcast-join read the broadcast edge inputs
+    // (possibly asynchronously)
+
+    LOG.info("Running map: " + processorContext.getUniqueIdentifier());
+
+    //this will change when TezProcessor has support for shuffle joins and 
broadcast joins
+    if (inputs.size() != 1){
+      throw new IOException("Cannot handle multiple inputs "
+          + " inputCount=" + inputs.size());
+    }
+
+    if(outputs.size() > 1) {
+          throw new IOException("Cannot handle more than one output"
+          + ", outputCount=" + outputs.size());
+    }
+    LogicalInput in = inputs.values().iterator().next();
+    LogicalOutput out = outputs.values().iterator().next();
+
+    MRInput input = (MRInput)in;
+
+    //update config
+    Configuration updatedConf = input.getConfigUpdates();
+    if (updatedConf != null) {
+      for (Entry<String, String> entry : updatedConf) {
+        this.jobConf.set(entry.getKey(), entry.getValue());
+      }
+    }
+
+    KeyValueWriter kvWriter = (KeyValueWriter)out.getWriter();
+    OutputCollector collector = new KVOutputCollector(kvWriter);
+
+    if(isMap){
+      rproc = new MapRecordProcessor();
+    }
+    else{
+      throw new UnsupportedOperationException("Reduce is yet to be 
implemented");
+    }
+
+    MRTaskReporter mrReporter = new MRTaskReporter(processorContext);
+    rproc.init(jobConf, mrReporter, inputs.values(), collector);
+    rproc.run();
+
+    //done - output does not need to be committed as hive does not use 
outputcommitter
+  }
+
+  /**
+   * KVOutputCollector. OutputCollector that writes using KVWriter
+   *
+   */
+  static class KVOutputCollector implements OutputCollector {
+    private final KeyValueWriter output;
+
+    KVOutputCollector(KeyValueWriter output) {
+      this.output = output;
+    }
+
+    public void collect(Object key, Object value) throws IOException {
+        output.write(key, value);
+    }
+  }
+
+}


Reply via email to