Hi All,
I am trying to read an ORC file and perform a groupBy operation on it, but
when I run it on a large data set I get the error message below.
Format of the input data:
|178111256| 107125374|
|178111256| 107148618|
|178111256| 107175361|
|178111256| 107189910|
We are trying to group by the first column.
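To make the goal concrete, the grouped output we expect looks roughly like
this (illustrative only, based on the sample rows above):
178111256    107125374, 107148618, 107175361, 107189910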
As far as I can tell, the logic and syntax of the code are correct, and it
works fine on a small data set. I have attached the code in the text file.
Thank you for your time.
ERROR MESSAGE:
Error: java.lang.ArrayIndexOutOfBoundsException
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$Buffer.write(MapTask.java:1453)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer$Buffer.write(MapTask.java:1349)
    at java.io.DataOutputStream.writeByte(DataOutputStream.java:153)
    at org.apache.hadoop.io.WritableUtils.writeVLong(WritableUtils.java:273)
    at org.apache.hadoop.io.WritableUtils.writeVInt(WritableUtils.java:253)
    at org.apache.hadoop.io.Text.write(Text.java:330)
    at org.apache.hadoop.io.serializer.WritableSerialization$WritableSerializer.serialize(WritableSerialization.java:98)
    at org.apache.hadoop.io.serializer.WritableSerialization$WritableSerializer.serialize(WritableSerialization.java:82)
    at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1149)
    at org.apache.hadoop.mapred.MapTask$OldOutputCollector.collect(MapTask.java:610)
    at orc_groupby.orc_groupby.Orc_groupBy$MyMapper.map(Orc_groupBy.java:73)
    at orc_groupby.orc_groupby.Orc_groupBy$MyMapper.map(Orc_groupBy.java:39)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:54)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:453)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
    at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:170)
    at java.security.AccessController.doPrivileged(Native Method)
    at javax.security.auth.Subject.doAs(Subject.java:422)
    at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1869)
    at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:164)
--
REGARDS
BALAKUMAR SEETHARAMAN
package orc_groupby.orc_groupby;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcSerde;
import org.apache.hadoop.hive.ql.io.orc.OrcStruct;
import org.apache.hadoop.hive.serde2.SerDeException;
import org.apache.hadoop.hive.serde2.objectinspector.StructField;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.WritableIntObjectInspector;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Orc_groupBy extends Configured implements Tool {

    // Field positions within each ORC row.
    public static int A_ID = 0;
    public static int B_ID = 1;

    // OrcInputFormat (mapred API) delivers NullWritable keys and OrcStruct values.
    public static class MyMapper extends MapReduceBase
            implements Mapper<NullWritable, OrcStruct, Text, Text> {

        private StructObjectInspector oip;
        private final OrcSerde serde = new OrcSerde();

        public void configure(JobConf job) {
            // Describe the schema of the ORC rows to the SerDe.
            Properties table = new Properties();
            table.setProperty("columns", "viewedid,viewerid");
            table.setProperty("columns.types", "int,int");
            serde.initialize(job, table);
            try {
                oip = (StructObjectInspector) serde.getObjectInspector();
            } catch (SerDeException e) {
                e.printStackTrace();
            }
        }

        public void map(NullWritable key, OrcStruct val,
                        OutputCollector<Text, Text> output, Reporter reporter)
                throws IOException {
            List<? extends StructField> fields = oip.getAllStructFieldRefs();
            // Both columns are declared int, so one inspector serves both fields.
            WritableIntObjectInspector bInspector =
                    (WritableIntObjectInspector) fields.get(B_ID).getFieldObjectInspector();
            String a = "";
            String b = "";
            try {
                a = bInspector.getPrimitiveJavaObject(
                        oip.getStructFieldData(serde.deserialize(val), fields.get(A_ID))).toString();
                b = bInspector.getPrimitiveJavaObject(
                        oip.getStructFieldData(serde.deserialize(val), fields.get(B_ID))).toString();
                //System.out.print("A="+a+" B="+b);
                //System.exit(0);
            } catch (SerDeException e1) {
                e1.printStackTrace();