bilbingham edited a comment on pull request #649: URL: https://github.com/apache/orc/pull/649#issuecomment-790733843
To reproduce: the attached file, [acid.orc.zip](https://github.com/apache/orc/files/6085038/acid.orc.zip), was created with the Hive v2 streaming API into the following table (5000k random values). A hedged sketch of the streaming write is included after the reproduction code below.

```sql
CREATE TABLE acidorc (i int, j int, k int)
STORED AS ORC
TBLPROPERTIES (
  "transactional"="true",
  "orc.compress"="SNAPPY",
  "orc.bloom.filter.columns"="i,j,k");
```

```java
package orc.apache.orc.test;

import com.google.gson.stream.JsonWriter;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.TaskType;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hive.streaming.StreamingException;
import org.apache.orc.OrcConf;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcInputFormat;

import java.io.*;

public class Break {
  public static void main(String[] args) throws StreamingException, IOException, InterruptedException {
    // CREATE TABLE acidorc (i int, j int, k int)
    // STORED AS ORC
    // TBLPROPERTIES ("transactional"="true", "orc.compress"="SNAPPY", "orc.bloom.filter.columns"="i,j,k");
    Path workDir = new Path(System.getProperty("test.tmp.dir",
        "target" + File.separator + "test" + File.separator + "tmp"));
    JobConf conf = new JobConf();
    FileSystem fs;

    // Commented-out write path kept from the original repro:
    // String typeStr = "struct<i:int,j:int,k:int>";
    // OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    // conf.set("mapreduce.output.fileoutputformat.outputdir", workDir.toString());
    // conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    // conf.setBoolean(OrcOutputFormat.SKIP_TEMP_DIRECTORY, true);

    TaskAttemptID id = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 1);
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, id);

    // OutputFormat<NullWritable, OrcStruct> outputFormat = new OrcOutputFormat<OrcStruct>();
    // RecordWriter<NullWritable, OrcStruct> writer = outputFormat.getRecordWriter(attemptContext);
    //
    // // write 4000 rows with the integer and the binary string
    // TypeDescription type = TypeDescription.fromString(typeStr);
    // OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    // NullWritable nada = NullWritable.get();
    // for (int r = 0; r < 3000; ++r) {
    //   row.setFieldValue(0, new IntWritable(r));
    //   row.setFieldValue(1, new IntWritable(r * 2));
    //   row.setFieldValue(2, new IntWritable(r * 3));
    //   writer.write(nada, row);
    // }
    // writer.close(attemptContext);

    // Read the attached ACID file with the mapreduce OrcInputFormat, requesting only column index 5.
    conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute(), "5");
    FileSplit split = new FileSplit(new Path("/tmp", "acid.orc"), 0, 1000000, new String[0]);
    RecordReader<NullWritable, OrcStruct> reader =
        new OrcInputFormat<OrcStruct>().createRecordReader(split, attemptContext);

    // the sarg should cause it to skip over the rows except 1000 to 2000
    // (note: no SearchArgument is actually set in this repro)
    int count = 0;
    while (reader.nextKeyValue() && count < 5) {
      count++;
      OutputStream outputStream = new ByteArrayOutputStream();
      JsonWriter jw = new JsonWriter(new OutputStreamWriter(outputStream, "UTF-8"));
      OrcStruct row = (OrcStruct) reader.getCurrentValue();

      // Print each row as a small JSON object of fieldName -> value.
      jw.beginObject();
      for (int i = 0; i < row.getNumFields(); i++) {
        jw.name(row.getSchema().getFieldNames().get(i));
        jw.value(String.valueOf(row.getFieldValue(i)));
      }
      jw.endObject();
      jw.close();
      System.out.println(outputStream.toString());
    }
  }
}
```
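For reference, here is a minimal sketch of how a file like the attached one can be produced with the Hive Streaming v2 API (`org.apache.hive.streaming`), assuming a Hive 3.x client classpath, a reachable metastore, and the `acidorc` table above. The database name, agent string, row values, and the single large transaction are illustrative, not necessarily how the attached file was actually generated.

```java
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hive.streaming.HiveStreamingConnection;
import org.apache.hive.streaming.StreamingException;
import org.apache.hive.streaming.StrictDelimitedInputWriter;

import java.nio.charset.StandardCharsets;
import java.util.Random;

public class StreamAcidOrc {
  public static void main(String[] args) throws StreamingException {
    HiveConf hiveConf = new HiveConf();

    // Rows are passed as delimited text; the writer turns them into ACID ORC deltas.
    StrictDelimitedInputWriter writer = StrictDelimitedInputWriter.newBuilder()
        .withFieldDelimiter(',')
        .build();

    HiveStreamingConnection connection = HiveStreamingConnection.newBuilder()
        .withDatabase("default")          // illustrative database name
        .withTable("acidorc")
        .withAgentInfo("orc-649-repro")   // illustrative agent string
        .withRecordWriter(writer)
        .withHiveConf(hiveConf)
        .build();

    Random random = new Random();
    connection.beginTransaction();
    for (int r = 0; r < 5_000_000; r++) {   // "5000k random values"; a real loader would commit in batches
      String row = random.nextInt() + "," + random.nextInt() + "," + random.nextInt();
      connection.write(row.getBytes(StandardCharsets.UTF_8));
    }
    connection.commitTransaction();
    connection.close();
  }
}
```

Each committed transaction writes ORC delta (bucket) files under the table's warehouse directory; the attached `acid.orc` is presumably one such file copied out to `/tmp`.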
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
[email protected]
