[jira] [Created] (HIVE-24238) ClassCastException in order-by query over avro table with uniontype column
Gabriel C Balan created HIVE-24238:
--------------------------------------

             Summary: ClassCastException in order-by query over avro table with uniontype column
                 Key: HIVE-24238
                 URL: https://issues.apache.org/jira/browse/HIVE-24238
             Project: Hive
          Issue Type: Bug
          Components: Avro
    Affects Versions: 3.1.2, 3.1.0
            Reporter: Gabriel C Balan

{noformat:title=Reproducer}
create table avro_reproducer (key int, union_col uniontype<int,string>)
stored as avro location '/tmp/avro_reproducer';

INSERT INTO TABLE avro_reproducer values
  (0, create_union(0, 123, 'not me')),
  (1, create_union(1, -1, 'me, me, me!'));

--these queries are ok:
select count(*) from avro_reproducer;
select * from avro_reproducer;

--these queries are not ok:
select * from avro_reproducer order by union_col;
select * from avro_reproducer sort by key;
select * from avro_reproducer order by 'does not have to be a column, really';
{noformat}

I have verified this reproducer on CDH 7.0.3 and HDP 3.0.1. The issue seems restricted to Avro; this reproducer does not trigger failures against textfile, orc, or parquet tables.

{noformat:title=Error message in CLI}
Driver stacktrace:
    at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1889)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1877)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1876)
    at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
    at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
    at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1876)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
    at scala.Option.foreach(Option.scala:257)
    at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2110)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2059)
    at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2048)
    at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.lang.RuntimeException: Error processing row: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row
    at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.processRow(SparkMapRecordHandler.java:155)
    at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.processNextRecord(HiveMapFunctionResultList.java:48)
    at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.processNextRecord(HiveMapFunctionResultList.java:27)
    at org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85)
    at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:123)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1315)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row
    at org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator.process(VectorMapOperator.java:970)
    at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.processRow(SparkMapRecordHandler.java:142)
    ... 14 more
Caused by: java.lang.ClassCastException: org.apache.hadoop.io.IntWritable cannot be cast to org.apache.hadoop.hive.serde2.objectinspector.StandardUnionObjectInspector$StandardUnion
    at org.apache.hadoop.hive.ql.exec.vector.VectorAssignRow.assignRowColumn(VectorAssignRow.java:619)
    at org.apache.hadoop.hive.ql.exec.vector.VectorAssignRow.assignRowColumn(VectorAssignRow.java:351)
    at org.apache.hadoop.hive.ql.exec.vector.VectorAss
{noformat}
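Since the failing frames (VectorAssignRow, VectorMapOperator) are all in the vectorized code path, a possible session-level workaround is to disable vectorization before the order-by query. This is a sketch based on that observation, not something verified in this report:

{noformat:title=Workaround sketch (untested assumption)}
-- Assumption: the ClassCastException only occurs in the vectorized path,
-- so turning vectorization off for the session may avoid it.
set hive.vectorized.execution.enabled=false;
select * from avro_reproducer order by union_col;
{noformat}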
[jira] [Created] (HIVE-23909) ClassCastException: org.apache.hadoop.hive.ql.exec.vector.LongColumnVector cannot be cast to org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector
Gabriel C Balan created HIVE-23909:
--------------------------------------

             Summary: ClassCastException: org.apache.hadoop.hive.ql.exec.vector.LongColumnVector cannot be cast to org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector
                 Key: HIVE-23909
                 URL: https://issues.apache.org/jira/browse/HIVE-23909
             Project: Hive
          Issue Type: Bug
          Components: Vectorization
    Affects Versions: 3.1.2
            Reporter: Gabriel C Balan

The query "select ... order by nvl(<decimal column>, 0)" fails with "ClassCastException: org.apache.hadoop.hive.ql.exec.vector.LongColumnVector cannot be cast to org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector".

The query works fine
* if I replace the constant 0 with 0.0, or
* if I change the column to int, or
* if I set hive.vectorized.execution.enabled to false.

The query fails in CDH 7.0.3 (Hive 3.1.2), but works fine in HDP 3.0.1 (Hive 3.1.0).

{noformat:title=Reproducer}
create external table foo (a decimal(10,5)) location 'file:/tmp/foo';
INSERT INTO TABLE foo values (1), (NULL), (2);

set hive.vectorized.execution.enabled = true;
select * from foo order by nvl(a,0);
{noformat}

{noformat:title=Error message in CLI}
20/07/23 05:21:54 [HiveServer2-Background-Pool: Thread-80]: ERROR status.SparkJobMonitor: Job failed with java.lang.ClassCastException: org.apache.hadoop.hive.ql.exec.vector.LongColumnVector cannot be cast to org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector
java.util.concurrent.ExecutionException: Exception thrown by job
    at org.apache.spark.JavaFutureActionWrapper.getImpl(FutureAction.scala:337)
    at org.apache.spark.JavaFutureActionWrapper.get(FutureAction.scala:342)
    at org.apache.hive.spark.client.RemoteDriver$JobWrapper.call(RemoteDriver.java:382)
    at org.apache.hive.spark.client.RemoteDriver$JobWrapper.call(RemoteDriver.java:343)
    at java.util.concurrent.FutureTask.run(FutureTask.java:266)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 8.0 failed 4 times, most recent failure: Lost task 0.3 in stage 8.0 (TID 15, den01eda.us.oracle.com, executor 2): java.lang.IllegalStateException: Hit error while closing operators - failing tree: org.apache.hadoop.hive.ql.metadata.HiveException: Error evaluating a
    at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.close(SparkMapRecordHandler.java:203)
    at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.closeRecordProcessor(HiveMapFunctionResultList.java:58)
    at org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:96)
    at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42)
    at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
    at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:55)
    at org.apache.spark.scheduler.Task.run(Task.scala:123)
    at org.apache.spark.executor.Executor$TaskRunner$$anonfun$10.apply(Executor.scala:408)
    at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1315)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:414)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Error evaluating a
    at org.apache.hadoop.hive.ql.exec.vector.VectorSelectOperator.process(VectorSelectOperator.java:149)
    at org.apache.hadoop.hive.ql.exec.Operator.vectorForward(Operator.java:969)
    at org.apache.hadoop.hive.ql.exec.TableScanOperator.process(TableScanOperator.java:126)
    at org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator.closeOp(VectorMapOperator.java:987)
    at org.apache.hadoop.hive.ql.exec.Operator.close(Operator.java:732)
    at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.close(SparkMapRecordHandler.java:180)
    ... 13 more
Caused by: java.lang.ClassCastException: org.apache.hadoop.hive.ql.exec.vector.LongColumnVector cannot be cast to org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector
    at org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector.setElement(DecimalColumnVector.java:130)
    at org.apache.hadoop.hive.ql.exec.vector.expressions.VectorCoalesce.evaluate(VectorCoalesce.java:124)
    at org.apache.hadoop.hi
{noformat}
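For reference, the workarounds listed above translate directly into the reproducer session:

{noformat:title=Workaround sketch (based on the observations above)}
-- Option 1: use a decimal-typed default so both nvl branches are decimal.
select * from foo order by nvl(a, 0.0);

-- Option 2: disable vectorized execution for the session.
set hive.vectorized.execution.enabled=false;
select * from foo order by nvl(a, 0);
{noformat}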
[jira] [Created] (HIVE-20970) ORC table with bloom filter fails on PPD query
Gabriel C Balan created HIVE-20970:
--------------------------------------

             Summary: ORC table with bloom filter fails on PPD query
                 Key: HIVE-20970
                 URL: https://issues.apache.org/jira/browse/HIVE-20970
             Project: Hive
          Issue Type: Bug
          Components: File Formats, Hive, ORC
    Affects Versions: 2.1.0
            Reporter: Gabriel C Balan

I encountered this issue in hive2.1.0-cdh6.0.0.

{noformat:title=Reproducer}
drop table if exists t1;
create table t1(c1 string, c2 int) stored as orc
  TBLPROPERTIES ("orc.compress"="NONE", "orc.bloom.filter.columns"="c2");
INSERT INTO TABLE t1 VALUES ("row 1", 1), ("row 2", 2), ("row 3", 3);

--this works fine
set hive.optimize.index.filter=false;
select * from t1 where c2=2;

--this fails
set hive.optimize.index.filter=true;
select * from t1 where c2=2;
{noformat}

These three items are essential to reproducing the issue:
# hive.optimize.index.filter=true
# "orc.compress"="NONE" in TBLPROPERTIES
# "orc.bloom.filter.columns"="c2" in TBLPROPERTIES

That is, if any of these items is taken out, the query no longer fails (see the sketch after the stack trace). Finally, here is the stack:

{noformat:title=Stack trace in log4j file}
java.io.IOException: java.lang.IllegalStateException: InputStream#read(byte[]) returned invalid result: 0
The InputStream implementation is buggy.
    at org.apache.hadoop.hive.ql.exec.FetchOperator.getNextRow(FetchOperator.java:521)
    at org.apache.hadoop.hive.ql.exec.FetchOperator.pushRow(FetchOperator.java:428)
    at org.apache.hadoop.hive.ql.exec.FetchTask.fetch(FetchTask.java:146)
    at org.apache.hadoop.hive.ql.Driver.getResults(Driver.java:2188)
    at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:259)
    at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:187)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:409)
    at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:838)
    at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:774)
    at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:701)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:483)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:313)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:227)
Caused by: java.lang.IllegalStateException: InputStream#read(byte[]) returned invalid result: 0
The InputStream implementation is buggy.
    at com.google.protobuf.CodedInputStream.refillBuffer(CodedInputStream.java:739)
    at com.google.protobuf.CodedInputStream.isAtEnd(CodedInputStream.java:701)
    at com.google.protobuf.CodedInputStream.readTag(CodedInputStream.java:99)
    at org.apache.orc.OrcProto$RowIndex.<init>(OrcProto.java:7429)
    at org.apache.orc.OrcProto$RowIndex.<init>(OrcProto.java:7393)
    at org.apache.orc.OrcProto$RowIndex$1.parsePartialFrom(OrcProto.java:7482)
    at org.apache.orc.OrcProto$RowIndex$1.parsePartialFrom(OrcProto.java:7477)
    at com.google.protobuf.AbstractParser.parsePartialFrom(AbstractParser.java:200)
    at com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:217)
    at com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:223)
    at com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:49)
    at org.apache.orc.OrcProto$RowIndex.parseFrom(OrcProto.java:7593)
    at org.apache.orc.impl.RecordReaderUtils$DefaultDataReader.readRowIndex(RecordReaderUtils.java:138)
    at org.apache.orc.impl.RecordReaderImpl.readRowIndex(RecordReaderImpl.java:1151)
    at org.apache.orc.impl.RecordReaderImpl.readRowIndex(RecordReaderImpl.java:1134)
    at org.apache.orc.impl.RecordReaderImpl.pickRowGroups(RecordReaderImpl.java:800)
    at org.apache.orc.impl.RecordReaderImpl.readStripe(RecordReaderImpl.java:830)
    at org.apache.orc.impl.RecordReaderImpl.advanceStripe(RecordReaderImpl.java:986)
    at org.apache.orc.impl.RecordReaderImpl.advanceToNextRow(RecordReaderImpl.java:1021)
    at org.apache.orc.impl.RecordReaderImpl.<init>(RecordReaderImpl.java:215)
    at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.<init>(RecordReaderImpl.java:63)
    at org.apache.hadoop.hive.ql.io.orc.ReaderImpl.rowsOptions(ReaderImpl.java:87)
    at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.createReaderFromFile(OrcInputFormat.java:314)
    at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat$OrcRecordReader.<init>(OrcInputFormat.java:225)
    at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getRecordReader(OrcInputFormat.java:1691)
    at org.apache.hadoop.hi
{noformat}
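Since removing any one of the three conditions avoids the failure, one concrete variant that should work is keeping the bloom filter but letting ORC use its default compression instead of "NONE":

{noformat:title=Sketch of a non-failing variant (based on the three conditions above)}
drop table if exists t1;
-- same table, but without "orc.compress"="NONE"
create table t1(c1 string, c2 int) stored as orc
  TBLPROPERTIES ("orc.bloom.filter.columns"="c2");
INSERT INTO TABLE t1 VALUES ("row 1", 1), ("row 2", 2), ("row 3", 3);

set hive.optimize.index.filter=true;
select * from t1 where c2=2;
{noformat}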
[jira] [Created] (HIVE-13377) Lost rows when using compact index on parquet table
Gabriel C Balan created HIVE-13377:
--------------------------------------

             Summary: Lost rows when using compact index on parquet table
                 Key: HIVE-13377
                 URL: https://issues.apache.org/jira/browse/HIVE-13377
             Project: Hive
          Issue Type: Bug
          Components: Indexing
    Affects Versions: 1.1.0
         Environment: linux, cdh 5.5.0
            Reporter: Gabriel C Balan
            Priority: Minor

A query with a where clause on a parquet table loses rows when using a compact index. The query produces the right results without the index.

{code}
create table small_parq(i int) stored as parquet;
insert into table small_parq values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10), (11);

set hive.optimize.index.filter=true;
set hive.optimize.index.filter.compact.minsize=50;
create index comp_idx on table small_parq (i) as 'compact' WITH DEFERRED REBUILD;
alter index comp_idx on small_parq rebuild;

select * from small_parq where i=3;  --this correctly produces 1 row (value 3).
select * from small_parq where i=11; --this incorrectly produces 0 rows.
--I see correct results when looking for a row in [1,6];
--I see bad results when looking for a row in [7,11].

--All is well once I disable the compact index
set hive.optimize.index.filter.compact.minsize=5000;
select * from small_parq where i=11; --now it correctly produces 1 row (value 11).
{code}

I could not reproduce this issue when the base table was stored as ORC, SEQ, AVRO, or TEXTFILE.
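Besides raising the minsize threshold as above, dropping the index should also restore correct results, since the query is right without it. A minimal check, assuming the same session:

{code}
-- Sketch: remove the compact index entirely and re-run the failing query.
drop index comp_idx on small_parq;
select * from small_parq where i=11; -- expected: 1 row (value 11)
{code}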
[jira] [Created] (HIVE-13114) parquet filter fails for type float when hive.optimize.index.filter=true
Gabriel C Balan created HIVE-13114:
--------------------------------------

             Summary: parquet filter fails for type float when hive.optimize.index.filter=true
                 Key: HIVE-13114
                 URL: https://issues.apache.org/jira/browse/HIVE-13114
             Project: Hive
          Issue Type: Bug
    Affects Versions: 1.2.1
            Reporter: Gabriel C Balan
            Priority: Trivial

Hive fails when selecting from a table 'stored as parquet' with a row filter based on a float column and hive.optimize.index.filter=true.

The following example fails in hive 1.2.1 (HDP 2.3.2), but works fine in hive 1.1.0 (CDH 5.5.0):

{code}
create table p(f float) stored as parquet;
insert into table p values (1), (2), (3);

select * from p where f >= 2;

set hive.optimize.index.filter=true;
select * from p where f >= 2;
{code}

The first select query works fine; the second fails with:

{code}
Failed with exception java.io.IOException:java.lang.IllegalArgumentException: FilterPredicate column: f's declared type (java.lang.Double) does not match the schema found in file metadata.
Column f is of type: FullTypeDescriptor(PrimitiveType: FLOAT, OriginalType: null)
Valid types for this column are: [class java.lang.Float]
{code}

Here's the stack trace from log4j:

{code}
2016-02-22 12:18:30,691 ERROR [main]: CliDriver (SessionState.java:printError(960)) - Failed with exception java.io.IOException:java.lang.IllegalArgumentException: FilterPredicate column: f's declared type (java.lang.Double) does not match the schema found in file metadata.
Column f is of type: FullTypeDescriptor(PrimitiveType: FLOAT, OriginalType: null)
Valid types for this column are: [class java.lang.Float]
java.io.IOException: java.lang.IllegalArgumentException: FilterPredicate column: f's declared type (java.lang.Double) does not match the schema found in file metadata.
Column f is of type: FullTypeDescriptor(PrimitiveType: FLOAT, OriginalType: null)
Valid types for this column are: [class java.lang.Float]
    at org.apache.hadoop.hive.ql.exec.FetchOperator.getNextRow(FetchOperator.java:508)
    at org.apache.hadoop.hive.ql.exec.FetchOperator.pushRow(FetchOperator.java:415)
    at org.apache.hadoop.hive.ql.exec.FetchTask.fetch(FetchTask.java:140)
    at org.apache.hadoop.hive.ql.Driver.getResults(Driver.java:1672)
    at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:233)
    at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:165)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376)
    at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:736)
    at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:681)
    at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:621)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:601)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Caused by: java.lang.IllegalArgumentException: FilterPredicate column: f's declared type (java.lang.Double) does not match the schema found in file metadata.
Column f is of type: FullTypeDescriptor(PrimitiveType: FLOAT, OriginalType: null)
Valid types for this column are: [class java.lang.Float]
    at parquet.filter2.predicate.ValidTypeMap.assertTypeValid(ValidTypeMap.java:132)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumn(SchemaCompatibilityValidator.java:185)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.validateColumnFilterPredicate(SchemaCompatibilityValidator.java:160)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:124)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.visit(SchemaCompatibilityValidator.java:59)
    at parquet.filter2.predicate.Operators$GtEq.accept(Operators.java:248)
    at parquet.filter2.predicate.SchemaCompatibilityValidator.validate(SchemaCompatibilityValidator.java:64)
    at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:59)
    at parquet.filter2.compat.RowGroupFilter.visit(RowGroupFilter.java:40)
    at parquet.filter2.compat.FilterCompat$FilterPredicateCompat.accept(FilterCompat.java:126)
    at parquet.filter2.compat.RowGroupFilter.filterRowGroups(RowGroupFilter.java:46)
    at org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.getSplit(ParquetRecordReaderWrapper.java:275)
    at org.apache.hadoop.hive.ql.io.parquet.read.ParquetRecordReaderWrapper.<init>(ParquetRecordReaderWrapper.java:99)
    at org.apache.hadoop.hive.ql.io.parquet.read.Par
{code}
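The error says the predicate literal is being typed as java.lang.Double while the column is FLOAT. Two possible mitigations, sketched below; the explicit cast is an untested assumption, while disabling the optimization is grounded in the fact that the first select works:

{code}
-- Untested assumption: casting the literal to float may keep the pushed-down
-- FilterPredicate's type aligned with the FLOAT column.
select * from p where f >= cast(2 as float);

-- Grounded: without the filter optimization, the query works.
set hive.optimize.index.filter=false;
select * from p where f >= 2;
{code}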
[jira] [Created] (HIVE-12360) Bad seek in uncompressed ORC with predicate pushdown
Gabriel C Balan created HIVE-12360:
--------------------------------------

             Summary: Bad seek in uncompressed ORC with predicate pushdown
                 Key: HIVE-12360
                 URL: https://issues.apache.org/jira/browse/HIVE-12360
             Project: Hive
          Issue Type: Bug
          Components: File Formats, Hive
    Affects Versions: 1.2.1
         Environment: Oracle Linux 6.4, HDP 2.3.2.0-2950
            Reporter: Gabriel C Balan

Reading from an ORC file fails in HDP 2.3.2 when pushing down a predicate:

{noformat:title=Error message in CLI}
Failed with exception java.io.IOException:java.lang.IllegalArgumentException: Seek in index to 4613 is outside of the data
{noformat}

{noformat:title=Stack trace in log4j file}
2015-11-06 09:48:11,873 ERROR [main]: CliDriver (SessionState.java:printError(960)) - Failed with exception java.io.IOException:java.lang.IllegalArgumentException: Seek in index to 4613 is outside of the data
java.io.IOException: java.lang.IllegalArgumentException: Seek in index to 4613 is outside of the data
    at org.apache.hadoop.hive.ql.exec.FetchOperator.getNextRow(FetchOperator.java:508)
    at org.apache.hadoop.hive.ql.exec.FetchOperator.pushRow(FetchOperator.java:415)
    at org.apache.hadoop.hive.ql.exec.FetchTask.fetch(FetchTask.java:140)
    at org.apache.hadoop.hive.ql.Driver.getResults(Driver.java:1672)
    at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:233)
    at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:165)
    at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376)
    at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:736)
    at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:681)
    at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:621)
    at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
    at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
    at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke(Method.java:601)
    at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
    at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
Caused by: java.lang.IllegalArgumentException: Seek in index to 4613 is outside of the data
    at org.apache.hadoop.hive.ql.io.orc.InStream$UncompressedStream.seek(InStream.java:139)
    at org.apache.hadoop.hive.ql.io.orc.InStream$UncompressedStream.read(InStream.java:87)
    at java.io.InputStream.read(InputStream.java:102)
    at com.google.protobuf.CodedInputStream.refillBuffer(CodedInputStream.java:737)
    at com.google.protobuf.CodedInputStream.isAtEnd(CodedInputStream.java:701)
    at com.google.protobuf.CodedInputStream.readTag(CodedInputStream.java:99)
    at org.apache.hadoop.hive.ql.io.orc.OrcProto$RowIndex.<init>(OrcProto.java:7429)
    at org.apache.hadoop.hive.ql.io.orc.OrcProto$RowIndex.<init>(OrcProto.java:7393)
    at org.apache.hadoop.hive.ql.io.orc.OrcProto$RowIndex$1.parsePartialFrom(OrcProto.java:7482)
    at org.apache.hadoop.hive.ql.io.orc.OrcProto$RowIndex$1.parsePartialFrom(OrcProto.java:7477)
    at com.google.protobuf.AbstractParser.parsePartialFrom(AbstractParser.java:200)
    at com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:217)
    at com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:223)
    at com.google.protobuf.AbstractParser.parseFrom(AbstractParser.java:49)
    at org.apache.hadoop.hive.ql.io.orc.OrcProto$RowIndex.parseFrom(OrcProto.java:7593)
    at org.apache.hadoop.hive.ql.io.orc.MetadataReader.readRowIndex(MetadataReader.java:88)
    at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.readRowIndex(RecordReaderImpl.java:1166)
    at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.readRowIndex(RecordReaderImpl.java:1151)
    at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.pickRowGroups(RecordReaderImpl.java:750)
    at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.readStripe(RecordReaderImpl.java:777)
    at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.advanceStripe(RecordReaderImpl.java:986)
    at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.advanceToNextRow(RecordReaderImpl.java:1019)
    at org.apache.hadoop.hive.ql.io.orc.RecordReaderImpl.<init>(RecordReaderImpl.java:205)
    at org.apache.hadoop.hive.ql.io.orc.ReaderImpl.rowsOptions(ReaderImpl.java:598)
    at org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger$ReaderPair.<init>(OrcRawRecordMerger.java:183)
    at org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger$OriginalReaderPair.<init>(OrcRawRecordMerger.java:226)
    at org.apache.hadoop.hive.ql.io.orc.OrcRawRecordMerger.<init>(OrcRawRecordMerger.java:437)
    at org.apache.hadoop.hive.ql.io.orc.OrcInputFormat.getReade
{noformat}
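The failure happens while reading row indexes during predicate pushdown (pickRowGroups/readRowIndex), so a plausible session-level workaround, not verified in this report, is to turn pushdown off:

{noformat:title=Workaround sketch (untested assumption)}
-- Assumption: without PPD, the row-index seek is never attempted.
set hive.optimize.index.filter=false;
{noformat}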
[jira] [Created] (HIVE-10577) losing data loading into list bucketing table
Gabriel C Balan created HIVE-10577:
--------------------------------------

             Summary: losing data loading into list bucketing table
                 Key: HIVE-10577
                 URL: https://issues.apache.org/jira/browse/HIVE-10577
             Project: Hive
          Issue Type: Bug
          Components: Hive
    Affects Versions: 1.1.1
         Environment: linux
            Reporter: Gabriel C Balan

Some rows don't make it into a list bucketing table when the skew column is of type string. All is fine, however, when the skew column is of type float.

{code:title=src.txt}
1
1.1
110
{code}

{code:title=hive DDL/DML|borderStyle=solid}
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set hive.mapred.supports.subdirectories=true;
set hive.optimize.listbucketing=true;
set mapred.input.dir.recursive=true;

drop table if exists src;
create table src (c1 string);
load data local inpath 'src.txt' overwrite into table src;

drop table if exists lb_str;
create table lb_str (c1 string) skewed by (c1) on ('1.1','110','1') STORED AS DIRECTORIES;
insert overwrite table lb_str select * from src;

drop table if exists lb_float;
create table lb_float (c1 float) skewed by (c1) on ('1.1','110','1') STORED AS DIRECTORIES;
insert overwrite table lb_float select * from src;

select * from lb_str;
select * from lb_float;
{code}

{code:title=hive cli|borderStyle=solid}
hive> select * from lb_str;
OK
1.1
110
Time taken: 0.071 seconds, Fetched: 2 row(s)
hive> select * from lb_float;
OK
1.1
1.0
110.0
{code}

No 'c1=1/' directory is created in 'lb_str/'.
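The missing directory can be confirmed from the Hive CLI. A sketch of the check; the warehouse path below is an assumption for a default setup:

{code:title=Directory check sketch (path is illustrative)}
dfs -ls /user/hive/warehouse/lb_str;
-- expected skew directories: c1=1/, c1=1.1/, c1=110/
-- observed: c1=1/ is absent, matching the lost row (value 1).
{code}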
[jira] [Created] (HIVE-10525) loading data into list bucketing table when null in skew column
Gabriel C Balan created HIVE-10525:
--------------------------------------

             Summary: loading data into list bucketing table when null in skew column
                 Key: HIVE-10525
                 URL: https://issues.apache.org/jira/browse/HIVE-10525
             Project: Hive
          Issue Type: Bug
          Components: Hive
    Affects Versions: 1.1.0
         Environment: linux
            Reporter: Gabriel C Balan
            Priority: Minor

I'm trying to load data into a list bucketing table. The insert statement fails when there are nulls going into the skew column. If this is the expected behavior, there is no mention of this restriction in the doc.

{noformat:title=Input files}
> more *null.csv
::::::::::::::
has-null.csv
::::::::::::::
1
2
\N
3
::::::::::::::
no-null.csv
::::::::::::::
1
2
3
{noformat}

{noformat:title=Reproducer}
set hive.mapred.supports.subdirectories=true;
set hive.optimize.listbucketing=true;
set mapred.input.dir.recursive=true;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;

create table src_with_null (x int);
load data local inpath 'has-null.csv' overwrite into table src_with_null;

create table src_no_null (x int);
load data local inpath 'no-null.csv' overwrite into table src_no_null;

create table lb (x int) partitioned by (p string) skewed by (x) on (1) STORED AS DIRECTORIES stored as rcfile;

insert overwrite table lb partition (p = 'foo') select * from src_with_null; --fails
insert overwrite table lb partition (p = 'foo') select * from src_no_null;   --succeeds
{noformat}

I see this in ${hive.log.dir}/hive.log:

{noformat:title=Stack trace in hive.log}
2015-04-28 13:43:47,646 WARN [Thread-82]: mapred.LocalJobRunner (LocalJobRunner.java:run(560)) - job_local402607316_0001
java.lang.Exception: java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"x":null}
    at org.apache.hadoop.mapred.LocalJobRunner$Job.runTasks(LocalJobRunner.java:462)
    at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:522)
Caused by: java.lang.RuntimeException: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"x":null}
    at org.apache.hadoop.hive.ql.exec.mr.ExecMapper.map(ExecMapper.java:179)
    at org.apache.hadoop.mapred.MapRunner.run(MapRunner.java:54)
    at org.apache.hadoop.mapred.MapTask.runOldMapper(MapTask.java:453)
    at org.apache.hadoop.mapred.MapTask.run(MapTask.java:343)
    at org.apache.hadoop.mapred.LocalJobRunner$Job$MapTaskRunnable.run(LocalJobRunner.java:243)
    at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
    at java.util.concurrent.FutureTask$Sync.innerRun(FutureTask.java:334)
    at java.util.concurrent.FutureTask.run(FutureTask.java:166)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1110)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:603)
    at java.lang.Thread.run(Thread.java:722)
Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"x":null}
    at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:507)
    at org.apache.hadoop.hive.ql.exec.mr.ExecMapper.map(ExecMapper.java:170)
    ... 10 more
Caused by: java.lang.NullPointerException
    at org.apache.hadoop.hive.ql.exec.FileSinkOperator.generateListBucketingDirName(FileSinkOperator.java:833)
    at org.apache.hadoop.hive.ql.exec.FileSinkOperator.processOp(FileSinkOperator.java:615)
    at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:815)
    at org.apache.hadoop.hive.ql.exec.SelectOperator.processOp(SelectOperator.java:84)
    at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:815)
    at org.apache.hadoop.hive.ql.exec.TableScanOperator.processOp(TableScanOperator.java:95)
    at org.apache.hadoop.hive.ql.exec.MapOperator$MapOpCtx.forward(MapOperator.java:157)
    at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:497)
{noformat}
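The NPE comes from generateListBucketingDirName on the null skew value, so filtering nulls out before the insert should presumably avoid it. A sketch, not verified in this report:

{noformat:title=Workaround sketch (untested assumption)}
-- Assumption: keeping nulls out of the skew column sidesteps the NPE.
insert overwrite table lb partition (p = 'foo')
select * from src_with_null where x is not null;
{noformat}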