[ 
https://issues.apache.org/jira/browse/DRILL-5944?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Kunal Khatua updated DRILL-5944:
--------------------------------
    Description: 
I am running a CTAS query on an s3 bucket with 500 compressed json files - 
output as parquet.

Query ran from command line:
 /opt/drill/apache-drill-1.11.0/bin/sqlline --verbose=true --showWarnings=true 
--showNestedErrs=true --force=true --run=therm.sql -u 
jdbc:drill:zk-k8s-drill:2181

therm.sql:
{code:sql}
 use `s3`.`drill-output`; *(s3 points to kairos bucket)
 alter session set `store.format`='parquet';
 ALTER SESSION SET `store.json.all_text_mode` = true;
 create table temps_bucket0 as select t.id, t.`value` as temp, 
to_timestamp(cast(substr(t.`timestamp`,1,10) as int)) as ts, t.device_id from 
`s3`.`bucket=0/` as t where cast(t.`timestamp` as int) > 1475280000 and 
cast(t.`timestamp` as int) < 1491004799;
{code}

Drill ran for 17 min 50.246 sec and managed to write approx. 100M records then 
failed with the following message (see below). I tried to download and 
uncompress the file manually and it is corrupt. Ideally, Drill should log but 
skip the corrupt file.

{code:java}
Error: DATA_READ ERROR: Error parsing JSON - Unexpected character ('d' (code 
100)): was expecting comma to separate OBJECT entries

File /bucket=0/190273.json.gz
 Record 3654965
 Column 37
 Fragment 1:0

[Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010] 
(state=,code=0)
 java.sql.SQLException: DATA_READ ERROR: Error parsing JSON - Unexpected 
character ('d' (code 100)): was expecting comma to separate OBJECT entries

File /bucket=0/190273.json.gz
 Record 3654965
 Column 37
 Fragment 1:0

[Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010]
 at 
org.apache.drill.jdbc.impl.DrillCursor.nextRowInternally(DrillCursor.java:489)
 at org.apache.drill.jdbc.impl.DrillCursor.next(DrillCursor.java:593)
 at org.apache.calcite.avatica.AvaticaResultSet.next(AvaticaResultSet.java:215)
 at 
org.apache.drill.jdbc.impl.DrillResultSetImpl.next(DrillResultSetImpl.java:140)
 at sqlline.IncrementalRows.hasNext(IncrementalRows.java:62)
 at 
sqlline.TableOutputFormat$ResizingRowsProvider.next(TableOutputFormat.java:87)
 at sqlline.TableOutputFormat.print(TableOutputFormat.java:118)
 at sqlline.SqlLine.print(SqlLine.java:1593)
 at sqlline.Commands.execute(Commands.java:852)
 at sqlline.Commands.sql(Commands.java:751)
 at sqlline.SqlLine.dispatch(SqlLine.java:746)
 at sqlline.SqlLine.runCommands(SqlLine.java:1651)
 at sqlline.Commands.run(Commands.java:1304)
 at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
 at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
 at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
 at java.lang.reflect.Method.invoke(Method.java:498)
 at sqlline.ReflectiveCommandHandler.execute(ReflectiveCommandHandler.java:36)
 at sqlline.SqlLine.dispatch(SqlLine.java:742)
 at sqlline.SqlLine.initArgs(SqlLine.java:553)
 at sqlline.SqlLine.begin(SqlLine.java:596)
 at sqlline.SqlLine.start(SqlLine.java:375)
 at sqlline.SqlLine.main(SqlLine.java:268)
 Caused by: org.apache.drill.common.exceptions.UserRemoteException: DATA_READ 
ERROR: Error parsing JSON - Unexpected character ('d' (code 100)): was 
expecting comma to separate OBJECT entries

File /bucket=0/190273.json.gz
 Record 3654965
 Column 37
 Fragment 1:0

[Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010]
 at 
org.apache.drill.exec.rpc.user.QueryResultHandler.resultArrived(QueryResultHandler.java:123)
 at org.apache.drill.exec.rpc.user.UserClient.handle(UserClient.java:368)
 at org.apache.drill.exec.rpc.user.UserClient.handle(UserClient.java:90)
 at org.apache.drill.exec.rpc.RpcBus$InboundHandler.decode(RpcBus.java:274)
 at org.apache.drill.exec.rpc.RpcBus$InboundHandler.decode(RpcBus.java:244)
 at 
io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:89)
 at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
 at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
 at 
io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:254)
 at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
 at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
 at 
io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
 at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
 at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
 at 
io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:242)
 at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
 at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
 at 
io.netty.channel.ChannelInboundHandlerAdapter.channelRead(ChannelInboundHandlerAdapter.java:86)
 at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
 at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
 at 
io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:847)
 at 
io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
 at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511)
 at 
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
 at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
 at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
 at 
io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
 at java.lang.Thread.run(Thread.java:745)
 Closing: org.apache.drill.jdbc.impl.DrillConnectionImpl
{code}

  was:
I am running a CTAS query on an s3 bucket with 500 compressed json files - 
output as parquet.

Query ran from command line:
/opt/drill/apache-drill-1.11.0/bin/sqlline --verbose=true --showWarnings=true 
--showNestedErrs=true --force=true --run=therm.sql -u 
jdbc:drill:zk-k8s-drill:2181

therm.sql:
use `s3`.`drill-output`; *(s3 points to kairos bucket)
alter session set `store.format`='parquet';
ALTER SESSION SET `store.json.all_text_mode` = true;
create table temps_bucket0 as select t.id, t.`value` as temp, 
to_timestamp(cast(substr(t.`timestamp`,1,10) as int)) as ts, t.device_id from 
`s3`.`bucket=0/` as t where cast(t.`timestamp` as int) > 1475280000 and 
cast(t.`timestamp` as int) < 1491004799;

Drill ran for 17 min 50.246 sec and managed to write approx. 100M records then 
failed with the following message (see below). I tried to download and 
uncompress the file manually and it is corrupt. Ideally, Drill should log but 
skip the corrupt file.

Error: DATA_READ ERROR: Error parsing JSON - Unexpected character ('d' (code 
100)): was expecting comma to separate OBJECT entries

File  /bucket=0/190273.json.gz
Record  3654965
Column  37
Fragment 1:0

[Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010] 
(state=,code=0)
java.sql.SQLException: DATA_READ ERROR: Error parsing JSON - Unexpected 
character ('d' (code 100)): was expecting comma to separate OBJECT entries

File  /bucket=0/190273.json.gz
Record  3654965
Column  37
Fragment 1:0

[Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010]
        at 
org.apache.drill.jdbc.impl.DrillCursor.nextRowInternally(DrillCursor.java:489)
        at org.apache.drill.jdbc.impl.DrillCursor.next(DrillCursor.java:593)
        at 
org.apache.calcite.avatica.AvaticaResultSet.next(AvaticaResultSet.java:215)
        at 
org.apache.drill.jdbc.impl.DrillResultSetImpl.next(DrillResultSetImpl.java:140)
        at sqlline.IncrementalRows.hasNext(IncrementalRows.java:62)
        at 
sqlline.TableOutputFormat$ResizingRowsProvider.next(TableOutputFormat.java:87)
        at sqlline.TableOutputFormat.print(TableOutputFormat.java:118)
        at sqlline.SqlLine.print(SqlLine.java:1593)
        at sqlline.Commands.execute(Commands.java:852)
        at sqlline.Commands.sql(Commands.java:751)
        at sqlline.SqlLine.dispatch(SqlLine.java:746)
        at sqlline.SqlLine.runCommands(SqlLine.java:1651)
        at sqlline.Commands.run(Commands.java:1304)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
        at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:498)
        at 
sqlline.ReflectiveCommandHandler.execute(ReflectiveCommandHandler.java:36)
        at sqlline.SqlLine.dispatch(SqlLine.java:742)
        at sqlline.SqlLine.initArgs(SqlLine.java:553)
        at sqlline.SqlLine.begin(SqlLine.java:596)
        at sqlline.SqlLine.start(SqlLine.java:375)
        at sqlline.SqlLine.main(SqlLine.java:268)
Caused by: org.apache.drill.common.exceptions.UserRemoteException: DATA_READ 
ERROR: Error parsing JSON - Unexpected character ('d' (code 100)): was 
expecting comma to separate OBJECT entries

File  /bucket=0/190273.json.gz
Record  3654965
Column  37
Fragment 1:0

[Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010]
        at 
org.apache.drill.exec.rpc.user.QueryResultHandler.resultArrived(QueryResultHandler.java:123)
        at org.apache.drill.exec.rpc.user.UserClient.handle(UserClient.java:368)
        at org.apache.drill.exec.rpc.user.UserClient.handle(UserClient.java:90)
        at 
org.apache.drill.exec.rpc.RpcBus$InboundHandler.decode(RpcBus.java:274)
        at 
org.apache.drill.exec.rpc.RpcBus$InboundHandler.decode(RpcBus.java:244)
        at 
io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:89)
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
        at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
        at 
io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:254)
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
        at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
        at 
io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
        at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
        at 
io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:242)
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
        at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
        at 
io.netty.channel.ChannelInboundHandlerAdapter.channelRead(ChannelInboundHandlerAdapter.java:86)
        at 
io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
        at 
io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
        at 
io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:847)
        at 
io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
        at 
io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
        at java.lang.Thread.run(Thread.java:745)
Closing: org.apache.drill.jdbc.impl.DrillConnectionImpl



> Single corrupt compressed json file (in s3) causes query failure
> ----------------------------------------------------------------
>
>                 Key: DRILL-5944
>                 URL: https://issues.apache.org/jira/browse/DRILL-5944
>             Project: Apache Drill
>          Issue Type: Bug
>          Components: Client - JDBC
>    Affects Versions: 1.11.0
>         Environment: 5 node x 8GB Drill cluster deployed on Kubernetes.
>            Reporter: Paul Makkar
>            Priority: Major
>
> I am running a CTAS query on an s3 bucket with 500 compressed json files - 
> output as parquet.
> Query ran from command line:
>  /opt/drill/apache-drill-1.11.0/bin/sqlline --verbose=true 
> --showWarnings=true --showNestedErrs=true --force=true --run=therm.sql -u 
> jdbc:drill:zk-k8s-drill:2181
> therm.sql:
> {code:sql}
>  use `s3`.`drill-output`; *(s3 points to kairos bucket)
>  alter session set `store.format`='parquet';
>  ALTER SESSION SET `store.json.all_text_mode` = true;
>  create table temps_bucket0 as select t.id, t.`value` as temp, 
> to_timestamp(cast(substr(t.`timestamp`,1,10) as int)) as ts, t.device_id from 
> `s3`.`bucket=0/` as t where cast(t.`timestamp` as int) > 1475280000 and 
> cast(t.`timestamp` as int) < 1491004799;
> {code}
> Drill ran for 17 min 50.246 sec and managed to write approx. 100M records 
> then failed with the following message (see below). I tried to download and 
> uncompress the file manually and it is corrupt. Ideally, Drill should log but 
> skip the corrupt file.
> {code:java}
> Error: DATA_READ ERROR: Error parsing JSON - Unexpected character ('d' (code 
> 100)): was expecting comma to separate OBJECT entries
> File /bucket=0/190273.json.gz
>  Record 3654965
>  Column 37
>  Fragment 1:0
> [Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010] 
> (state=,code=0)
>  java.sql.SQLException: DATA_READ ERROR: Error parsing JSON - Unexpected 
> character ('d' (code 100)): was expecting comma to separate OBJECT entries
> File /bucket=0/190273.json.gz
>  Record 3654965
>  Column 37
>  Fragment 1:0
> [Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010]
>  at 
> org.apache.drill.jdbc.impl.DrillCursor.nextRowInternally(DrillCursor.java:489)
>  at org.apache.drill.jdbc.impl.DrillCursor.next(DrillCursor.java:593)
>  at 
> org.apache.calcite.avatica.AvaticaResultSet.next(AvaticaResultSet.java:215)
>  at 
> org.apache.drill.jdbc.impl.DrillResultSetImpl.next(DrillResultSetImpl.java:140)
>  at sqlline.IncrementalRows.hasNext(IncrementalRows.java:62)
>  at 
> sqlline.TableOutputFormat$ResizingRowsProvider.next(TableOutputFormat.java:87)
>  at sqlline.TableOutputFormat.print(TableOutputFormat.java:118)
>  at sqlline.SqlLine.print(SqlLine.java:1593)
>  at sqlline.Commands.execute(Commands.java:852)
>  at sqlline.Commands.sql(Commands.java:751)
>  at sqlline.SqlLine.dispatch(SqlLine.java:746)
>  at sqlline.SqlLine.runCommands(SqlLine.java:1651)
>  at sqlline.Commands.run(Commands.java:1304)
>  at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
>  at 
> sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
>  at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
>  at java.lang.reflect.Method.invoke(Method.java:498)
>  at sqlline.ReflectiveCommandHandler.execute(ReflectiveCommandHandler.java:36)
>  at sqlline.SqlLine.dispatch(SqlLine.java:742)
>  at sqlline.SqlLine.initArgs(SqlLine.java:553)
>  at sqlline.SqlLine.begin(SqlLine.java:596)
>  at sqlline.SqlLine.start(SqlLine.java:375)
>  at sqlline.SqlLine.main(SqlLine.java:268)
>  Caused by: org.apache.drill.common.exceptions.UserRemoteException: DATA_READ 
> ERROR: Error parsing JSON - Unexpected character ('d' (code 100)): was 
> expecting comma to separate OBJECT entries
> File /bucket=0/190273.json.gz
>  Record 3654965
>  Column 37
>  Fragment 1:0
> [Error Id: 9458cb2c-d0a4-4b66-9b65-4e8015e2ca97 on 10.75.186.7 :31010]
>  at 
> org.apache.drill.exec.rpc.user.QueryResultHandler.resultArrived(QueryResultHandler.java:123)
>  at org.apache.drill.exec.rpc.user.UserClient.handle(UserClient.java:368)
>  at org.apache.drill.exec.rpc.user.UserClient.handle(UserClient.java:90)
>  at org.apache.drill.exec.rpc.RpcBus$InboundHandler.decode(RpcBus.java:274)
>  at org.apache.drill.exec.rpc.RpcBus$InboundHandler.decode(RpcBus.java:244)
>  at 
> io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:89)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
>  at 
> io.netty.handler.timeout.IdleStateHandler.channelRead(IdleStateHandler.java:254)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
>  at 
> io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:103)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
>  at 
> io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:242)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
>  at 
> io.netty.channel.ChannelInboundHandlerAdapter.channelRead(ChannelInboundHandlerAdapter.java:86)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:339)
>  at 
> io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:324)
>  at 
> io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:847)
>  at 
> io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:131)
>  at 
> io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511)
>  at 
> io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468)
>  at 
> io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382)
>  at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354)
>  at 
> io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
>  at java.lang.Thread.run(Thread.java:745)
>  Closing: org.apache.drill.jdbc.impl.DrillConnectionImpl
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to