[ 
https://issues.apache.org/jira/browse/ARROW-7740?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17028181#comment-17028181
 ] 

Neal Richardson commented on ARROW-7740:
----------------------------------------

Thanks for the report. I think there are multiple issues here, actually. I made 
a more minimal reproducer by taking the JSON file that your example creates and 
trimming it down.

{code}
f2 <- tempfile()
f3 <- tempfile()
cat('{"nest_level1":[{"num":1,"ftr1":0.23342},{"num":2,"ftr1":0.31621}]}
{"nest_level1":[{"num":1,"ftr1":0.31869},{"num":2,"ftr1":0.45382}]}
', file=f2)
cat('{"nest_level1":[{"num":1,"ftr1":0.23342},{"num":2,"ftr1":0.31621}]}
{"nest_level1":[{"num":1,"ftr1":0.31869},{"num":2,"ftr1":0.45382}]}
{"nest_level1":[{"num":1,"ftr1":0.05254},{"num":2,"ftr1":0.482}]}
', file=f3)

arrow::read_json_arrow(f2)
arrow::read_json_arrow(f3)
{code}

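Interestingly, the one with 2 rows doesn't crash, but the data it reads into R 
is wrong. The integer `num` values 1 and 2 come back as 4.940656e-324 and 
9.881313e-324 (the int64 bit patterns reinterpreted as doubles), and the struct 
fields are flattened into a single vector per row: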

{code}
> arrow::read_json_arrow(f2)
                                               nest_level1
1 4.940656e-324, 9.881313e-324, 2.334200e-01, 3.162100e-01
2 4.940656e-324, 9.881313e-324, 3.186900e-01, 4.538200e-01
{code}

With 3 lines, it aborts. Here's a backtrace:

{code}
> arrow::read_json_arrow(f3)
/Users/enpiar/Documents/ursa/arrow/cpp/src/arrow/array.cc:97:  Check failed: (off) <= (length) Slice offset greater than array length
0   libarrow.100.dylib                  0x0000000113ac0354 
_ZN5arrow4util7CerrLog14PrintBackTraceEv + 52
1   libarrow.100.dylib                  0x0000000113ac0274 
_ZN5arrow4util7CerrLogD2Ev + 100
2   libarrow.100.dylib                  0x0000000113ac01d5 
_ZN5arrow4util7CerrLogD1Ev + 21
3   libarrow.100.dylib                  0x0000000113ac01f9 
_ZN5arrow4util7CerrLogD0Ev + 25
4   libarrow.100.dylib                  0x0000000113ac00e0 
_ZN5arrow4util8ArrowLogD2Ev + 80
5   libarrow.100.dylib                  0x0000000113ac0115 
_ZN5arrow4util8ArrowLogD1Ev + 21
6   libarrow.100.dylib                  0x000000011342535e 
_ZNK5arrow9ArrayData5SliceExx + 238
7   libarrow.100.dylib                  0x0000000113432314 
_ZNK5arrow11StructArray5fieldEi + 388
8   arrow.so                            0x000000010d55e383 
_ZN5arrow1r16Converter_StructC2ERKNSt3__16vectorINS2_10shared_ptrINS_5ArrayEEENS2_9allocatorIS6_EEEE
 + 163
9   arrow.so                            0x000000010d551f04 
_ZN5arrow1r9Converter4MakeERKNSt3__16vectorINS2_10shared_ptrINS_5ArrayEEENS2_9allocatorIS6_EEEE
 + 1012
10  arrow.so                            0x000000010d55199d 
_ZN5arrow1r22ArrayVector__as_vectorElRKNSt3__16vectorINS1_10shared_ptrINS_5ArrayEEENS1_9allocatorIS5_EEEE
 + 29
11  arrow.so                            0x000000010d5530b9 
_Z16Array__as_vectorRKNSt3__110shared_ptrIN5arrow5ArrayEEE + 153
12  arrow.so                            0x000000010d55f6bf 
_ZZNK5arrow1r14Converter_List17Ingest_some_nullsEP7SEXPRECRKNSt3__110shared_ptrINS_5ArrayEEEllENKUllE_clEl
 + 95
13  arrow.so                            0x000000010d55f52b 
_ZNK5arrow1r14Converter_List17Ingest_some_nullsEP7SEXPRECRKNSt3__110shared_ptrINS_5ArrayEEEll
 + 251
14  arrow.so                            0x000000010d55401b 
_ZNSt3__110__function6__funcIZN5arrow1r9Converter14IngestParallelEP7SEXPRECRKNS_10shared_ptrINS2_8internal9TaskGroupEEEEUlvE_NS_9allocatorISD_EEFNS2_6StatusEvEEclEv
 + 107
15  libarrow.100.dylib                  0x0000000113ada414 
_ZNKSt3__110__function12__value_funcIFN5arrow6StatusEvEEclEv + 68
16  libarrow.100.dylib                  0x0000000113ad9d0c 
_ZNKSt3__18functionIFN5arrow6StatusEvEEclEv + 28
17  libarrow.100.dylib                  0x0000000113add8e2 
_ZZN5arrow8internal17ThreadedTaskGroup10AppendRealENSt3__18functionIFNS_6StatusEvEEEENKUlvE_clEv
 + 82
18  libarrow.100.dylib                  0x0000000113add86d 
_ZNSt3__1L8__invokeIRZN5arrow8internal17ThreadedTaskGroup10AppendRealENS_8functionIFNS1_6StatusEvEEEEUlvE_JEEEDTclclsr3std3__1E7forwardIT_Efp_Espclsr3std3__1E7forwardIT0_Efp0_EEEOSA_DpOSB_
 + 29
19  libarrow.100.dylib                  0x0000000113add81d 
_ZNSt3__128__invoke_void_return_wrapperIvE6__callIJRZN5arrow8internal17ThreadedTaskGroup10AppendRealENS_8functionIFNS3_6StatusEvEEEEUlvE_EEEvDpOT_
 + 29
20  libarrow.100.dylib                  0x0000000113add7ed 
_ZNSt3__110__function12__alloc_funcIZN5arrow8internal17ThreadedTaskGroup10AppendRealENS_8functionIFNS2_6StatusEvEEEEUlvE_NS_9allocatorIS9_EEFvvEEclEv
 + 29
21  libarrow.100.dylib                  0x0000000113adc4f9 
_ZNSt3__110__function6__funcIZN5arrow8internal17ThreadedTaskGroup10AppendRealENS_8functionIFNS2_6StatusEvEEEEUlvE_NS_9allocatorIS9_EEFvvEEclEv
 + 25
22  libarrow.100.dylib                  0x0000000113ae7b75 
_ZNKSt3__110__function12__value_funcIFvvEEclEv + 53
23  libarrow.100.dylib                  0x0000000113ae7725 
_ZNKSt3__18functionIFvvEEclEv + 21
24  libarrow.100.dylib                  0x0000000113ae70ea 
_ZN5arrow8internalL10WorkerLoopENSt3__110shared_ptrINS0_10ThreadPool5StateEEENS1_15__list_iteratorINS1_6threadEPvEE
 + 618
25  libarrow.100.dylib                  0x0000000113ae6e41 
_ZZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiENK3$_1clEv + 65
26  libarrow.100.dylib                  0x0000000113ae6dbd 
_ZNSt3__1L8__invokeIZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiE3$_1JEEEDTclclsr3std3__1E7forwardIT_Efp_Espclsr3std3__1E7forwardIT0_Efp0_EEEOS5_DpOS6_
 + 29
27  libarrow.100.dylib                  0x0000000113ae6d25 
_ZNSt3__1L16__thread_executeINS_10unique_ptrINS_15__thread_structENS_14default_deleteIS2_EEEEZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiE3$_1JEJEEEvRNS_5tupleIJT_T0_DpT1_EEENS_15__tuple_indicesIJXspT2_EEEE
 + 37
28  libarrow.100.dylib                  0x0000000113ae6506 
_ZNSt3__114__thread_proxyINS_5tupleIJNS_10unique_ptrINS_15__thread_structENS_14default_deleteIS3_EEEEZN5arrow8internal10ThreadPool21LaunchWorkersUnlockedEiE3$_1EEEEEPvSC_
 + 118
29  libsystem_pthread.dylib             0x00007fff7a2f82eb _pthread_body + 126
30  libsystem_pthread.dylib             0x00007fff7a2fb249 _pthread_start + 66
31  libsystem_pthread.dylib             0x00007fff7a2f740d thread_start + 13
Abort trap: 6
{code}



> R arrow::read_json_arrow aborts session with nested ndjson and default 
> as_data_frame=TRUE
> -----------------------------------------------------------------------------------------
>
>                 Key: ARROW-7740
>                 URL: https://issues.apache.org/jira/browse/ARROW-7740
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++, R
>            Reporter: John Sheffield
>            Priority: Minor
>             Fix For: 1.0.0
>
>
> Reading a nested ndjson file using arrow::read_json_arrow with the default 
> `as_data_frame=TRUE` causes an immediate session crash, but switching to 
> `as_data_frame=FALSE` works fine and the resulting arrow object schema is 
> correct.
> {code:java}
> library(tidyr)
> library(arrow)
> library(jsonlite)
> # Create two test datasets: long_df and a variant that nests long_df into
> # a dataframe with a list-column 'nest_level1' containing a dataframe
> long_df <- tidyr::expand_grid(ABC = LETTERS[1:3], xyz = letters[24:26], num = 
> 1:3)
> long_df[["ftr1"]] <- runif(nrow(long_df))
> long_df[["ftr2"]] <- rpois(nrow(long_df), 100)
> nested_frame_level1 <- tidyr::nest(long_df, nest_level1 = c(num, ftr1, ftr2))
> # Write and validate nested ndjson
> jsonlite::stream_out(nested_frame_level1, con = 
> file("nested_frame_level1.json"))
> readLines("nested_frame_level1.json", n = 2) # check we have valid ndjson here
> # This does not cause a session crash
> nested_arrow <- arrow::read_json_arrow(file = "nested_frame_level1.json", 
> as_data_frame = FALSE)
> nested_arrow$schema # correctly interprets `nest_level1` as
> # `list<item: struct<num: int64, ftr1: double, ftr2: int64>>`
> # This causes a session crash
> nested_df <- arrow::read_json_arrow(file = "nested_frame_level1.json", 
> as_data_frame = TRUE)
>  
> {code}
> The R package version of Arrow is the latest CRAN release (arrow * 0.15.1.1, 
> 2019-11-05, CRAN (R 3.5.2)). I'm running this code in a slightly older R 
> version (3.5.1), macOS 10.14.6, x86_64, darwin15.6.0, via RStudio 1.2.5001. 
> [edit: formatting fix]



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to