[ 
https://issues.apache.org/jira/browse/ARROW-6901?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Matthew Franglen updated ARROW-6901:
------------------------------------
    Description: 
The SerializedFileWriter does not update total_num_rows at any point. This 
results in consistently writing zero as the number of rows in the file.

 

This code will fail:
{code:java}
let data = vec![vec![1, 2, 3, 4, 5]];
let file = ...; // a file path here

let schema = Rc::new(
    types::Type::group_type_builder("schema")
        .with_fields(&mut vec![Rc::new(
            types::Type::primitive_type_builder("col1", Type::INT32)
                .with_repetition(Repetition::REQUIRED)
                .build()
                .unwrap(),
        )])
        .build()
        .unwrap(),
);
let props = Rc::new(WriterProperties::builder().build());
let mut file_writer =
    SerializedFileWriter::new(file.try_clone().unwrap(), schema, 
props).unwrap();
let mut rows: i64 = 0;
for subset in &data {
    let mut row_group_writer = file_writer.next_row_group().unwrap();
    let col_writer = row_group_writer.next_column().unwrap();
    if let Some(mut writer) = col_writer {
        match writer {
            ColumnWriter::Int32ColumnWriter(ref mut typed) => {
                rows += typed.write_batch(&subset[..], None, None).unwrap() as 
i64;
            }
            _ => {
                unimplemented!();
            }
        }
        row_group_writer.close_column(writer).unwrap();
    }
    file_writer.close_row_group(row_group_writer).unwrap();
}
file_writer.close().unwrap();
let reader = SerializedFileReader::new(file).unwrap();
assert_eq!(reader.num_row_groups(), data.len());
assert_eq!(reader.metadata().file_metadata().num_rows(), rows, "row count in metadata not equal to number of rows written");
{code}

  was:
The SerializedFileWriter does not update total_num_rows at any point. This 
results in consistently writing zero as the number of rows in the file.

 

This code will fail:
{code:java}
let data = vec![vec![1, 2, 3, 4, 5]];
let file = ...; // a file path herelet schema = Rc::new(
    types::Type::group_type_builder("schema")
        .with_fields(&mut vec![Rc::new(
            types::Type::primitive_type_builder("col1", Type::INT32)
                .with_repetition(Repetition::REQUIRED)
                .build()
                .unwrap(),
        )])
        .build()
        .unwrap(),
);
let props = Rc::new(WriterProperties::builder().build());
let mut file_writer =
    SerializedFileWriter::new(file.try_clone().unwrap(), schema, 
props).unwrap();
let mut rows: i64 = 0;for subset in &data {
    let mut row_group_writer = file_writer.next_row_group().unwrap();
    let col_writer = row_group_writer.next_column().unwrap();
    if let Some(mut writer) = col_writer {
        match writer {
            ColumnWriter::Int32ColumnWriter(ref mut typed) => {
                rows += typed.write_batch(&subset[..], None, None).unwrap() as 
i64;
            }
            _ => {
                unimplemented!();
            }
        }
        row_group_writer.close_column(writer).unwrap();
    }
    file_writer.close_row_group(row_group_writer).unwrap();
}file_writer.close().unwrap();let reader = 
SerializedFileReader::new(file).unwrap();
assert_eq!(reader.num_row_groups(), data.len());
assert_eq!(reader.metadata().file_metadata().num_rows(), rows, "row count in 
metadata not equal to number of rows written");
{code}


> [Rust][Parquet] Rust Parquet SerializedFileWriter writes total_num_rows as 
> zero
> -------------------------------------------------------------------------------
>
>                 Key: ARROW-6901
>                 URL: https://issues.apache.org/jira/browse/ARROW-6901
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Rust
>    Affects Versions: 0.14.1, 0.15.0
>            Reporter: Matthew Franglen
>            Priority: Minor
>
> The SerializedFileWriter does not update total_num_rows at any point. This 
> results in consistently writing zero as the number of rows in the file.
>  
> This code will fail:
> {code:java}
> let data = vec![vec![1, 2, 3, 4, 5]];
> let file = ...; // a file path here
> let schema = Rc::new(
>     types::Type::group_type_builder("schema")
>         .with_fields(&mut vec![Rc::new(
>             types::Type::primitive_type_builder("col1", Type::INT32)
>                 .with_repetition(Repetition::REQUIRED)
>                 .build()
>                 .unwrap(),
>         )])
>         .build()
>         .unwrap(),
> );
> let props = Rc::new(WriterProperties::builder().build());
> let mut file_writer =
>     SerializedFileWriter::new(file.try_clone().unwrap(), schema, 
> props).unwrap();
> let mut rows: i64 = 0;
> for subset in &data {
>     let mut row_group_writer = file_writer.next_row_group().unwrap();
>     let col_writer = row_group_writer.next_column().unwrap();
>     if let Some(mut writer) = col_writer {
>         match writer {
>             ColumnWriter::Int32ColumnWriter(ref mut typed) => {
>                 rows += typed.write_batch(&subset[..], None, None).unwrap() 
> as i64;
>             }
>             _ => {
>                 unimplemented!();
>             }
>         }
>         row_group_writer.close_column(writer).unwrap();
>     }
>     file_writer.close_row_group(row_group_writer).unwrap();
> }
> file_writer.close().unwrap();
> let reader = SerializedFileReader::new(file).unwrap();
> assert_eq!(reader.num_row_groups(), data.len());
> assert_eq!(reader.metadata().file_metadata().num_rows(), rows, "row count in metadata not equal to number of rows written");
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to