jonded94 commented on issue #9370:
URL: https://github.com/apache/arrow-rs/issues/9370#issuecomment-3889847488
@alamb thanks for looking into this issue. I'm terribly sorry, I think whatever I vibe-coded there *again* could be at least slightly wrong. I'm still investigating right now, but let me share what I found so far:
I used this script to dump as much information as possible about data pages:
```
//! Dumps detailed page-level information for a specific row group of a parquet file.
//!
//! Usage:
//!   cargo run --example dump_pages -- <parquet-file> [row-group-index]
//!
//! If row-group-index is omitted, defaults to 0.

use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::{ReadOptionsBuilder, SerializedFileReader};
use std::env;
use std::fs::File;

fn main() {
    let args: Vec<String> = env::args().collect();
    if args.len() < 2 {
        eprintln!("Usage: {} <parquet-file> [row-group-index]", args[0]);
        std::process::exit(1);
    }
    let path = &args[1];
    let rg_idx: usize = args.get(2).map(|s| s.parse().unwrap()).unwrap_or(0);

    let file = File::open(path).expect("Failed to open file");
    let options = ReadOptionsBuilder::new().with_page_index().build();
    let reader = SerializedFileReader::new_with_options(file, options)
        .expect("Failed to read parquet file");

    let metadata = reader.metadata();
    let file_meta = metadata.file_metadata();
    let num_row_groups = metadata.num_row_groups();

    println!("=== File: {} ===", path);
    println!("  Version: {}", file_meta.version());
    println!("  Created by: {}", file_meta.created_by().unwrap_or("unknown"));
    println!("  Num row groups: {}", num_row_groups);
    println!("  Total rows: {}", file_meta.num_rows());
    println!();

    if rg_idx >= num_row_groups {
        eprintln!("Row group index {} out of range (0..{})", rg_idx, num_row_groups);
        std::process::exit(1);
    }

    let rg_meta = metadata.row_group(rg_idx);
    let num_columns = rg_meta.num_columns();

    println!("=== Row Group {} ===", rg_idx);
    println!("  Num rows: {}", rg_meta.num_rows());
    println!("  Total byte size: {}", rg_meta.total_byte_size());
    println!("  Num columns: {}", num_columns);
    if let Some(ordinal) = rg_meta.ordinal() {
        println!("  Ordinal: {}", ordinal);
    }
    println!();

    // Get offset and column indexes if available
    let offset_index = metadata.offset_index();
    let column_index = metadata.column_index();

    let schema = rg_meta.schema_descr();
    let row_group_reader = reader.get_row_group(rg_idx).expect("Failed to get row group");

    for col_idx in 0..num_columns {
        let col_meta = rg_meta.column(col_idx);
        let col_desc = schema.column(col_idx);

        println!("--- Column {} ---", col_idx);
        println!("  Path: {}", col_desc.path());
        println!("  Physical type: {:?}", col_desc.physical_type());
        if let Some(lt) = col_desc.logical_type_ref() {
            println!("  Logical type: {:?}", lt);
        }
        let ct = col_desc.converted_type();
        if ct != parquet::basic::ConvertedType::NONE {
            println!("  Converted type: {:?}", ct);
        }
        println!("  Max def level: {}", col_desc.max_def_level());
        println!("  Max rep level: {}", col_desc.max_rep_level());
        println!("  Compression: {:?}", col_meta.compression());
        println!("  Encodings: {:?}", col_meta.encodings().collect::<Vec<_>>());
        println!("  Num values (chunk): {}", col_meta.num_values());
        println!("  Compressed size: {} bytes", col_meta.compressed_size());
        println!("  Uncompressed size: {} bytes", col_meta.uncompressed_size());
        println!("  Data page offset: {}", col_meta.data_page_offset());
        if let Some(dict_offset) = col_meta.dictionary_page_offset() {
            println!("  Dictionary page offset: {}", dict_offset);
        }
        if let Some(stats) = col_meta.statistics() {
            println!("  Column chunk statistics: {}", stats);
        }
        println!();

        // Print offset index info for this column
        if let Some(oi) = offset_index {
            let col_oi = &oi[rg_idx][col_idx];
            println!("  Offset Index ({} pages):", col_oi.page_locations().len());
            for (pg_idx, loc) in col_oi.page_locations().iter().enumerate() {
                println!(
                    "    Page {:3}: offset={:>12}, compressed_size={:>8}, first_row_index={:>6}",
                    pg_idx, loc.offset, loc.compressed_page_size, loc.first_row_index
                );
            }
            println!();
        }

        // Print column index info (page-level null stats)
        if let Some(ci) = column_index {
            let col_ci = &ci[rg_idx][col_idx];
            println!("  Column Index ({} pages):", col_ci.num_pages());
            for pg_idx in 0..col_ci.num_pages() as usize {
                let null_page = col_ci.is_null_page(pg_idx);
                let null_count = col_ci.null_count(pg_idx);
                println!(
                    "    Page {:3}: null_page={}, null_count={:?}",
                    pg_idx, null_page, null_count
                );
            }
            println!();
        }

        // Now iterate through actual pages
        let mut page_reader = row_group_reader
            .get_column_page_reader(col_idx)
            .expect("Failed to get page reader");

        println!("  Data Pages:");
        let mut page_num = 0;
        loop {
            match page_reader.get_next_page() {
                Ok(Some(page)) => {
                    let buf_len = page.buffer().len();
                    match &page {
                        parquet::column::page::Page::DataPage {
                            num_values,
                            encoding,
                            def_level_encoding,
                            rep_level_encoding,
                            statistics,
                            ..
                        } => {
                            println!("    Page {} (DataPage v1):", page_num);
                            println!("      num_values: {}", num_values);
                            println!("      encoding: {:?}", encoding);
                            println!("      def_level_encoding: {:?}", def_level_encoding);
                            println!("      rep_level_encoding: {:?}", rep_level_encoding);
                            println!("      buffer size: {} bytes", buf_len);
                            if let Some(stats) = statistics {
                                println!("      statistics: {}", stats);
                            }
                        }
                        parquet::column::page::Page::DataPageV2 {
                            num_values,
                            encoding,
                            num_nulls,
                            num_rows,
                            def_levels_byte_len,
                            rep_levels_byte_len,
                            is_compressed,
                            statistics,
                            ..
                        } => {
                            println!("    Page {} (DataPage v2):", page_num);
                            println!("      num_values: {}", num_values);
                            println!("      num_rows: {}", num_rows);
                            println!("      num_nulls: {}", num_nulls);
                            println!("      encoding: {:?}", encoding);
                            println!("      def_levels_byte_len: {}", def_levels_byte_len);
                            println!("      rep_levels_byte_len: {}", rep_levels_byte_len);
                            println!("      is_compressed: {}", is_compressed);
                            println!("      buffer size: {} bytes", buf_len);
                            if let Some(stats) = statistics {
                                println!("      statistics: {}", stats);
                            }
                        }
                        parquet::column::page::Page::DictionaryPage {
                            num_values,
                            encoding,
                            is_sorted,
                            ..
                        } => {
                            println!("    Page {} (Dictionary):", page_num);
                            println!("      num_values: {}", num_values);
                            println!("      encoding: {:?}", encoding);
                            println!("      is_sorted: {}", is_sorted);
                            println!("      buffer size: {} bytes", buf_len);
                        }
                    }
                    page_num += 1;
                }
                Ok(None) => break,
                Err(e) => {
                    eprintln!("    Error reading page: {}", e);
                    break;
                }
            }
        }
        println!();
    }
}
```
As noted in my original message, the error occurs when decoding rows of row
group 99, so here is a full dump of everything in that row group:
[row_group_99_data_page_dump.txt](https://github.com/user-attachments/files/25257065/row_group_99_data_page_dump.txt)
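For reference, that dump was produced with an invocation along these lines (the file name here is a placeholder for my actual test file):
```
cargo run --example dump_pages -- <parquet-file> 99 > row_group_99_data_page_dump.txt
```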
There are a lot of v2 data pages and dictionary pages, but *no* v1 data pages:
```
$ grep "DataPage v2" row_group_99_data_page_dump.txt | wc -l
51
$ grep "DataPage v1" row_group_99_data_page_dump.txt | wc -l
0
$ grep "Dictionary" row_group_99_data_page_dump.txt | wc -l
44
```
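For completeness, the same tally can be computed programmatically with the page reader instead of grepping the text dump. A minimal sketch, reusing the same APIs as the script above (the file path and the hard-coded row-group index 99 are placeholders for my actual setup):
```
use parquet::column::page::Page;
use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::SerializedFileReader;
use std::fs::File;

fn main() {
    // Placeholders: substitute the real file and the row group under suspicion.
    let file = File::open("input.parquet").expect("Failed to open file");
    let reader = SerializedFileReader::new(file).expect("Failed to read parquet file");
    let rg = reader.get_row_group(99).expect("Failed to get row group");
    let num_columns = reader.metadata().row_group(99).num_columns();

    // Count pages by variant across all columns of the row group.
    let (mut v1, mut v2, mut dict) = (0usize, 0usize, 0usize);
    for col_idx in 0..num_columns {
        let mut pages = rg
            .get_column_page_reader(col_idx)
            .expect("Failed to get page reader");
        // Read errors simply end this column's loop; the dump script above reports them.
        while let Ok(Some(page)) = pages.get_next_page() {
            match page {
                Page::DataPage { .. } => v1 += 1,
                Page::DataPageV2 { .. } => v2 += 1,
                Page::DictionaryPage { .. } => dict += 1,
            }
        }
    }
    println!("v1 data pages: {v1}, v2 data pages: {v2}, dictionary pages: {dict}");
}
```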
So at least the *test* contained in my PR https://github.com/apache/arrow-rs/pull/9374 has to be slightly wrong.