ryanrussell opened a new issue, #8119: URL: https://github.com/apache/arrow-rs/issues/8119
# ARROW-010 & ARROW-011: Parquet Statistics and Metadata API Removals ## Problem Arrow 56.0 removed numerous Parquet statistics and metadata APIs that were deprecated in 54.0.0 and earlier. Our linter doesn't detect usage of these removed functions, causing compilation failures. ## API Changes Details **All removed in 56.0.0 per cargo-public-api diff:** ### Statistics API (ARROW-010) ```rust // Statistics methods: -pub fn parquet::file::statistics::Statistics::distinct_count(&self) -> core::option::Option<u64> -pub fn parquet::file::statistics::Statistics::has_min_max_set(&self) -> bool -pub fn parquet::file::statistics::Statistics::has_nulls(&self) -> bool -pub fn parquet::file::statistics::Statistics::max_bytes(&self) -> &[u8] -pub fn parquet::file::statistics::Statistics::min_bytes(&self) -> &[u8] -pub fn parquet::file::statistics::Statistics::null_count(&self) -> u64 // ValueStatistics methods: -pub fn parquet::file::statistics::ValueStatistics<T>::has_min_max_set(&self) -> bool -pub fn parquet::file::statistics::ValueStatistics<T>::max(&self) -> &T -pub fn parquet::file::statistics::ValueStatistics<T>::max_bytes(&self) -> &[u8] -pub fn parquet::file::statistics::ValueStatistics<T>::min(&self) -> &T -pub fn parquet::file::statistics::ValueStatistics<T>::min_bytes(&self) -> &[u8] -pub fn parquet::file::statistics::ValueStatistics<T>::null_count(&self) -> u64 ``` ### Metadata and Properties API (ARROW-011) ```rust // Schema conversion: -pub fn parquet::arrow::arrow_to_parquet_schema(schema: &arrow_schema::schema::Schema) -> parquet::errors::Result<parquet::schema::types::SchemaDescriptor> // Writer properties: -pub fn parquet::file::properties::WriterProperties::max_statistics_size(&self, col: &parquet::schema::types::ColumnPath) -> usize -pub fn parquet::file::properties::WriterPropertiesBuilder::set_column_max_statistics_size(self, col: parquet::schema::types::ColumnPath, value: usize) -> Self -pub fn 
parquet::file::properties::WriterPropertiesBuilder::set_max_statistics_size(self, value: usize) -> Self -pub const parquet::file::properties::DEFAULT_MAX_STATISTICS_SIZE: usize // Metadata constructors: -pub fn parquet::file::metadata::ParquetMetaData::new_with_page_index(file_metadata: parquet::file::metadata::FileMetaData, row_groups: alloc::vec::Vec<parquet::file::metadata::RowGroupMetaData>, column_index: core::option::Option<parquet::file::metadata::ParquetColumnIndex>, offset_index: core::option::Option<parquet::file::metadata::ParquetOffsetIndex>) -> Self -pub fn parquet::file::metadata::ColumnChunkMetaDataBuilder::set_file_offset(self, value: i64) -> Self // Page index functions: -pub fn parquet::file::page_index::index_reader::read_pages_locations<R: parquet::file::reader::ChunkReader>(reader: &R, chunks: &[parquet::file::metadata::ColumnChunkMetaData]) -> core::result::Result<alloc::vec::Vec<alloc::vec::Vec<parquet::format::PageLocation>>, parquet::errors::ParquetError> ``` ## Implementation Task Create `src/rules/arrow_010_parquet_statistics_removed.rs` with these specifications: ### Statistics Rule Implementation (ARROW-010) ```rust use regex::Regex; use std::path::Path; use crate::output::{Issue, Severity}; use crate::rules::Rule; /// ARROW-010: Detect removed Parquet Statistics API usage /// /// Arrow 56.0 removed deprecated Statistics and ValueStatistics methods /// that were deprecated in 54.0.0 and earlier versions. 
pub struct Arrow010Rule; impl Rule for Arrow010Rule { fn rule_id(&self) -> &'static str { "ARROW-010" } fn check_rust_source(&self, file_path: &Path, content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> { let mut issues = Vec::new(); let removed_stats_methods = vec![ (r"\.distinct_count\s*\(\s*\)", "Use statistics.distinct_count_opt() or handle the Option return type."), (r"\.has_min_max_set\s*\(\s*\)", "Check if statistics.min_opt() and statistics.max_opt() are Some instead."), (r"\.has_nulls\s*\(\s*\)", "Use statistics.null_count_opt().map(|c| c > 0).unwrap_or(false) instead."), (r"\.max_bytes\s*\(\s*\)", "Use statistics.max_bytes_opt().unwrap_or(&[]) for the same behavior."), (r"\.min_bytes\s*\(\s*\)", "Use statistics.min_bytes_opt().unwrap_or(&[]) for the same behavior."), (r"\.null_count\s*\(\s*\)", "Use statistics.null_count_opt().unwrap_or(0) instead."), ]; for (line_num, line) in content.lines().enumerate() { // Only check lines that might be using Parquet statistics if line.contains("Statistics") || line.contains("ValueStatistics") || line.contains("statistics") || line.contains("stats") { for (pattern, suggestion) in &removed_stats_methods { let regex = Regex::new(pattern)?; if let Some(mat) = regex.find(line) { issues.push(Issue { rule_id: self.rule_id().to_string(), severity: Severity::Error, message: format!("Parquet Statistics method removed in Arrow 56.0: {}", line[mat.start()+1..mat.end()].trim_end_matches(['(', ')', ' '])), file_path: file_path.to_path_buf(), line: line_num + 1, column: mat.start() + 1, suggestion: Some(suggestion.to_string()), auto_fixable: false, // Return types changed to Option deprecated_since: Some("54.0.0 (removed in 56.0.0)".to_string()), changelog_url: Some("https://github.com/apache/arrow-rs/blob/main/CHANGELOG.md#560".to_string()), migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-statistics-api".to_string()), }); break; } } } } Ok(issues) } fn 
check_cargo_toml(&self, _file_path: &Path, _content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> { Ok(Vec::new()) } } ``` ### Metadata Rule Implementation (ARROW-011) Create `src/rules/arrow_011_parquet_metadata_removed.rs`: ```rust use regex::Regex; use std::path::Path; use crate::output::{Issue, Severity}; use crate::rules::Rule; /// ARROW-011: Detect removed Parquet metadata and properties APIs /// /// Arrow 56.0 removed various deprecated metadata construction and /// writer property configuration methods. pub struct Arrow011Rule; impl Rule for Arrow011Rule { fn rule_id(&self) -> &'static str { "ARROW-011" } fn check_rust_source(&self, file_path: &Path, content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> { let mut issues = Vec::new(); let removed_functions = vec![ (r"arrow_to_parquet_schema\s*\(", "Use parquet::arrow::ArrowToParquetSchemaConverter instead."), (r"\.max_statistics_size\s*\(", "This method was removed. Statistics size is now automatically managed."), (r"\.set_column_max_statistics_size\s*\(", "Remove this call. Statistics size is now automatically managed per-column."), (r"\.set_max_statistics_size\s*\(", "Remove this call. Global statistics size limits are no longer configurable."), (r"DEFAULT_MAX_STATISTICS_SIZE", "This constant was removed. Statistics size is now automatically managed."), (r"ParquetMetaData::new_with_page_index\s*\(", "Use ParquetMetaData::new() and add page index separately."), (r"\.set_file_offset\s*\(", "This method was removed from ColumnChunkMetaDataBuilder."), (r"read_pages_locations\s*\(", "This function was removed. 
Use the page index APIs directly."), ]; for (line_num, line) in content.lines().enumerate() { for (pattern, suggestion) in &removed_functions { let regex = Regex::new(pattern)?; if let Some(mat) = regex.find(line) { issues.push(Issue { rule_id: self.rule_id().to_string(), severity: Severity::Error, message: format!("Parquet metadata/properties API removed in Arrow 56.0: {}", line[mat.start()..mat.end()].trim_end_matches(['(', ' '])), file_path: file_path.to_path_buf(), line: line_num + 1, column: mat.start() + 1, suggestion: Some(suggestion.to_string()), auto_fixable: false, deprecated_since: Some("54.0.0 (removed in 56.0.0)".to_string()), changelog_url: Some("https://github.com/apache/arrow-rs/pull/7811".to_string()), migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-metadata-api".to_string()), }); break; } } } Ok(issues) } fn check_cargo_toml(&self, _file_path: &Path, _content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> { Ok(Vec::new()) } } ``` ### Tests for Both Rules ```rust // In arrow_010 tests: #[test] fn test_detects_statistics_methods() { let rule = Arrow010Rule; let content = r#" fn analyze_stats(stats: &Statistics) { let count = stats.null_count(); let has_values = stats.has_min_max_set(); let min = stats.min_bytes(); } "#; let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap(); assert_eq!(issues.len(), 3); assert!(issues.iter().all(|i| matches!(i.severity, Severity::Error))); } // In arrow_011 tests: #[test] fn test_detects_metadata_functions() { let rule = Arrow011Rule; let content = r#" let schema_desc = arrow_to_parquet_schema(&arrow_schema)?; let props = WriterPropertiesBuilder::new() .set_max_statistics_size(1024) .build()?; "#; let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap(); assert_eq!(issues.len(), 2); } ``` ### Integration Steps 1. 
Add both to `src/rules/mod.rs`: ```rust pub mod arrow_010_parquet_statistics_removed; pub mod arrow_011_parquet_metadata_removed; // In RuleEngine::new(): rules.push(Box::new(arrow_010_parquet_statistics_removed::Arrow010Rule)); rules.push(Box::new(arrow_011_parquet_metadata_removed::Arrow011Rule)); ``` ## Acceptance Criteria - ✅ Detects all removed Statistics methods (ARROW-010) - ✅ Detects all removed metadata/properties APIs (ARROW-011) - ✅ Provides specific migration guidance with Option types - ✅ Error severity for compilation failures - ✅ Non-auto-fixable due to API changes - ✅ Comprehensive test coverage ## Migration Notes - **Statistics**: All methods now return Option types for safety - **Properties**: Statistics sizing is now automatic - **Metadata**: Use modern constructors and builders - **Timeline**: Deprecated ≤54.0.0 → Removed 56.0.0 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org