ryanrussell opened a new issue, #8119:
URL: https://github.com/apache/arrow-rs/issues/8119

   # ARROW-010 & ARROW-011: Parquet Statistics and Metadata API Removals
   
   ## Problem
   Arrow 56.0 removed numerous Parquet statistics and metadata APIs that had 
been deprecated since 54.0.0 or earlier. Our linter does not yet detect usage 
of these removed functions, so affected code fails to compile after upgrading 
without any prior warning from the linter.
   
   ## API Changes Details
   **All removed in 56.0.0 per cargo-public-api diff:**
   
   ### Statistics API (ARROW-010)
   ```rust
   // Statistics methods:
   -pub fn parquet::file::statistics::Statistics::distinct_count(&self) -> 
core::option::Option<u64>
   -pub fn parquet::file::statistics::Statistics::has_min_max_set(&self) -> bool
   -pub fn parquet::file::statistics::Statistics::has_nulls(&self) -> bool
   -pub fn parquet::file::statistics::Statistics::max_bytes(&self) -> &[u8]
   -pub fn parquet::file::statistics::Statistics::min_bytes(&self) -> &[u8]
   -pub fn parquet::file::statistics::Statistics::null_count(&self) -> u64
   
   // ValueStatistics methods:
   -pub fn 
parquet::file::statistics::ValueStatistics<T>::has_min_max_set(&self) -> bool
   -pub fn parquet::file::statistics::ValueStatistics<T>::max(&self) -> &T
   -pub fn parquet::file::statistics::ValueStatistics<T>::max_bytes(&self) -> 
&[u8]
   -pub fn parquet::file::statistics::ValueStatistics<T>::min(&self) -> &T
   -pub fn parquet::file::statistics::ValueStatistics<T>::min_bytes(&self) -> 
&[u8]
   -pub fn parquet::file::statistics::ValueStatistics<T>::null_count(&self) -> 
u64
   ```
   
   ### Metadata and Properties API (ARROW-011)
   ```rust
   // Schema conversion:
   -pub fn parquet::arrow::arrow_to_parquet_schema(schema: 
&arrow_schema::schema::Schema) -> 
parquet::errors::Result<parquet::schema::types::SchemaDescriptor>
   
   // Writer properties:
   -pub fn 
parquet::file::properties::WriterProperties::max_statistics_size(&self, col: 
&parquet::schema::types::ColumnPath) -> usize
   -pub fn 
parquet::file::properties::WriterPropertiesBuilder::set_column_max_statistics_size(self,
 col: parquet::schema::types::ColumnPath, value: usize) -> Self
   -pub fn 
parquet::file::properties::WriterPropertiesBuilder::set_max_statistics_size(self,
 value: usize) -> Self
   -pub const parquet::file::properties::DEFAULT_MAX_STATISTICS_SIZE: usize
   
   // Metadata constructors:
   -pub fn 
parquet::file::metadata::ParquetMetaData::new_with_page_index(file_metadata: 
parquet::file::metadata::FileMetaData, row_groups: 
alloc::vec::Vec<parquet::file::metadata::RowGroupMetaData>, column_index: 
core::option::Option<parquet::file::metadata::ParquetColumnIndex>, 
offset_index: 
core::option::Option<parquet::file::metadata::ParquetOffsetIndex>) -> Self
   -pub fn 
parquet::file::metadata::ColumnChunkMetaDataBuilder::set_file_offset(self, 
value: i64) -> Self
   
   // Page index functions:
   -pub fn parquet::file::page_index::index_reader::read_pages_locations<R: 
parquet::file::reader::ChunkReader>(reader: &R, chunks: 
&[parquet::file::metadata::ColumnChunkMetaData]) -> 
core::result::Result<alloc::vec::Vec<alloc::vec::Vec<parquet::format::PageLocation>>,
 parquet::errors::ParquetError>
   ```
   
   ## Implementation Task
   Create `src/rules/arrow_010_parquet_statistics_removed.rs` with these 
specifications:
   
   ### Statistics Rule Implementation (ARROW-010)
   ```rust
   use regex::Regex;
   use std::path::Path;
   use crate::output::{Issue, Severity};
   use crate::rules::Rule;
   
   /// ARROW-010: Detect removed Parquet Statistics API usage
   /// 
   /// Arrow 56.0 removed deprecated Statistics and ValueStatistics methods
   /// that were deprecated in 54.0.0 and earlier versions.
   pub struct Arrow010Rule;
   
   impl Rule for Arrow010Rule {
       fn rule_id(&self) -> &'static str {
           "ARROW-010"
       }
   
       fn check_rust_source(&self, file_path: &Path, content: &str) -> 
Result<Vec<Issue>, Box<dyn std::error::Error>> {
           let mut issues = Vec::new();
           
           let removed_stats_methods = vec\![
               (r"\.distinct_count\s*\(\s*\)", "Use 
statistics.distinct_count_opt() or handle the Option return type."),
               (r"\.has_min_max_set\s*\(\s*\)", "Check if statistics.min_opt() 
and statistics.max_opt() are Some instead."),
               (r"\.has_nulls\s*\(\s*\)", "Use 
statistics.null_count_opt().map(|c| c > 0).unwrap_or(false) instead."),
               (r"\.max_bytes\s*\(\s*\)", "Use 
statistics.max_bytes_opt().unwrap_or(&[]) for the same behavior."),
               (r"\.min_bytes\s*\(\s*\)", "Use 
statistics.min_bytes_opt().unwrap_or(&[]) for the same behavior."),
               (r"\.null_count\s*\(\s*\)", "Use 
statistics.null_count_opt().unwrap_or(0) instead."),
           ];
           
           for (line_num, line) in content.lines().enumerate() {
               // Only check lines that might be using Parquet statistics
               if line.contains("Statistics") || 
line.contains("ValueStatistics") || 
                  line.contains("statistics") || line.contains("stats") {
                   
                   for (pattern, suggestion) in &removed_stats_methods {
                       let regex = Regex::new(pattern)?;
                       if let Some(mat) = regex.find(line) {
                           issues.push(Issue {
                               rule_id: self.rule_id().to_string(),
                               severity: Severity::Error,
                               message: format\!("Parquet Statistics method 
removed in Arrow 56.0: {}", 
                                   
line[mat.start()+1..mat.end()].trim_end_matches(['(', ')', ' '])),
                               file_path: file_path.to_path_buf(),
                               line: line_num + 1,
                               column: mat.start() + 1,
                               suggestion: Some(suggestion.to_string()),
                               auto_fixable: false, // Return types changed to 
Option
                               deprecated_since: Some("54.0.0 (removed in 
56.0.0)".to_string()),
                               changelog_url: 
Some("https://github.com/apache/arrow-rs/blob/main/CHANGELOG.md#560".to_string()),
                               migration_guide_url: 
Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-statistics-api".to_string()),
                           });
                           break;
                       }
                   }
               }
           }
           
           Ok(issues)
       }
   
       fn check_cargo_toml(&self, _file_path: &Path, _content: &str) -> 
Result<Vec<Issue>, Box<dyn std::error::Error>> {
           Ok(Vec::new())
       }
   }
   ```
   
   ### Metadata Rule Implementation (ARROW-011)
   Create `src/rules/arrow_011_parquet_metadata_removed.rs`:
   
   ```rust
   use regex::Regex;
   use std::path::Path;
   use crate::output::{Issue, Severity};
   use crate::rules::Rule;
   
   /// ARROW-011: Detect removed Parquet metadata and properties APIs
   /// 
   /// Arrow 56.0 removed various deprecated metadata construction and
   /// writer property configuration methods.
   pub struct Arrow011Rule;
   
   impl Rule for Arrow011Rule {
       fn rule_id(&self) -> &'static str {
           "ARROW-011"
       }
   
       fn check_rust_source(&self, file_path: &Path, content: &str) -> 
Result<Vec<Issue>, Box<dyn std::error::Error>> {
           let mut issues = Vec::new();
           
           let removed_functions = vec\![
               (r"arrow_to_parquet_schema\s*\(", "Use 
parquet::arrow::ArrowToParquetSchemaConverter instead."),
               (r"\.max_statistics_size\s*\(", "This method was removed. 
Statistics size is now automatically managed."),
               (r"\.set_column_max_statistics_size\s*\(", "Remove this call. 
Statistics size is now automatically managed per-column."),
               (r"\.set_max_statistics_size\s*\(", "Remove this call. Global 
statistics size limits are no longer configurable."),
               (r"DEFAULT_MAX_STATISTICS_SIZE", "This constant was removed. 
Statistics size is now automatically managed."),
               (r"ParquetMetaData::new_with_page_index\s*\(", "Use 
ParquetMetaData::new() and add page index separately."),
               (r"\.set_file_offset\s*\(", "This method was removed from 
ColumnChunkMetaDataBuilder."),
               (r"read_pages_locations\s*\(", "This function was removed. Use 
the page index APIs directly."),
           ];
           
           for (line_num, line) in content.lines().enumerate() {
               for (pattern, suggestion) in &removed_functions {
                   let regex = Regex::new(pattern)?;
                   if let Some(mat) = regex.find(line) {
                       issues.push(Issue {
                           rule_id: self.rule_id().to_string(),
                           severity: Severity::Error,
                           message: format\!("Parquet metadata/properties API 
removed in Arrow 56.0: {}", 
                               
line[mat.start()..mat.end()].trim_end_matches(['(', ' '])),
                           file_path: file_path.to_path_buf(),
                           line: line_num + 1,
                           column: mat.start() + 1,
                           suggestion: Some(suggestion.to_string()),
                           auto_fixable: false,
                           deprecated_since: Some("54.0.0 (removed in 
56.0.0)".to_string()),
                           changelog_url: 
Some("https://github.com/apache/arrow-rs/pull/7811".to_string()),
                           migration_guide_url: 
Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-metadata-api".to_string()),
                       });
                       break;
                   }
               }
           }
           
           Ok(issues)
       }
   
       fn check_cargo_toml(&self, _file_path: &Path, _content: &str) -> 
Result<Vec<Issue>, Box<dyn std::error::Error>> {
           Ok(Vec::new())
       }
   }
   ```
   
   ### Tests for Both Rules
   ```rust
   // In arrow_010 tests:
   #[test]
   fn test_detects_statistics_methods() {
       let rule = Arrow010Rule;
       // Three removed accessors, one per line, each on a line mentioning `stats`.
       let content = r#"
   fn analyze_stats(stats: &Statistics) {
       let count = stats.null_count();
       let has_values = stats.has_min_max_set();
       let min = stats.min_bytes();
   }
   "#;

       // `Path::new` borrows the literal directly; no `PathBuf` allocation or
       // extra import is needed to satisfy the `&Path` parameter.
       let issues = rule
           .check_rust_source(Path::new("test.rs"), content)
           .unwrap();
       assert_eq!(issues.len(), 3);
       assert!(issues.iter().all(|i| matches!(i.severity, Severity::Error)));
   }
   
   // In arrow_011 tests:
   #[test]
   fn test_detects_metadata_functions() {
       let rule = Arrow011Rule;
       // One removed free function and one removed builder method, on separate lines.
       let content = r#"
   let schema_desc = arrow_to_parquet_schema(&arrow_schema)?;
   let props = WriterPropertiesBuilder::new()
       .set_max_statistics_size(1024)
       .build()?;
   "#;

       // `Path::new` borrows the literal directly; no `PathBuf` allocation or
       // extra import is needed to satisfy the `&Path` parameter.
       let issues = rule
           .check_rust_source(Path::new("test.rs"), content)
           .unwrap();
       assert_eq!(issues.len(), 2);
       assert!(issues.iter().all(|i| matches!(i.severity, Severity::Error)));
   }
   ```
   
   ### Integration Steps
   1. Add both to `src/rules/mod.rs`:
      ```rust
      // Module declarations, at the top level of src/rules/mod.rs:
      pub mod arrow_010_parquet_statistics_removed;
      pub mod arrow_011_parquet_metadata_removed;
      // In RuleEngine::new():
      rules.push(Box::new(arrow_010_parquet_statistics_removed::Arrow010Rule));
      rules.push(Box::new(arrow_011_parquet_metadata_removed::Arrow011Rule));
      ```
   
   ## Acceptance Criteria
   - ✅ Detects all removed Statistics methods (ARROW-010)
   - ✅ Detects all removed metadata/properties APIs (ARROW-011)
   - ✅ Provides specific migration guidance with Option types
   - ✅ Error severity for compilation failures
   - ✅ Non-auto-fixable due to API changes
   - ✅ Comprehensive test coverage
   
   ## Migration Notes
   - **Statistics**: The removed accessors are replaced by `_opt` variants 
(e.g. `null_count_opt()`, `min_bytes_opt()`) that return `Option`
   - **Properties**: Statistics sizing is now automatic
   - **Metadata**: Use modern constructors and builders
   - **Timeline**: Deprecated ≤54.0.0 → Removed 56.0.0


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to