ryanrussell opened a new issue, #8117: URL: https://github.com/apache/arrow-rs/issues/8117
# ARROW-008: SerializedPageReaderState Type Changes (Parquet) ## Problem Arrow 56.0 changed `SerializedPageReaderState.offset` and `remaining_bytes` from `usize` to `u64`. This breaks code that directly accesses these fields or depends on `usize` return types. ## API Change Details **Changed in commit 5555d30b0:** ```rust // OLD (55.2.0): pub struct SerializedPageReaderState { pub offset: usize, pub remaining_bytes: usize, // ... } // NEW (56.0.0): pub struct SerializedPageReaderState { pub offset: u64, pub remaining_bytes: u64, // ... } ``` **Rationale:** WebAssembly compatibility - `usize` is 32-bit on wasm32, limiting file sizes to 4GB. **Commit:** `5555d30b0` - `[Parquet] Use u64 for SerializedPageReaderState.offset & remaining_bytes, instead of usize` ## Implementation Task Create `src/rules/arrow_008_parquet_page_reader_types.rs` with these specifications: ### Rule Implementation ```rust use regex::Regex; use std::path::Path; use crate::output::{Issue, Severity}; use crate::rules::Rule; /// ARROW-008: Detect SerializedPageReaderState field type assumptions /// /// Arrow 56.0 changed offset and remaining_bytes from usize to u64 /// in SerializedPageReaderState for WebAssembly compatibility. 
pub struct Arrow008Rule; impl Rule for Arrow008Rule { fn rule_id(&self) -> &'static str { "ARROW-008" } fn check_rust_source(&self, file_path: &Path, content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> { let mut issues = Vec::new(); // Pattern 1: Direct field access let field_access = Regex::new(r"(\w+)\.(?:offset|remaining_bytes)\b")?; // Pattern 2: Type annotations expecting usize let usize_pattern = Regex::new(r":\s*usize\s*=.*\.(?:offset|remaining_bytes)")?; for (line_num, line) in content.lines().enumerate() { // Check for direct field access that might assume usize if let Some(mat) = field_access.find(line) { // Only flag if it looks like SerializedPageReaderState if line.contains("SerializedPageReaderState") || line.contains("page_reader") || line.contains("PageReader") { issues.push(Issue { rule_id: self.rule_id().to_string(), severity: Severity::Warning, message: "SerializedPageReaderState.offset and .remaining_bytes changed from usize to u64 in Arrow 56.0".to_string(), file_path: file_path.to_path_buf(), line: line_num + 1, column: mat.start() + 1, suggestion: Some("Verify type compatibility: these fields are now u64. Use explicit casting if needed: field as usize or u64::from(field).".to_string()), auto_fixable: false, deprecated_since: Some("56.0.0".to_string()), changelog_url: Some("https://github.com/apache/arrow-rs/pull/7918".to_string()), migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-page-reader-types".to_string()), }); } } // Check for explicit usize type annotations if usize_pattern.is_match(line) { issues.push(Issue { rule_id: self.rule_id().to_string(), severity: Severity::Error, message: "Type mismatch: SerializedPageReaderState fields are now u64, not usize".to_string(), file_path: file_path.to_path_buf(), line: line_num + 1, column: line.find(": usize").unwrap_or(0) + 1, suggestion: Some("Change type annotation from 'usize' to 'u64'. 
These fields were changed for WebAssembly compatibility.".to_string()), auto_fixable: true, deprecated_since: Some("56.0.0".to_string()), changelog_url: Some("https://github.com/apache/arrow-rs/pull/7918".to_string()), migration_guide_url: Some("https://arrow.apache.org/docs/rust/migration_guide.html#parquet-page-reader-types".to_string()), }); } } Ok(issues) } fn check_cargo_toml(&self, _file_path: &Path, _content: &str) -> Result<Vec<Issue>, Box<dyn std::error::Error>> { Ok(Vec::new()) } } ``` ### Tests Required ```rust #[cfg(test)] mod tests { use super::*; use std::path::PathBuf; #[test] fn test_detects_usize_type_annotation() { let rule = Arrow008Rule; let content = r#" use parquet::file::serialized_reader::SerializedPageReaderState; fn process_state(state: &SerializedPageReaderState) { let offset: usize = state.offset; // Should trigger error let remaining: usize = state.remaining_bytes; // Should trigger error } "#; let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap(); assert_eq!(issues.len(), 2); assert!(issues.iter().any(|i| i.severity == Severity::Error)); assert!(issues.iter().any(|i| i.auto_fixable)); } #[test] fn test_detects_field_access() { let rule = Arrow008Rule; let content = r#" use parquet::file::serialized_reader::SerializedPageReaderState; fn get_position(page_reader: &SerializedPageReaderState) -> (usize, usize) { (page_reader.offset, page_reader.remaining_bytes) // Should warn about access } "#; let issues = rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap(); assert!(issues.len() >= 1); assert!(issues.iter().any(|i| i.severity == Severity::Warning)); } #[test] fn test_no_issues_for_u64_usage() { let rule = Arrow008Rule; let content = r#" use parquet::file::serialized_reader::SerializedPageReaderState; fn process_state(state: &SerializedPageReaderState) { let offset: u64 = state.offset; let remaining: u64 = state.remaining_bytes; } "#; let issues = 
rule.check_rust_source(&PathBuf::from("test.rs"), content).unwrap(); assert_eq!(issues.len(), 0); } } ``` ### Integration Steps 1. Add to `src/rules/mod.rs`: ```rust pub mod arrow_008_parquet_page_reader_types; // In RuleEngine::new(): rules.push(Box::new(arrow_008_parquet_page_reader_types::Arrow008Rule)); ``` 2. Test with these patterns: ```rust let offset: usize = state.offset; // Error - type mismatch let pos = page_reader.remaining_bytes; // Warning - verify compatibility let offset: u64 = state.offset; // OK ``` ## Acceptance Criteria - ✅ Detects explicit `usize` type annotations (Error severity) - ✅ Warns about direct field access patterns - ✅ Provides WebAssembly context in messages - ✅ Auto-fixable for type annotations only - ✅ References correct PR (#7918) and commit ## Migration Notes for Users - **Breaking Change**: Fields changed from `usize` to `u64` - **Reason**: WebAssembly compatibility (usize = 32-bit on wasm32) - **Fix**: Update type annotations, add explicit casting where needed - **Impact**: Affects code directly accessing these struct fields -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org