sunchao commented on code in PR #1621:
URL: https://github.com/apache/arrow-rs/pull/1621#discussion_r861077721


##########
parquet/src/column/writer.rs:
##########
@@ -1049,6 +1084,54 @@ fn has_dictionary_support(kind: Type, props: 
&WriterProperties) -> bool {
     }
 }
 
+/// Signed comparison of bytes arrays
+fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool {
+    let a_length = a.len();
+    let b_length = b.len();
+
+    if a_length == 0 || b_length == 0 {
+        return a_length > 0 && b_length == 0;
+    }
+
+    let first_a: u8 = a[0];
+    let first_b: u8 = b[0];
+
+    // We can short circuit for different signed numbers or
+    // for equal length bytes arrays that have different first bytes.
+    // The equality requirement is necessary for sign extension cases.
+    // 0xFF10 should be equal to 0x10 (due to big endian sign extension).
+    if (0x80 & first_a) != (0x80 & first_b)
+        || (a_length == b_length && first_a != first_b)

Review Comment:
   Hmm why we need this second condition?



##########
parquet/src/column/writer.rs:
##########
@@ -1049,6 +1084,54 @@ fn has_dictionary_support(kind: Type, props: 
&WriterProperties) -> bool {
     }
 }
 
+/// Signed comparison of bytes arrays
+fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool {
+    let a_length = a.len();
+    let b_length = b.len();
+
+    if a_length == 0 || b_length == 0 {
+        return a_length > 0 && b_length == 0;
+    }
+
+    let first_a: u8 = a[0];
+    let first_b: u8 = b[0];
+
+    // We can short circuit for different signed numbers or
+    // for equal length bytes arrays that have different first bytes.
+    // The equality requirement is necessary for sign extension cases.
+    // 0xFF10 should be equal to 0x10 (due to big endian sign extension).
+    if (0x80 & first_a) != (0x80 & first_b)
+        || (a_length == b_length && first_a != first_b)
+    {
+        return (first_a as i8) > (first_b as i8);
+    }
+
+    // When the lengths are unequal and the numbers are of the same
+    // sign we need to do comparison by sign extending the shorter
+    // value first, and once we get to equal sized arrays, lexicographical
+    // unsigned comparison of everything but the first byte is sufficient.
+
+    let extension: u8 = if (first_a as i8) < 0 { 0xFF } else { 0 };
+
+    if a_length != b_length {
+        let not_equal: Option<bool> = if a_length > b_length {

Review Comment:
   why this is a `Option`? seems it is always `Some`.



##########
parquet/src/column/writer.rs:
##########
@@ -1049,6 +1084,54 @@ fn has_dictionary_support(kind: Type, props: 
&WriterProperties) -> bool {
     }
 }
 
+/// Signed comparison of bytes arrays
+fn compare_greater_byte_array_decimals(a: &[u8], b: &[u8]) -> bool {
+    let a_length = a.len();
+    let b_length = b.len();
+
+    if a_length == 0 || b_length == 0 {
+        return a_length > 0 && b_length == 0;
+    }
+
+    let first_a: u8 = a[0];
+    let first_b: u8 = b[0];
+
+    // We can short circuit for different signed numbers or
+    // for equal length bytes arrays that have different first bytes.
+    // The equality requirement is necessary for sign extension cases.
+    // 0xFF10 should be equal to 0x10 (due to big endian sign extension).
+    if (0x80 & first_a) != (0x80 & first_b)
+        || (a_length == b_length && first_a != first_b)
+    {
+        return (first_a as i8) > (first_b as i8);
+    }
+
+    // When the lengths are unequal and the numbers are of the same
+    // sign we need to do comparison by sign extending the shorter
+    // value first, and once we get to equal sized arrays, lexicographical
+    // unsigned comparison of everything but the first byte is sufficient.
+
+    let extension: u8 = if (first_a as i8) < 0 { 0xFF } else { 0 };
+
+    if a_length != b_length {
+        let not_equal: Option<bool> = if a_length > b_length {
+            let lead_length = a_length - b_length;
+            Some((&a[0..lead_length]).iter().any(|&x| x != extension))
+        } else {
+            let lead_length = b_length - a_length;
+            Some((&b[0..lead_length]).iter().any(|&x| x != extension))
+        };
+
+        if not_equal.is_some() && not_equal.unwrap() {
+            let negative_values: bool = (first_a as i8) < 0;
+            let a_longer: bool = a_length > b_length;
+            return negative_values != a_longer;

Review Comment:
   personally I feel it's easier to read via:
   ```rust
   return if negative_values { !a_longer } else { a_longer };
   ```



##########
parquet/src/column/writer.rs:
##########
@@ -2153,4 +2336,94 @@ mod tests {
             panic!("metadata missing statistics");
         }
     }
+
+    /// Returns Decimals column writer.
+    fn get_test_decimals_column_writer<T: DataType>(
+        page_writer: Box<dyn PageWriter>,
+        max_def_level: i16,
+        max_rep_level: i16,
+        props: WriterPropertiesPtr,
+    ) -> ColumnWriterImpl<T> {
+        let descr = Arc::new(get_test_decimals_column_descr::<T>(
+            max_def_level,
+            max_rep_level,
+        ));
+        let column_writer = get_column_writer(descr, props, page_writer);
+        get_typed_column_writer::<T>(column_writer)
+    }
+
+    /// Returns decimals column reader.
+    fn get_test_decimals_column_reader<T: DataType>(
+        page_reader: Box<dyn PageReader>,
+        max_def_level: i16,
+        max_rep_level: i16,
+    ) -> ColumnReaderImpl<T> {
+        let descr = Arc::new(get_test_decimals_column_descr::<T>(
+            max_def_level,
+            max_rep_level,
+        ));
+        let column_reader = get_column_reader(descr, page_reader);
+        get_typed_column_reader::<T>(column_reader)
+    }
+
+    /// Returns descriptor for Decimal type with primitive column.
+    fn get_test_decimals_column_descr<T: DataType>(
+        max_def_level: i16,
+        max_rep_level: i16,
+    ) -> ColumnDescriptor {
+        let path = ColumnPath::from("col");
+        let tpe = SchemaType::primitive_type_builder("col", 
T::get_physical_type())
+            .with_length(16)
+            .with_logical_type(Some(LogicalType::Decimal {
+                scale: 2,
+                precision: 3,
+            }))
+            .with_scale(2)
+            .with_precision(3)
+            .build()
+            .unwrap();
+        ColumnDescriptor::new(Arc::new(tpe), max_def_level, max_rep_level, 
path)
+    }
+
+    /// Returns column writer for UINT32 Column (Provided as ConvertedType 
without specifying the LogicalType).

Review Comment:
   nit: can we limit line length to 90 characters?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to