alamb commented on a change in pull request #8698:
URL: https://github.com/apache/arrow/pull/8698#discussion_r526207035



##########
File path: rust/parquet/src/column/writer.rs
##########
@@ -955,81 +963,31 @@ impl<T: DataType> ColumnWriterImpl<T> {
 /// Trait to define default encoding for types, including whether or not the 
type
 /// supports dictionary encoding.
 trait EncodingWriteSupport {
-    /// Returns encoding for a column when no other encoding is provided in 
writer
-    /// properties.
-    fn fallback_encoding(props: &WriterProperties) -> Encoding;
-
     /// Returns true if dictionary is supported for column writer, false 
otherwise.
     fn has_dictionary_support(props: &WriterProperties) -> bool;
 }
 
-// Basic implementation, always falls back to PLAIN and supports dictionary.
-impl<T: DataType> EncodingWriteSupport for ColumnWriterImpl<T> {
-    default fn fallback_encoding(_props: &WriterProperties) -> Encoding {
-        Encoding::PLAIN
-    }
-
-    default fn has_dictionary_support(_props: &WriterProperties) -> bool {
-        true
-    }
-}
-
-impl EncodingWriteSupport for ColumnWriterImpl<BoolType> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::RLE,
-        }
-    }
-
-    // Boolean column does not support dictionary encoding and should fall 
back to
-    // whatever fallback encoding is defined.
-    fn has_dictionary_support(_props: &WriterProperties) -> bool {
-        false
-    }
-}
-
-impl EncodingWriteSupport for ColumnWriterImpl<Int32Type> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::DELTA_BINARY_PACKED,
-        }
+/// Returns encoding for a column when no other encoding is provided in writer 
properties.
+fn fallback_encoding(kind: Type, props: &WriterProperties) -> Encoding {
+    match (kind, props.writer_version()) {
+        (Type::BOOLEAN, WriterVersion::PARQUET_2_0) => Encoding::RLE,
+        (Type::INT32, WriterVersion::PARQUET_2_0) => 
Encoding::DELTA_BINARY_PACKED,
+        (Type::INT64, WriterVersion::PARQUET_2_0) => 
Encoding::DELTA_BINARY_PACKED,
+        (Type::BYTE_ARRAY, WriterVersion::PARQUET_2_0) => 
Encoding::DELTA_BYTE_ARRAY,
+        (Type::FIXED_LEN_BYTE_ARRAY, WriterVersion::PARQUET_2_0) => 
Encoding::DELTA_BYTE_ARRAY,
+        _ => Encoding::PLAIN,
     }
 }
 
-impl EncodingWriteSupport for ColumnWriterImpl<Int64Type> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::DELTA_BINARY_PACKED,
-        }
-    }
-}
-
-impl EncodingWriteSupport for ColumnWriterImpl<ByteArrayType> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY,
-        }
-    }
-}
-
-impl EncodingWriteSupport for ColumnWriterImpl<FixedLenByteArrayType> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY,
-        }
-    }
-
-    fn has_dictionary_support(props: &WriterProperties) -> bool {
-        match props.writer_version() {
-            // Dictionary encoding was not enabled in PARQUET 1.0
-            WriterVersion::PARQUET_1_0 => false,
-            WriterVersion::PARQUET_2_0 => true,
-        }
+/// Returns true if dictionary is supported for column writer, false otherwise.

Review comment:
       I carefully checked the logic in this file and I believe it matches the 
original without the need for specialization. 👍 

##########
File path: rust/parquet/src/data_type.rs
##########
@@ -424,16 +460,165 @@ impl AsBytes for str {
     }
 }
 
-/// Contains the Parquet physical type information as well as the Rust 
primitive type
-/// presentation.
-pub trait DataType: 'static {
-    type T: std::cmp::PartialEq
+pub(crate) mod private {
+    use super::{AsBytes, Result, ParquetError};
+
+    pub type BitIndex = u64;
+
+    #[derive(Copy, Clone)]
+    pub enum EncodedValue<'a> {
+        /// The value can be encoded from the following bytes
+        Bytes {
+            data: &'a [u8]
+        },
+        /// The value encodes as a specific set bit at a given index
+        Bits{
+            index: BitIndex
+        },
+    }
+
+    /// Sealed trait to start to remove specialisation from implementations
+    ///
+    /// This is done to force the associated value type to be unimplementable 
outside of this
+    /// crate, and thus hint to the type system (and end user) traits are 
public for the contract
+    /// and not for extension.
+    pub trait ParquetValueType : std::cmp::PartialEq
         + std::fmt::Debug
+        + std::fmt::Display
         + std::default::Default
         + std::clone::Clone
-        + AsBytes
-        + FromBytes
-        + PartialOrd;
+        + super::AsBytes
+        + super::FromBytes
+        + super::SliceAsBytes
+        + PartialOrd
+    {
+        /// Return the most primitive version of encoding a given type
+        fn encoded(&self) -> EncodedValue<'_>;
+
+        /// Return the encoded size for a type
+        fn dict_encoding_size(&self) -> (usize, usize) {
+            (std::mem::size_of::<Self>(), 1)
+        }
+
+        /// Return the value as i64 if possible
+        ///
+        /// This is essentially the same as `std::convert::TryInto<i64>` but 
can
+        /// implemented for `f32` and `f64`, types that would fail orphan rules
+        fn as_i64(&self) -> Result<i64> {
+            Err(general_err!("Type cannot be converted to i64"))
+        }
+
+        /// Return the value as u64 if possible
+        ///
+        /// This is essentially the same as `std::convert::TryInto<u64>` but 
can
+        /// implemented for `f32` and `f64`, types that would fail orphan rules
+        fn as_u64(&self) -> Result<u64> {
+            self.as_i64()
+                .map_err(|_| general_err!("Type cannot be converted to u64"))
+                .map(|x| x as u64)
+        }
+
+        /// Return the value as an Any to allow for downcasts without 
transmutation
+        fn as_any(&self) -> &dyn std::any::Any;
+
+        /// Return the value as an mutable Any to allow for downcasts without 
transmutation
+        fn as_mut_any(&mut self) -> &mut dyn std::any::Any;
+    }
+
+    impl ParquetValueType for bool {
+        fn encoded(&self) -> EncodedValue<'_> {
+            EncodedValue::Bits { index: *self as u64 }
+        }
+
+        fn as_i64(&self) -> Result<i64> {
+            Ok(*self as i64)
+        }
+
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
+            self
+        }
+    }
+
+    /// Hopelessly unsafe function that emulates `num::as_ne_bytes`
+    ///
+    /// It is not recommended to use this outside of this private module as, 
while it
+    /// _should_ work for primitive values, it is little better than a 
transmutation
+    /// and can act as a backdoor into mis-interpreting types as arbitary byte 
slices
+    fn as_raw<'a, T>(value: *const T) -> &'a [u8] {
+        unsafe {
+            let value = value as *const u8;
+            std::slice::from_raw_parts(value, std::mem::size_of::<T>())
+        }
+    }
+
+    macro_rules! impl_from_raw {
+        ($ty: ty, $self: ident => $as_i64: block) => {
+            impl ParquetValueType for $ty {
+                fn encoded(&self) -> EncodedValue<'_> {
+                    EncodedValue::Bytes { data: as_raw(&*self) }
+                }
+
+                fn as_i64(&$self) -> Result<i64> {
+                    $as_i64
+                }
+
+                fn as_any(&self) -> &dyn std::any::Any {
+                    self
+                }
+
+                fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
+                    self
+                }
+            }
+        }
+    }
+
+    impl_from_raw!(i32, self => { Ok(*self as i64) });
+    impl_from_raw!(i64, self => { Ok(*self) });
+    impl_from_raw!(f32, self => { Err(general_err!("Type cannot be converted 
to i64")) });
+    impl_from_raw!(f64, self => { Err(general_err!("Type cannot be converted 
to i64")) });

Review comment:
       ```suggestion
       impl_from_raw!(f32, self => { Err(general_err!("Type f32 cannot be 
converted to i64")) });
       impl_from_raw!(f64, self => { Err(general_err!("Type f64 cannot be 
converted to i64")) });
   ```

##########
File path: rust/rust-toolchain
##########
@@ -1 +0,0 @@
-nightly-2020-11-14

Review comment:
       🎉 

##########
File path: rust/parquet/src/data_type.rs
##########
@@ -424,16 +460,165 @@ impl AsBytes for str {
     }
 }
 
-/// Contains the Parquet physical type information as well as the Rust 
primitive type
-/// presentation.
-pub trait DataType: 'static {
-    type T: std::cmp::PartialEq
+pub(crate) mod private {
+    use super::{AsBytes, Result, ParquetError};
+
+    pub type BitIndex = u64;
+
+    #[derive(Copy, Clone)]
+    pub enum EncodedValue<'a> {
+        /// The value can be encoded from the following bytes
+        Bytes {
+            data: &'a [u8]
+        },
+        /// The value encodes as a specific set bit at a given index

Review comment:
       ```suggestion
           /// The value encoded as a specific set bit at a given index
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to