alamb commented on a change in pull request #8300:
URL: https://github.com/apache/arrow/pull/8300#discussion_r502329941
##########
File path: rust/parquet/src/file/reader.rs
##########
@@ -18,35 +18,37 @@
//! Contains file reader API and provides methods to access file metadata, row
group
//! readers to read individual column chunks, or access record iterator.
-use std::{
- convert::TryFrom,
- fs::File,
- io::{BufReader, Cursor, Read, Seek, SeekFrom},
- path::Path,
- rc::Rc,
-};
+use std::{boxed::Box, io::Read, rc::Rc};
-use byteorder::{ByteOrder, LittleEndian};
-use parquet_format::{
- ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData, PageHeader,
PageType,
-};
-use thrift::protocol::TCompactInputProtocol;
-
-use crate::basic::{ColumnOrder, Compression, Encoding, Type};
use crate::column::page::PageIterator;
-use crate::column::{
- page::{Page, PageReader},
- reader::{ColumnReader, ColumnReaderImpl},
-};
-use crate::compression::{create_codec, Codec};
+use crate::column::{page::PageReader, reader::ColumnReader};
use crate::errors::{ParquetError, Result};
-use crate::file::{metadata::*, statistics, FOOTER_SIZE, PARQUET_MAGIC};
+use crate::file::metadata::*;
+pub use crate::file::serialized_reader::{SerializedFileReader,
SerializedPageReader};
use crate::record::reader::RowIter;
-use crate::record::Row;
-use crate::schema::types::{
- self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, Type as SchemaType,
-};
-use crate::util::{io::FileSource, memory::ByteBufferPtr};
+use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType};
+
+use crate::basic::Type;
+
+use crate::column::reader::ColumnReaderImpl;
+
+/// Length should return the amount of bytes that implementor contains.
Review comment:
```suggestion
/// Length should return the total number of bytes in the input source.
```
##########
File path: rust/parquet/src/file/reader.rs
##########
@@ -18,35 +18,37 @@
//! Contains file reader API and provides methods to access file metadata, row
group
//! readers to read individual column chunks, or access record iterator.
-use std::{
- convert::TryFrom,
- fs::File,
- io::{BufReader, Cursor, Read, Seek, SeekFrom},
- path::Path,
- rc::Rc,
-};
+use std::{boxed::Box, io::Read, rc::Rc};
-use byteorder::{ByteOrder, LittleEndian};
-use parquet_format::{
- ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData, PageHeader,
PageType,
-};
-use thrift::protocol::TCompactInputProtocol;
-
-use crate::basic::{ColumnOrder, Compression, Encoding, Type};
use crate::column::page::PageIterator;
-use crate::column::{
- page::{Page, PageReader},
- reader::{ColumnReader, ColumnReaderImpl},
-};
-use crate::compression::{create_codec, Codec};
+use crate::column::{page::PageReader, reader::ColumnReader};
use crate::errors::{ParquetError, Result};
-use crate::file::{metadata::*, statistics, FOOTER_SIZE, PARQUET_MAGIC};
+use crate::file::metadata::*;
+pub use crate::file::serialized_reader::{SerializedFileReader,
SerializedPageReader};
use crate::record::reader::RowIter;
-use crate::record::Row;
-use crate::schema::types::{
- self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, Type as SchemaType,
-};
-use crate::util::{io::FileSource, memory::ByteBufferPtr};
+use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType};
+
+use crate::basic::Type;
+
+use crate::column::reader::ColumnReaderImpl;
+
+/// Length should return the amount of bytes that implementor contains.
+/// It's mainly used to read the metadata, which is at the end of the source.
+#[allow(clippy::len_without_is_empty)]
+pub trait Length {
+ /// Returns the amount of bytes of the inner source.
+ fn len(&self) -> u64;
+}
+
+/// The ChunkReader trait generates readers of chunks of a source.
+/// For a file system reader, each chunk might contain a clone of File bounded
on a given range.
+/// For an object store reader, each read can be mapped to a range request.
+pub trait ChunkReader: Length {
+ type T: Read;
+ /// get a serialy readeable slice of the current reader
+ /// This should fail if the slice exceeds the current bounds
+ fn get_read(&self, start: u64, length: usize) -> Result<Self::T>;
+}
// ----------------------------------------------------------------------
Review comment:
For backwards compatibility in other projects, I think we need to allow
`use parquet::file::reader::TryClone` to keep working (`TryClone` moved to
`util::io::TryClone`, and appears not to be public anymore).
Perhap something like this (untested):
```suggestion
pub use super::TryClone;
// ----------------------------------------------------------------------
```
When I tried to compile an in-house project with this branch I got the
following error:
```
error[E0432]: unresolved import
`delorean_parquet::parquet::file::reader::TryClone`
--> delorean_parquet/src/lib.rs:13:28
|
13 | file::reader::{Length, TryClone},
| ^^^^^^^^ no `TryClone` in
`parquet::file::reader`
error[E0432]: unresolved import
`delorean_parquet::parquet::file::reader::TryClone`
```
When I tried to use the copy in `util::io` I got:
```
error[E0603]: module `util` is private
--> delorean_parquet/src/lib.rs:14:5
|
14 | util::io::{TryClone},
| ^^^^ private module
|
```
When I used the import in `seriailized_reader` I got a similar error:
```
error[E0603]: trait `TryClone` is private
--> delorean_parquet/src/lib.rs:14:30
|
14 | file::serialized_reader::TryClone,
| ^^^^^^^^ private trait
|
```
##########
File path: rust/parquet/src/file/reader.rs
##########
@@ -18,35 +18,37 @@
//! Contains file reader API and provides methods to access file metadata, row
group
//! readers to read individual column chunks, or access record iterator.
-use std::{
- convert::TryFrom,
- fs::File,
- io::{BufReader, Cursor, Read, Seek, SeekFrom},
- path::Path,
- rc::Rc,
-};
+use std::{boxed::Box, io::Read, rc::Rc};
-use byteorder::{ByteOrder, LittleEndian};
-use parquet_format::{
- ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData, PageHeader,
PageType,
-};
-use thrift::protocol::TCompactInputProtocol;
-
-use crate::basic::{ColumnOrder, Compression, Encoding, Type};
use crate::column::page::PageIterator;
-use crate::column::{
- page::{Page, PageReader},
- reader::{ColumnReader, ColumnReaderImpl},
-};
-use crate::compression::{create_codec, Codec};
+use crate::column::{page::PageReader, reader::ColumnReader};
use crate::errors::{ParquetError, Result};
-use crate::file::{metadata::*, statistics, FOOTER_SIZE, PARQUET_MAGIC};
+use crate::file::metadata::*;
+pub use crate::file::serialized_reader::{SerializedFileReader,
SerializedPageReader};
use crate::record::reader::RowIter;
-use crate::record::Row;
-use crate::schema::types::{
- self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, Type as SchemaType,
-};
-use crate::util::{io::FileSource, memory::ByteBufferPtr};
+use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType};
+
+use crate::basic::Type;
+
+use crate::column::reader::ColumnReaderImpl;
+
+/// Length should return the amount of bytes that implementor contains.
+/// It's mainly used to read the metadata, which is at the end of the source.
+#[allow(clippy::len_without_is_empty)]
+pub trait Length {
+ /// Returns the amount of bytes of the inner source.
+ fn len(&self) -> u64;
+}
+
+/// The ChunkReader trait generates readers of chunks of a source.
Review comment:
👍
##########
File path: rust/parquet/src/util/cursor.rs
##########
@@ -0,0 +1,203 @@
+// Licensed to the Apache Software Foundation (ASF) under one
Review comment:
I discovered above that the entire `util` module is not public --
https://github.com/apache/arrow/blob/master/rust/parquet/src/lib.rs#L39
```
#[macro_use]
mod util;
```
So none of these pub traits can be used outside this crate at the moment
##########
File path: rust/parquet/src/file/reader.rs
##########
@@ -18,35 +18,37 @@
//! Contains file reader API and provides methods to access file metadata, row
group
//! readers to read individual column chunks, or access record iterator.
-use std::{
- convert::TryFrom,
- fs::File,
- io::{BufReader, Cursor, Read, Seek, SeekFrom},
- path::Path,
- rc::Rc,
-};
+use std::{boxed::Box, io::Read, rc::Rc};
-use byteorder::{ByteOrder, LittleEndian};
-use parquet_format::{
- ColumnOrder as TColumnOrder, FileMetaData as TFileMetaData, PageHeader,
PageType,
-};
-use thrift::protocol::TCompactInputProtocol;
-
-use crate::basic::{ColumnOrder, Compression, Encoding, Type};
use crate::column::page::PageIterator;
-use crate::column::{
- page::{Page, PageReader},
- reader::{ColumnReader, ColumnReaderImpl},
-};
-use crate::compression::{create_codec, Codec};
+use crate::column::{page::PageReader, reader::ColumnReader};
use crate::errors::{ParquetError, Result};
-use crate::file::{metadata::*, statistics, FOOTER_SIZE, PARQUET_MAGIC};
+use crate::file::metadata::*;
+pub use crate::file::serialized_reader::{SerializedFileReader,
SerializedPageReader};
use crate::record::reader::RowIter;
-use crate::record::Row;
-use crate::schema::types::{
- self, ColumnDescPtr, SchemaDescPtr, SchemaDescriptor, Type as SchemaType,
-};
-use crate::util::{io::FileSource, memory::ByteBufferPtr};
+use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, Type as SchemaType};
+
+use crate::basic::Type;
+
+use crate::column::reader::ColumnReaderImpl;
+
+/// Length should return the amount of bytes that implementor contains.
+/// It's mainly used to read the metadata, which is at the end of the source.
+#[allow(clippy::len_without_is_empty)]
+pub trait Length {
+ /// Returns the amount of bytes of the inner source.
+ fn len(&self) -> u64;
+}
+
+/// The ChunkReader trait generates readers of chunks of a source.
+/// For a file system reader, each chunk might contain a clone of File bounded
on a given range.
+/// For an object store reader, each read can be mapped to a range request.
+pub trait ChunkReader: Length {
+ type T: Read;
+ /// get a serialy readeable slice of the current reader
+ /// This should fail if the slice exceeds the current bounds
+ fn get_read(&self, start: u64, length: usize) -> Result<Self::T>;
+}
// ----------------------------------------------------------------------
Review comment:
When I tried to compile an in-house project with this branch I got the
following error:
```
error[E0432]: unresolved import
`delorean_parquet::parquet::file::reader::TryClone`
--> delorean_parquet/src/lib.rs:13:28
|
13 | file::reader::{Length, TryClone},
| ^^^^^^^^ no `TryClone` in
`parquet::file::reader`
error[E0432]: unresolved import
`delorean_parquet::parquet::file::reader::TryClone`
```
When I tried to use the copy in `util::io` I got:
```
error[E0603]: module `util` is private
--> delorean_parquet/src/lib.rs:14:5
|
14 | util::io::{TryClone},
| ^^^^ private module
|
```
When I used the import in `seriailized_reader` I got a similar error:
```
error[E0603]: trait `TryClone` is private
--> delorean_parquet/src/lib.rs:14:30
|
14 | file::serialized_reader::TryClone,
| ^^^^^^^^ private trait
|
```
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]