alamb commented on code in PR #4661:
URL: https://github.com/apache/arrow-datafusion/pull/4661#discussion_r1056320608


##########
datafusion/core/src/datasource/file_format/mod.rs:
##########
@@ -136,4 +145,121 @@ pub(crate) mod test_util {
             .await?;
         Ok(exec)
     }
+
+    /// Mock ObjectStore to provide a variable stream of bytes on get
+    /// Able to keep track of how many iterations of the provided bytes were repeated
+    #[derive(Debug)]
+    pub struct VariableStream {
+        bytes_to_repeat: Bytes,
+        max_iterations: usize,
+        iterations_detected: Arc<Mutex<usize>>,
+    }
+
+    impl std::fmt::Display for VariableStream {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "VariableStream")
+        }
+    }
+
+    #[async_trait]
+    impl ObjectStore for VariableStream {
+        async fn put(&self, _location: &Path, _bytes: Bytes) -> object_store::Result<()> {
+            unimplemented!()
+        }
+
+        async fn put_multipart(
+            &self,
+            _location: &Path,
+        ) -> object_store::Result<(MultipartId, Box<dyn AsyncWrite + Unpin + Send>)>
+        {
+            unimplemented!()
+        }
+
+        async fn abort_multipart(
+            &self,
+            _location: &Path,
+            _multipart_id: &MultipartId,
+        ) -> object_store::Result<()> {
+            unimplemented!()
+        }
+
+        async fn get(&self, _location: &Path) -> object_store::Result<GetResult> {
+            let bytes = self.bytes_to_repeat.clone();
+            let arc = self.iterations_detected.clone();
+            Ok(GetResult::Stream(
+                futures::stream::repeat_with(move || {
+                    let arc_inner = arc.clone();
+                    *arc_inner.lock().unwrap() += 1;

Review Comment:
   👍 
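   
   The hunk is truncated just inside the `get` body here. For context, a minimal sketch of how such a repeating stream could be completed, together with the constructor and counter accessor that the CSV test below relies on (the `take`/`boxed` combinators and the helper names are illustrative assumptions, not necessarily this PR's exact code):
   
   ```rust
   // Sketch: plausible completion of the truncated `get` body above.
   // Each pull of the stream bumps the shared counter, then yields a
   // clone of the configured chunk, capped at `max_iterations` repeats.
   Ok(GetResult::Stream(
       futures::stream::repeat_with(move || {
           let arc_inner = arc.clone();
           *arc_inner.lock().unwrap() += 1;
           Ok(bytes.clone())
       })
       .take(self.max_iterations)
       .boxed(),
   ))
   ```
   
   And hypothetical helpers (names are assumptions) that the test would need:
   
   ```rust
   impl VariableStream {
       pub fn new(bytes_to_repeat: Bytes, max_iterations: usize) -> Self {
           Self {
               bytes_to_repeat,
               max_iterations,
               iterations_detected: Arc::new(Mutex::new(0)),
           }
       }
   
       /// How many times `get` has served the repeated chunk so far.
       pub fn iterations_detected(&self) -> usize {
           *self.iterations_detected.lock().unwrap()
       }
   }
   ```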



##########
datafusion/core/src/datasource/file_format/csv.rs:
##########
@@ -291,6 +378,57 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_infer_schema_stream() -> Result<()> {
+        let session_ctx = SessionContext::new();
+        let state = session_ctx.state();
+        let variable_object_store =
+            Arc::new(VariableStream::new(Bytes::from("1,2,3,4,5\n"), 200));
+        let object_meta = ObjectMeta {
+            location: Path::parse("/")?,
+            last_modified: DateTime::default(),
+            size: usize::MAX,
+        };
+
+        let num_rows_to_read = 100;
+        let csv_format = CsvFormat {
+            has_header: false,
+            schema_infer_max_rec: Some(num_rows_to_read),
+            ..Default::default()
+        };
+        let inferred_schema = csv_format
+            .infer_schema(
+                &state,
+                &(variable_object_store.clone() as Arc<dyn ObjectStore>),
+                &[object_meta],
+            )
+            .await?;
+
+        let actual_fields: Vec<_> = inferred_schema
+            .fields()
+            .iter()
+            .map(|f| format!("{}: {:?}", f.name(), f.data_type()))
+            .collect();
+        assert_eq!(
+            vec![
+                "column_1: Int64",
+                "column_2: Int64",
+                "column_3: Int64",
+                "column_4: Int64",
+                "column_5: Int64"
+            ],
+            actual_fields
+        );
+        // ensure that CSV schema inference does not try to read the entire file;
+        // it should only read as many rows as configured in the CsvFormat

Review Comment:
   Awesome
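   
   The hunk ends right at this comment, so the assertion itself isn't shown. A hedged sketch of how the check could look, assuming the hypothetical `iterations_detected()` accessor sketched earlier:
   
   ```rust
   // Sketch: each repeated chunk is "1,2,3,4,5\n" (one row), so reading
   // ~100 rows should consume roughly 100 iterations, well below the 200
   // the mock is willing to serve. The exact bound is an assumption.
   let num_iterations = variable_object_store.iterations_detected();
   assert!(num_iterations > 0);
   assert!(num_iterations < 200);
   ```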


