zhuqi-lucas commented on PR #14644:
URL: https://github.com/apache/datafusion/pull/14644#issuecomment-2660707498

   ```rust
   use arrow::array::{RecordBatch, StringBuilder};
   use arrow_schema::{DataType, Field, Schema};
   use datafusion::execution::disk_manager::DiskManagerConfig;
   use datafusion::execution::memory_pool::FairSpillPool;
   use datafusion::execution::runtime_env::RuntimeEnvBuilder;
   use datafusion::parquet::arrow::ArrowWriter;
   use datafusion::prelude::{ParquetReadOptions, SessionConfig, SessionContext};
   use futures::TryStreamExt;
   use std::sync::Arc;
   
   #[tokio::main(flavor = "multi_thread", worker_threads = 1)]
   pub async fn main() {
       build_parquet();
   
       let env = RuntimeEnvBuilder::new()
           .with_disk_manager(DiskManagerConfig::default())
           .with_memory_pool(Arc::new(FairSpillPool::new(100 * 1024 * 1024)))
           .build_arc()
           .unwrap();
   
       let mut config = 
SessionConfig::new().with_sort_spill_reservation_bytes(32 * 1024 * 1024);
       config.options_mut().execution.parquet.schema_force_view_types = false;
   
       let ctx = SessionContext::new_with_config_rt(config, env);
   
       ctx.register_parquet(
           "big_strings",
           "/tmp/big_strings.parquet",
           ParquetReadOptions::default(),
       )
           .await
           .unwrap();
   
       let sql = "SELECT * FROM big_strings ORDER BY strings";
       println!("Sorting strings");
       ctx.sql(sql)
           .await
           .unwrap()
           .execute_stream()
           .await
           .unwrap()
           .try_for_each(|_| std::future::ready(Ok(())))
           .await
           .unwrap();
   }
   
   fn build_parquet() {
       if std::fs::File::open("/tmp/big_strings.parquet").is_ok() {
           println!("Using existing file at /tmp/big_strings.parquet");
           return;
       }
       println!("Generating test file at /tmp/big_strings.parquet");
       let file = std::fs::File::create("/tmp/big_strings.parquet").unwrap();
       let schema = Arc::new(Schema::new(vec![Field::new(
           "strings",
           DataType::Utf8,
           false,
       )]));
       let mut writer = ArrowWriter::try_new(file, schema.clone(), 
None).unwrap();
   
       for batch_idx in 0..100 {
           println!("Generating batch {} of 100", batch_idx);
           let mut string_array_builder =
               StringBuilder::with_capacity(1024 * 1024, 1024 * 1024 * 3 * 14);
           for i in 0..(1024 * 1024) {
               string_array_builder
                   .append_value(format!("string-{}string-{}string-{}", i, i, 
i));
           }
           let array = Arc::new(string_array_builder.finish());
           let batch = RecordBatch::try_new(schema.clone(), 
vec![array]).unwrap();
           writer.write(&batch).unwrap();
       }
       writer.close().unwrap();
   }
   ```
   
   Thank you @kazuyukitanimura for the PR. I applied the PR and tried to run the 
test above, but it still fails for me — I am not sure if I am 
missing something.
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to