alamb commented on code in PR #4712:
URL: https://github.com/apache/arrow-datafusion/pull/4712#discussion_r1055827216
##########
datafusion/core/src/datasource/file_format/parquet.rs:
##########
@@ -56,25 +55,25 @@ use crate::physical_plan::{Accumulator, ExecutionPlan,
Statistics};
pub const DEFAULT_PARQUET_EXTENSION: &str = ".parquet";
/// The Apache Parquet `FileFormat` implementation
-#[derive(Debug)]
+///
+/// Note it is recommended these are instead configured on the
[`ConfigOptions`]
+/// associated with the [`SessionState`] instead of overridden on a
format-bases
Review Comment:
```suggestion
/// associated with the [`SessionState`] instead of overridden on a
format-basis
```
##########
datafusion-examples/examples/flight_server.rs:
##########
@@ -39,37 +38,21 @@ pub struct FlightServiceImpl {}
#[tonic::async_trait]
impl FlightService for FlightServiceImpl {
- type HandshakeStream = Pin<
- Box<dyn Stream<Item = Result<HandshakeResponse, Status>> + Send + Sync
+ 'static>,
- >;
- type ListFlightsStream =
- Pin<Box<dyn Stream<Item = Result<FlightInfo, Status>> + Send + Sync +
'static>>;
- type DoGetStream =
- Pin<Box<dyn Stream<Item = Result<FlightData, Status>> + Send + Sync +
'static>>;
- type DoPutStream =
- Pin<Box<dyn Stream<Item = Result<PutResult, Status>> + Send + Sync +
'static>>;
- type DoActionStream = Pin<
- Box<
- dyn Stream<Item = Result<arrow_flight::Result, Status>>
- + Send
- + Sync
- + 'static,
- >,
- >;
- type ListActionsStream =
- Pin<Box<dyn Stream<Item = Result<ActionType, Status>> + Send + Sync +
'static>>;
- type DoExchangeStream =
- Pin<Box<dyn Stream<Item = Result<FlightData, Status>> + Send + Sync +
'static>>;
+ type HandshakeStream = BoxStream<'static, Result<HandshakeResponse,
Status>>;
Review Comment:
I forgot this was here -- I have to give this example love to give this
after my work to make arrow-flight easier to use
##########
datafusion/core/src/datasource/file_format/parquet.rs:
##########
@@ -56,25 +55,25 @@ use crate::physical_plan::{Accumulator, ExecutionPlan,
Statistics};
pub const DEFAULT_PARQUET_EXTENSION: &str = ".parquet";
/// The Apache Parquet `FileFormat` implementation
-#[derive(Debug)]
+///
+/// Note it is recommended these are instead configured on the
[`ConfigOptions`]
+/// associated with the [`SessionState`] instead of overridden on a
format-bases
+///
+/// TODO: Deprecate and remove overrides (#4349)
Review Comment:
```suggestion
/// TODO: Deprecate and remove overrides
/// <https://github.com/apache/arrow-datafusion/issues/4349>
```
##########
datafusion/proto/proto/datafusion.proto:
##########
@@ -1132,6 +1133,9 @@ message ScanLimit {
}
message FileScanExecConf {
+ // Was repeated ConfigOption options = 10;
+ reserved 10;
Review Comment:
👍
##########
datafusion/core/src/execution/context.rs:
##########
@@ -1173,7 +1166,7 @@ pub struct SessionConfig {
/// due to `resolve_table_ref` which passes back references)
default_schema: String,
/// Configuration options
- pub config_options: Arc<RwLock<ConfigOptions>>,
+ config_options: ConfigOptions,
Review Comment:
❤️
##########
datafusion/proto/proto/datafusion.proto:
##########
@@ -445,79 +446,79 @@ message InListNode {
}
enum ScalarFunction {
- Abs=0;
- Acos=1;
- Asin=2;
- Atan=3;
- Ascii=4;
- Ceil=5;
- Cos=6;
- Digest=7;
- Exp=8;
- Floor=9;
- Ln=10;
- Log=11;
- Log10=12;
- Log2=13;
- Round=14;
- Signum=15;
- Sin=16;
- Sqrt=17;
- Tan=18;
- Trunc=19;
- Array=20;
- RegexpMatch=21;
- BitLength=22;
- Btrim=23;
- CharacterLength=24;
- Chr=25;
- Concat=26;
- ConcatWithSeparator=27;
- DatePart=28;
- DateTrunc=29;
- InitCap=30;
- Left=31;
- Lpad=32;
- Lower=33;
- Ltrim=34;
- MD5=35;
- NullIf=36;
- OctetLength=37;
- Random=38;
- RegexpReplace=39;
- Repeat=40;
- Replace=41;
- Reverse=42;
- Right=43;
- Rpad=44;
- Rtrim=45;
- SHA224=46;
- SHA256=47;
- SHA384=48;
- SHA512=49;
- SplitPart=50;
- StartsWith=51;
- Strpos=52;
- Substr=53;
- ToHex=54;
- ToTimestamp=55;
- ToTimestampMillis=56;
- ToTimestampMicros=57;
- ToTimestampSeconds=58;
- Now=59;
- Translate=60;
- Trim=61;
- Upper=62;
- Coalesce=63;
- Power=64;
- StructFun=65;
- FromUnixtime=66;
- Atan2=67;
- DateBin=68;
- ArrowTypeof=69;
- CurrentDate=70;
- CurrentTime=71;
- Uuid=72;
+ Abs = 0;
Review Comment:
whitespace!
##########
datafusion/proto/src/logical_plan/mod.rs:
##########
@@ -353,12 +353,9 @@ impl AsLogicalPlan for LogicalPlanNode {
self
))
})? {
- &FileFormatType::Parquet(protobuf::ParquetFormat {
Review Comment:
I agree serializing the same config options multiple times (once in the main
session context and then once again as part of the file format) is undesirable
for many reasons
##########
parquet-test-utils/src/lib.rs:
##########
@@ -58,6 +58,16 @@ pub struct ParquetScanOptions {
pub enable_page_index: bool,
}
+impl ParquetScanOptions {
+ /// Returns a [`SessionConfig`] with the given options
+ pub fn config(&self) -> SessionConfig {
+ SessionConfig::new()
Review Comment:
Thank you. I agree this PR is already large. I also think the
ParquetScanOptions predated the config options.
I think removing the ParquetScanOptions as a follow on PR is a good idea 👍
##########
datafusion/core/src/datasource/file_format/parquet.rs:
##########
@@ -85,13 +84,9 @@ impl ParquetFormat {
}
/// Return true if pruning is enabled
- pub fn enable_pruning(&self) -> bool {
+ pub fn enable_pruning(&self, config_options: &ConfigOptions) -> bool {
Review Comment:
👍
##########
datafusion/core/src/execution/context.rs:
##########
@@ -1202,14 +1195,14 @@ impl SessionConfig {
/// Create an execution config with config options read from the
environment
pub fn from_env() -> Self {
Self {
- config_options: ConfigOptions::from_env().into_shareable(),
+ config_options: ConfigOptions::from_env(),
..Default::default()
}
}
/// Set a configuration option
- pub fn set(self, key: &str, value: ScalarValue) -> Self {
- self.config_options.write().set(key, value);
+ pub fn set(mut self, key: &str, value: ScalarValue) -> Self {
Review Comment:
fyi @waynexia
##########
datafusion/core/src/physical_plan/file_format/parquet.rs:
##########
@@ -302,6 +287,8 @@ impl ExecutionPlan for ParquetExec {
})
})?;
+ let config_options = ctx.session_config().config_options();
Review Comment:
I think it is reasonable to look at the session configuration while
executing 🤷
It certainly seems better than the current state of master where the config
options (attached to session state) are read via interior mutability
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]