alamb commented on code in PR #8565:
URL: https://github.com/apache/arrow-datafusion/pull/8565#discussion_r1433252219


##########
datafusion/sqllogictest/test_files/parquet.slt:
##########
@@ -276,6 +276,118 @@ LIMIT 10;
 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC"))
 0 2014-08-27T14:00:00Z Timestamp(Millisecond, Some("UTC"))
 
+# test for
+query ITID
+COPY (SELECT * FROM src_table WHERE int_col > 6 LIMIT 3)
+TO 'test_files/scratch/parquet/test_table/subdir/3.parquet'
+(FORMAT PARQUET, SINGLE_FILE_OUTPUT true);
+----
+3
+
+# Test config ignore_subdirectory:
+
+statement ok
+set datafusion.execution.listing_table_ignore_subdirectory = true;
+
+statement ok
+CREATE EXTERNAL TABLE t1_ignore_subdirectory
+STORED AS PARQUET
+WITH HEADER ROW
+LOCATION 'test_files/scratch/parquet/test_table/*.parquet';
+
+query TT
+explain select count(*) from t1_ignore_subdirectory;
+----
+logical_plan
+Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]]
+--TableScan: t1_ignore_subdirectory projection=[]
+physical_plan
+AggregateExec: mode=Final, gby=[], aggr=[COUNT(*)]
+--CoalescePartitionsExec
+----AggregateExec: mode=Partial, gby=[], aggr=[COUNT(*)]
+------ParquetExec: file_groups={2 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet,
 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}
+
+statement ok
+CREATE EXTERNAL TABLE t2_ignore_subdirectory
+STORED AS PARQUET
+WITH HEADER ROW
+LOCATION 'test_files/scratch/parquet/test_table/';
+
+query TT
+explain select count(*) from t2_ignore_subdirectory;
+----
+logical_plan
+Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]]
+--TableScan: t2_ignore_subdirectory projection=[]
+physical_plan
+AggregateExec: mode=Final, gby=[], aggr=[COUNT(*)]
+--CoalescePartitionsExec
+----AggregateExec: mode=Partial, gby=[], aggr=[COUNT(*)]
+------ParquetExec: file_groups={2 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet,
 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet]]}
+
+# scan file: 0.parquet 1.parquet 2.parquet
+
+query I
+select count(*) from t1_ignore_subdirectory;
+----
+9
+
+query I
+select count(*) from t2_ignore_subdirectory;
+----
+9
+
+statement ok
+set datafusion.execution.listing_table_ignore_subdirectory = false;
+
+statement ok
+CREATE EXTERNAL TABLE t1_with_subdirectory
+STORED AS PARQUET
+WITH HEADER ROW
+LOCATION 'test_files/scratch/parquet/test_table/*.parquet';
+
+query TT
+explain select count(*) from t1_with_subdirectory;
+----
+logical_plan
+Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]]
+--TableScan: t1_with_subdirectory projection=[]
+physical_plan
+AggregateExec: mode=Final, gby=[], aggr=[COUNT(*)]
+--CoalescePartitionsExec
+----AggregateExec: mode=Partial, gby=[], aggr=[COUNT(*)]
+------ParquetExec: file_groups={2 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet,
 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet,
 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/subdir/3.parquet]]}
+
+
+statement ok
+CREATE EXTERNAL TABLE t2_with_subdirectory
+STORED AS PARQUET
+WITH HEADER ROW
+LOCATION 'test_files/scratch/parquet/test_table/';
+
+query TT
+explain select count(*) from t2_with_subdirectory;
+----
+logical_plan
+Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]]
+--TableScan: t2_with_subdirectory projection=[]
+physical_plan
+AggregateExec: mode=Final, gby=[], aggr=[COUNT(*)]
+--CoalescePartitionsExec
+----AggregateExec: mode=Partial, gby=[], aggr=[COUNT(*)]
+------ParquetExec: file_groups={2 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/0.parquet,
 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/1.parquet],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/2.parquet,
 
WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet/test_table/subdir/3.parquet]]}
+
+# scan file: 0.parquet 1.parquet 2.parquet 3.parquet
+query I
+select count(*) from t1_with_subdirectory;
+----
+12
+
+query I
+select count(*) from t2_with_subdirectory;

Review Comment:
   It is cool to see the different rows, but I don't understand the need for 
all the different tables and explain plans.
   
   I think we can get coverage by simply creating the equivalent of 
`t2_with_subdirectory` and showing that it returns 12 rows when
   
   ```sql
   set datafusion.execution.listing_table_ignore_subdirectory = false;
   ```
   
   And 9 when 
   
   ```sql
   set datafusion.execution.listing_table_ignore_subdirectory = true;
   ```
   



##########
datafusion/sqllogictest/test_files/csv_files.slt:
##########
@@ -63,3 +63,60 @@ id6 value"6
 id7 value"7
 id8 value"8
 id9 value"9
+
+
+# When reading a partitioned table, the `listing_table_ignore_subdirectory` 
configuration will be invalid
+statement ok
+set datafusion.execution.listing_table_ignore_subdirectory = false;
+
+statement ok
+CREATE EXTERNAL TABLE partition_csv_table (
+  name VARCHAR,
+  ts TIMESTAMP,
+  c_date DATE,
+)
+STORED AS CSV
+PARTITIONED BY (c_date)
+LOCATION '../core/tests/data/partitioned_table';
+
+query I
+select count(*) from partition_csv_table;
+----
+4
+
+statement ok
+DROP TABLE partition_csv_table
+
+statement ok
+set datafusion.execution.listing_table_ignore_subdirectory = true;
+
+statement ok
+CREATE EXTERNAL TABLE partition_csv_table (
+  name VARCHAR,
+  ts TIMESTAMP,
+  c_date DATE,
+)
+STORED AS CSV
+PARTITIONED BY (c_date)
+LOCATION '../core/tests/data/partitioned_table';
+
+query TT
+explain select count(*) from partition_csv_table;
+----
+logical_plan
+Aggregate: groupBy=[[]], aggr=[[COUNT(UInt8(1)) AS COUNT(*)]]
+--TableScan: partition_csv_table projection=[]
+physical_plan
+AggregateExec: mode=Final, gby=[], aggr=[COUNT(*)]
+--CoalescePartitionsExec
+----AggregateExec: mode=Partial, gby=[], aggr=[COUNT(*)]
+------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2
+--------CsvExec: file_groups={2 groups: 
[[WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_table/c_date=2018-11-13/timestamps.csv],
 
[WORKSPACE_ROOT/datafusion/core/tests/data/partitioned_table/c_date=2018-12-13/timestamps.csv]]},
 has_header=false
+
+query I
+select count(*) from partition_csv_table;

Review Comment:
   I don't understand what this test is testing -- in both cases the table has 
4 rows (i.e. there is no data in a subdirectory to ignore, right?)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to