kiszk commented on a change in pull request #10114: URL: https://github.com/apache/arrow/pull/10114#discussion_r767175296
########## File path: java/dataset/src/test/java/org/apache/arrow/dataset/file/TestFileSystemDataset.java ########## @@ -129,6 +137,29 @@ public void testParquetBatchSize() throws Exception { AutoCloseables.close(datum); } + @Test + public void testParquetDirectoryRead() throws Exception { + final File outputFolder = TMP.newFolder(); + ParquetWriteSupport.writeTempFile(AVRO_SCHEMA_USER, outputFolder, + 1, "a", 2, "b", 3, "c"); + ParquetWriteSupport.writeTempFile(AVRO_SCHEMA_USER, outputFolder, + 4, "e", 5, "f", 6, "g", 7, "h"); + String expectedJsonUnordered = "[[1,\"a\"],[2,\"b\"],[3,\"c\"],[4,\"e\"],[5,\"f\"],[6,\"g\"],[7,\"h\"]]"; + + ScanOptions options = new ScanOptions(new String[0], 1); + FileSystemDatasetFactory factory = new FileSystemDatasetFactory(rootAllocator(), NativeMemoryPool.getDefault(), + FileFormat.PARQUET, outputFolder.toURI().toString()); + Schema schema = inferResultSchemaFromFactory(factory, options); + List<ArrowRecordBatch> datum = collectResultFromFactory(factory, options); + + assertSingleTaskProduced(factory, options); + assertEquals(7, datum.size()); + datum.forEach(batch -> assertEquals(1, batch.getLength())); + checkParquetReadResult(schema, expectedJsonUnordered, datum); + + AutoCloseables.close(datum); Review comment: Can we use `try (... datum = ...) { ... }` at line 153, so that `datum` is closed automatically and this explicit `AutoCloseables.close(datum)` line can be removed?
########## File path: java/dataset/src/test/java/org/apache/arrow/dataset/ParquetWriteSupport.java ########## @@ -42,13 +43,15 @@ private final Schema avroSchema; private final List<GenericRecord> writtenRecords = new ArrayList<>(); private final GenericRecordListBuilder recordListBuilder = new GenericRecordListBuilder(); + private final Random random = new Random(); public ParquetWriteSupport(String schemaName, File outputFolder) throws Exception { avroSchema = readSchemaFromFile(schemaName); - path = outputFolder.getPath() + File.separator + "generated.parquet"; + path = outputFolder.getPath() + File.separator + "generated-" + random.nextLong() + ".parquet"; uri = "file://" + path; - writer = AvroParquetWriter.<GenericRecord>builder(new org.apache.hadoop.fs.Path(path)) + writer = AvroParquetWriter Review comment: nit: Do we need this format change? ########## File path: java/dataset/src/test/java/org/apache/arrow/dataset/ParquetWriteSupport.java ########## @@ -42,13 +43,15 @@ private final Schema avroSchema; private final List<GenericRecord> writtenRecords = new ArrayList<>(); private final GenericRecordListBuilder recordListBuilder = new GenericRecordListBuilder(); + private final Random random = new Random(); public ParquetWriteSupport(String schemaName, File outputFolder) throws Exception { avroSchema = readSchemaFromFile(schemaName); - path = outputFolder.getPath() + File.separator + "generated.parquet"; + path = outputFolder.getPath() + File.separator + "generated-" + random.nextLong() + ".parquet"; Review comment: I think the intent of this change is to get a file name that is unique for a short period. How about using `System.currentTimeMillis()` instead? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org