This is an automated email from the ASF dual-hosted git repository.
snlee pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new e1a66512dc CSVRecordReader – Set header record when header is
explicitly provided (#11594)
e1a66512dc is described below
commit e1a66512dcb9d8ac5bf6ef5d1454961fc6d997ad
Author: Ragesh Rajagopalan <[email protected]>
AuthorDate: Thu Sep 14 20:44:11 2023 -0700
CSVRecordReader – Set header record when header is explicitly provided
(#11594)
---
.../plugin/inputformat/csv/CSVRecordReader.java | 11 ++++--
.../inputformat/csv/CSVRecordReaderTest.java | 45 +++++++++++++++++++++-
.../src/test/resources/dataFileWithNoHeader2.csv | 4 ++
.../test/resources/dataFileWithValidHeaders.csv | 5 +++
4 files changed, 60 insertions(+), 5 deletions(-)
diff --git
a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
index 49339ac218..c4dc8c167f 100644
---
a/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
+++
b/pinot-plugins/pinot-input-format/pinot-csv/src/main/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReader.java
@@ -137,11 +137,11 @@ public class CSVRecordReader implements RecordReader {
}
if (_isHeaderProvided) {
+ _headerMap = parseLineAsHeader(config.getHeader());
+ _format = _format.builder().setHeader(_headerMap.keySet().toArray(new
String[0])).build();
if (!_useLineIterator) {
validateHeaderForDelimiter(delimiter, config.getHeader(), _format);
}
- _headerMap = parseLineAsHeader(config.getHeader());
- _format = _format.builder().setHeader(_headerMap.keySet().toArray(new
String[0])).build();
}
if (config.isMultiValueDelimiterEnabled()) {
@@ -329,7 +329,12 @@ public class CSVRecordReader implements RecordReader {
// read the first line
String headerLine = _bufferedReader.readLine();
_headerMap = parseLineAsHeader(headerLine);
- _format = _format.builder().setHeader(_headerMap.keySet().toArray(new
String[0])).build();
+ _format = _format.builder()
+ // If header isn't provided, the first line would be set as header
and the 'skipHeader' property
+ // is set to false.
+ .setSkipHeaderRecord(false)
+ .setHeader(_headerMap.keySet().toArray(new String[0]))
+ .build();
}
_nextLine = _bufferedReader.readLine();
}
diff --git
a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java
b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java
index e174ced7e5..d245fb33c0 100644
---
a/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java
+++
b/pinot-plugins/pinot-input-format/pinot-csv/src/test/java/org/apache/pinot/plugin/inputformat/csv/CSVRecordReaderTest.java
@@ -532,6 +532,46 @@ public class CSVRecordReaderTest extends
AbstractRecordReaderTest {
// Note: The default CSVRecordReader cannot handle unparseable rows
}
+ @Test
+ public void testReadingDataFileWithNoHeaderAndDataRecordsWithEmptyValues()
+ throws URISyntaxException, IOException {
+ URI uri =
ClassLoader.getSystemResource("dataFileWithNoHeader2.csv").toURI();
+ File dataFile = new File(uri);
+
+ // test using line iterator
+ CSVRecordReaderConfig readerConfig = new CSVRecordReaderConfig();
+ readerConfig.setSkipUnParseableLines(true);
+ readerConfig.setHeader("key,num0,num1");
+ List<GenericRow> genericRows = readCSVRecords(dataFile, readerConfig,
null, false);
+ Assert.assertEquals(4, genericRows.size());
+
+ // test using default CSVRecordReader
+ readerConfig.setSkipUnParseableLines(false);
+ genericRows = readCSVRecords(dataFile, readerConfig, null, false);
+ Assert.assertEquals(4, genericRows.size());
+ }
+
+ @Test
+ public void testReadingDataFileWithValidHeaders()
+ throws URISyntaxException, IOException {
+ URI uri =
ClassLoader.getSystemResource("dataFileWithValidHeaders.csv").toURI();
+ File dataFile = new File(uri);
+
+ // test using line iterator
+ CSVRecordReaderConfig readerConfig = new CSVRecordReaderConfig();
+ readerConfig.setSkipUnParseableLines(true);
+ // No explicit header is set and attempt to skip the header should be
ignored. 1st line would be treated as the
+ // header line.
+ readerConfig.setSkipHeader(false);
+ List<GenericRow> genericRows = readCSVRecords(dataFile, readerConfig,
null, false);
+ Assert.assertEquals(4, genericRows.size());
+
+ // test using default CSVRecordReader
+ readerConfig.setSkipUnParseableLines(false);
+ genericRows = readCSVRecords(dataFile, readerConfig, null, false);
+ Assert.assertEquals(4, genericRows.size());
+ }
+
private List<GenericRow> readCSVRecords(File dataFile,
CSVRecordReaderConfig readerConfig, GenericRow genericRow, boolean
rewind)
throws IOException {
@@ -543,10 +583,11 @@ public class CSVRecordReaderTest extends
AbstractRecordReaderTest {
while (recordReader.hasNext()) {
if (genericRow != null) {
recordReader.next(reuse);
+ genericRows.add(reuse);
} else {
- recordReader.next();
+ GenericRow nextRow = recordReader.next();
+ genericRows.add(nextRow);
}
- genericRows.add(genericRow);
}
if (rewind) {
diff --git
a/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv
b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv
new file mode 100644
index 0000000000..e54016ac4c
--- /dev/null
+++
b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithNoHeader2.csv
@@ -0,0 +1,4 @@
+"key00",12.3,8.42
+"key01",,7.1
+"key02",,16.81
+"key03",,7.12
diff --git
a/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv
b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv
new file mode 100644
index 0000000000..010cab05fa
--- /dev/null
+++
b/pinot-plugins/pinot-input-format/pinot-csv/src/test/resources/dataFileWithValidHeaders.csv
@@ -0,0 +1,5 @@
+"key","num0","num1"
+"key00",12.3,8.42
+"key01",,7.1
+"key02",,16.81
+"key03",,7.12
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]