This is an automated email from the ASF dual-hosted git repository. bchapuis pushed a commit to branch csv-datastore in repository https://gitbox.apache.org/repos/asf/incubator-baremaps.git
package org.apache.baremaps.storage.csv;

import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.List;
import java.util.Objects;

import org.apache.baremaps.data.storage.DataSchema;
import org.apache.baremaps.data.storage.DataStore;
import org.apache.baremaps.data.storage.DataStoreException;
import org.apache.baremaps.data.storage.DataTable;

/**
 * A read-only {@link DataStore} implementation that exposes a single CSV file as one
 * {@link DataTable}.
 *
 * <p>Mutating operations ({@code add}, {@code remove}) are not supported and throw
 * {@link UnsupportedOperationException}.
 */
public class CsvDataStore implements DataStore {

  /** The name under which the single CSV-backed table is published. */
  private final String tableName;

  /** The schema describing the columns of the CSV file. */
  private final DataSchema schema;

  /** The lazily-iterated table view over the CSV file. */
  private final CsvDataTable dataTable;

  /**
   * Constructs a CsvDataStore with the specified table name, schema, and CSV file.
   *
   * @param tableName the name of the table
   * @param schema the data schema defining the structure
   * @param csvFile the CSV file to read data from
   * @param hasHeader whether the CSV file includes a header row
   * @param separator the character used to separate columns in the CSV file
   * @throws IOException if an I/O error occurs while reading the CSV file
   * @throws NullPointerException if {@code tableName}, {@code schema}, or {@code csvFile} is null
   */
  public CsvDataStore(String tableName, DataSchema schema, File csvFile, boolean hasHeader,
      char separator) throws IOException {
    // Fail fast with a named argument instead of a late NPE deep inside table construction.
    this.tableName = Objects.requireNonNull(tableName, "tableName");
    this.schema = Objects.requireNonNull(schema, "schema");
    Objects.requireNonNull(csvFile, "csvFile");
    this.dataTable = new CsvDataTable(schema, csvFile, hasHeader, separator);
  }

  /** {@inheritDoc} */
  @Override
  public List<String> list() throws DataStoreException {
    return Collections.singletonList(tableName);
  }

  /** {@inheritDoc} */
  @Override
  public DataTable get(String name) throws DataStoreException {
    if (this.tableName.equals(name)) {
      return dataTable;
    }
    throw new DataStoreException("Table '" + name + "' not found.");
  }

  /** Unsupported: this store is read-only. */
  @Override
  public void add(DataTable table) throws DataStoreException {
    throw new UnsupportedOperationException("Adding tables is not supported in CsvDataStore.");
  }

  /** Unsupported: this store is read-only. */
  @Override
  public void add(String name, DataTable table) throws DataStoreException {
    throw new UnsupportedOperationException("Adding tables is not supported in CsvDataStore.");
  }

  /** Unsupported: this store is read-only. */
  @Override
  public void remove(String name) throws DataStoreException {
    throw new UnsupportedOperationException("Removing tables is not supported in CsvDataStore.");
  }
}
package org.apache.baremaps.storage.csv;

import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.MappingIterator;
import com.fasterxml.jackson.dataformat.csv.CsvMapper;
import com.fasterxml.jackson.dataformat.csv.CsvSchema;
import org.apache.baremaps.data.storage.DataColumn;
import org.apache.baremaps.data.storage.DataRow;
import org.apache.baremaps.data.storage.DataSchema;
import org.apache.baremaps.data.storage.DataTable;
import org.locationtech.jts.io.WKTReader;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

/**
 * A {@link DataTable} implementation that reads rows from a CSV file using Jackson.
 *
 * <p>The table is read-only; {@link #add(DataRow)} and {@link #clear()} throw
 * {@link UnsupportedOperationException}. The row count is computed once at construction
 * time, which requires a full pass over the file.
 */
public class CsvDataTable implements DataTable {

  /** The schema describing the columns of the CSV file. */
  private final DataSchema schema;

  /** The CSV file backing this table. */
  private final File csvFile;

  /** The Jackson schema used to parse the CSV file. */
  private final CsvSchema csvSchema;

  /** The number of data rows in the file, counted at construction time. */
  private final long size;

  /**
   * Constructs a CsvDataTable with the specified schema, CSV file, header presence, and separator.
   *
   * @param schema the data schema defining the structure
   * @param csvFile the CSV file to read data from
   * @param hasHeader whether the CSV file includes a header row
   * @param separator the character used to separate columns in the CSV file
   * @throws IOException if an I/O error occurs
   */
  public CsvDataTable(DataSchema schema, File csvFile, boolean hasHeader, char separator)
      throws IOException {
    this.schema = schema;
    this.csvFile = csvFile;
    this.csvSchema = buildCsvSchema(schema, hasHeader, separator);
    this.size = calculateSize();
  }

  /**
   * Builds the CsvSchema for Jackson based on the provided DataSchema, header presence, and
   * separator.
   *
   * @param dataSchema the data schema
   * @param hasHeader whether the CSV file includes a header row
   * @param separator the character used to separate columns
   * @return the CsvSchema for Jackson
   */
  private static CsvSchema buildCsvSchema(DataSchema dataSchema, boolean hasHeader,
      char separator) {
    CsvSchema.Builder builder = CsvSchema.builder();
    for (DataColumn column : dataSchema.columns()) {
      builder.addColumn(column.name());
    }
    return builder.setUseHeader(hasHeader).setColumnSeparator(separator).build();
  }

  /**
   * Calculates the number of rows in the CSV file.
   *
   * @return the number of rows
   * @throws IOException if an I/O error occurs
   */
  private long calculateSize() throws IOException {
    // try-with-resources guarantees the counting parser is closed.
    try (var parser = new CsvMapper().readerFor(Map.class)
        .with(csvSchema)
        .createParser(csvFile)) {
      long rowCount = 0;
      while (parser.nextToken() != null) {
        // Jackson emits one START_OBJECT per CSV record.
        if (parser.currentToken() == JsonToken.START_OBJECT) {
          rowCount++;
        }
      }
      return rowCount;
    }
  }

  /** {@inheritDoc} */
  @Override
  public DataSchema schema() {
    return schema;
  }

  /** Unsupported: this table is read-only. */
  @Override
  public boolean add(DataRow row) {
    throw new UnsupportedOperationException("Adding rows is not supported.");
  }

  /** Unsupported: this table is read-only. */
  @Override
  public void clear() {
    throw new UnsupportedOperationException("Clearing rows is not supported.");
  }

  /** {@inheritDoc} */
  @Override
  public long size() {
    return size;
  }

  /**
   * Returns an iterator over the rows of the CSV file.
   *
   * <p>The underlying parser is owned by the {@link MappingIterator} (created directly from the
   * file), so Jackson closes it automatically when the last row has been read. Previously a
   * standalone parser was created and handed to {@code readValues(parser)}, which Jackson does
   * not own and therefore never closed — a file-handle leak.
   *
   * @throws RuntimeException if the file cannot be opened or a value cannot be parsed
   */
  @Override
  public Iterator<DataRow> iterator() {
    try {
      MappingIterator<Map<String, String>> csvIterator = new CsvMapper()
          .readerFor(Map.class)
          .with(csvSchema)
          .readValues(csvFile);

      return new Iterator<>() {
        @Override
        public boolean hasNext() {
          return csvIterator.hasNext();
        }

        @Override
        public DataRow next() {
          if (!csvIterator.hasNext()) {
            // Iterator contract: signal exhaustion explicitly.
            throw new NoSuchElementException();
          }
          Map<String, String> csvRow = csvIterator.next();
          DataRow dataRow = schema.createRow();
          var columns = schema.columns();
          for (int i = 0; i < columns.size(); i++) {
            DataColumn column = columns.get(i);
            String value = csvRow.get(column.name());
            // Missing cells are stored as null rather than failing the whole row.
            dataRow.set(i, value == null ? null : parseValue(column, value));
          }
          return dataRow;
        }
      };
    } catch (IOException e) {
      throw new RuntimeException("Error reading CSV file", e);
    }
  }

  /**
   * Parses the string value from the CSV according to the column type.
   *
   * @param column the data column
   * @param value the string value from the CSV
   * @return the parsed value
   * @throws RuntimeException if the value cannot be parsed as the column's type
   */
  private Object parseValue(DataColumn column, String value) {
    DataColumn.Type type = column.type();
    try {
      return switch (type) {
        case STRING -> value;
        case INTEGER -> Integer.parseInt(value);
        case LONG -> Long.parseLong(value);
        case FLOAT -> Float.parseFloat(value);
        case DOUBLE -> Double.parseDouble(value);
        case BOOLEAN -> Boolean.parseBoolean(value);
        // All geometry types are encoded as WKT strings in the CSV.
        case GEOMETRY, POINT, LINESTRING, POLYGON, MULTIPOINT, MULTILINESTRING, MULTIPOLYGON,
            GEOMETRYCOLLECTION -> new WKTReader().read(value);
        default -> throw new IllegalArgumentException("Unsupported column type: " + type);
      };
    } catch (Exception e) {
      throw new RuntimeException("Error parsing value for column " + column.name(), e);
    }
  }

  /** {@inheritDoc} */
  @Override
  public Spliterator<DataRow> spliterator() {
    return Spliterators.spliteratorUnknownSize(iterator(), Spliterator.ORDERED);
  }

  /** {@inheritDoc} */
  @Override
  public Stream<DataRow> stream() {
    return StreamSupport.stream(spliterator(), false);
  }
}
package org.apache.baremaps.storage.csv;

import static org.junit.jupiter.api.Assertions.*;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import org.apache.baremaps.data.storage.*;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.locationtech.jts.geom.Geometry;

/**
 * Unit tests for {@link CsvDataTable} covering separators, headers, type parsing, invalid data,
 * unsupported mutations, and size calculation.
 */
class CsvDataTableTest {

  /** Temporary CSV file recreated before every test. */
  private File tempCsvFile;

  @BeforeEach
  void setUp() throws IOException {
    tempCsvFile = File.createTempFile("test", ".csv");
    tempCsvFile.deleteOnExit();
  }

  @AfterEach
  void tearDown() {
    if (tempCsvFile.exists()) {
      tempCsvFile.delete();
    }
  }

  @Test
  void testCsvWithHeaderAndCommaSeparator() throws IOException {
    Files.writeString(tempCsvFile.toPath(), """
        id,name,geom
        1,PointA,"POINT(1 1)"
        2,PointB,"POINT(2 2)"
        """);

    var schema = new DataSchemaImpl("test_table", List.of(
        new DataColumnFixed("id", DataColumn.Cardinality.REQUIRED, DataColumn.Type.INTEGER),
        new DataColumnFixed("name", DataColumn.Cardinality.OPTIONAL, DataColumn.Type.STRING),
        new DataColumnFixed("geom", DataColumn.Cardinality.OPTIONAL, DataColumn.Type.GEOMETRY)));
    var dataTable = new CsvDataTable(schema, tempCsvFile, true, ',');

    assertEquals(2, dataTable.size());

    var rowCount = 0;
    for (var row : dataTable) {
      rowCount++;
      var id = (Integer) row.get("id");
      var name = (String) row.get("name");
      var geometry = (Geometry) row.get("geom");
      assertNotNull(id);
      assertNotNull(name);
      assertNotNull(geometry);
      assertEquals("Point" + (rowCount == 1 ? "A" : "B"), name);
      assertEquals("POINT (" + rowCount + " " + rowCount + ")", geometry.toText());
    }
    assertEquals(2, rowCount);
  }

  @Test
  void testCsvWithoutHeaderAndSemicolonSeparator() throws IOException {
    Files.writeString(tempCsvFile.toPath(), """
        1;PointA;"POINT(1 1)"
        2;PointB;"POINT(2 2)"
        """);

    var schema = new DataSchemaImpl("test_table", List.of(
        new DataColumnFixed("column1", DataColumn.Cardinality.REQUIRED, DataColumn.Type.INTEGER),
        new DataColumnFixed("column2", DataColumn.Cardinality.OPTIONAL, DataColumn.Type.STRING),
        new DataColumnFixed("column3", DataColumn.Cardinality.OPTIONAL,
            DataColumn.Type.GEOMETRY)));
    var dataTable = new CsvDataTable(schema, tempCsvFile, false, ';');

    assertEquals(2, dataTable.size());

    var rowCount = 0;
    for (var row : dataTable) {
      rowCount++;
      var id = (Integer) row.get("column1");
      var name = (String) row.get("column2");
      var geometry = (Geometry) row.get("column3");

      // Verify data
      assertNotNull(id);
      assertNotNull(name);
      assertNotNull(geometry);

      assertEquals("Point" + (rowCount == 1 ? "A" : "B"), name);
      assertEquals("POINT (" + rowCount + " " + rowCount + ")", geometry.toText());
    }
    assertEquals(2, rowCount);
  }

  @Test
  void testCsvWithDifferentDataTypes() throws IOException {
    Files.writeString(tempCsvFile.toPath(), """
        int_col,double_col,bool_col,string_col
        1,1.1,true,Hello
        2,2.2,false,World
        """);

    var schema = new DataSchemaImpl("test_table", List.of(
        new DataColumnFixed("int_col", DataColumn.Cardinality.REQUIRED, DataColumn.Type.INTEGER),
        new DataColumnFixed("double_col", DataColumn.Cardinality.REQUIRED,
            DataColumn.Type.DOUBLE),
        new DataColumnFixed("bool_col", DataColumn.Cardinality.REQUIRED, DataColumn.Type.BOOLEAN),
        new DataColumnFixed("string_col", DataColumn.Cardinality.REQUIRED,
            DataColumn.Type.STRING)));
    var dataTable = new CsvDataTable(schema, tempCsvFile, true, ',');

    assertEquals(2, dataTable.size());

    var rowCount = 0;
    for (var row : dataTable) {
      rowCount++;
      var intCol = (Integer) row.get("int_col");
      var doubleCol = (Double) row.get("double_col");
      var boolCol = (Boolean) row.get("bool_col");
      var stringCol = (String) row.get("string_col");

      // Verify data
      assertEquals(rowCount, intCol);
      assertEquals(rowCount * 1.1, doubleCol);
      assertEquals(rowCount == 1, boolCol);
      assertEquals(rowCount == 1 ? "Hello" : "World", stringCol);
    }
    assertEquals(2, rowCount);
  }

  @Test
  void testCsvWithInvalidData() throws IOException {
    Files.writeString(tempCsvFile.toPath(), """
        id,name
        abc,TestName
        """);

    var schema = new DataSchemaImpl("test_table", List.of(
        new DataColumnFixed("id", DataColumn.Cardinality.REQUIRED, DataColumn.Type.INTEGER),
        new DataColumnFixed("name", DataColumn.Cardinality.OPTIONAL, DataColumn.Type.STRING)));
    var dataTable = new CsvDataTable(schema, tempCsvFile, true, ',');

    assertThrows(RuntimeException.class, () -> {
      for (var ignored : dataTable) {
        // Consuming the row forces parsing of "abc" as an integer, which must fail.
      }
    });
  }

  @Test
  void testAddAndClearUnsupportedOperations() throws IOException {
    Files.writeString(tempCsvFile.toPath(), "");

    var schema = new DataSchemaImpl("test_table", List.of(
        new DataColumnFixed("id", DataColumn.Cardinality.REQUIRED, DataColumn.Type.INTEGER),
        new DataColumnFixed("name", DataColumn.Cardinality.OPTIONAL, DataColumn.Type.STRING)));
    var dataTable = new CsvDataTable(schema, tempCsvFile, true, ',');

    assertThrows(UnsupportedOperationException.class, () -> dataTable.add(null));
    assertThrows(UnsupportedOperationException.class, dataTable::clear);
  }

  @Test
  void testSizeCalculation() throws IOException {
    Files.writeString(tempCsvFile.toPath(), """
        id,name
        1,Name1
        2,Name2
        3,Name3
        """);

    var schema = new DataSchemaImpl("test_table", List.of(
        new DataColumnFixed("id", DataColumn.Cardinality.REQUIRED, DataColumn.Type.INTEGER),
        new DataColumnFixed("name", DataColumn.Cardinality.OPTIONAL, DataColumn.Type.STRING)));
    var dataTable = new CsvDataTable(schema, tempCsvFile, true, ',');

    assertEquals(3, dataTable.size());
  }
}
