github-advanced-security[bot] commented on code in PR #851:
URL: https://github.com/apache/incubator-baremaps/pull/851#discussion_r1570922193

##########
baremaps-core/src/main/java/org/apache/baremaps/storage/geoparquet/GeoParquetTable.java:
##########
@@ -0,0 +1,542 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.baremaps.storage.geoparquet;
+
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.base.Objects;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.*;
+import java.util.stream.StreamSupport;
+import org.apache.baremaps.database.collection.AbstractDataCollection;
+import org.apache.baremaps.database.schema.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.avro.AvroReadSupport;
+import org.apache.parquet.column.page.PageReadStore;
+import org.apache.parquet.example.data.Group;
+import org.apache.parquet.example.data.simple.SimpleGroup;
+import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.ColumnIOFactory;
+import org.apache.parquet.io.RecordReader;
+import org.apache.parquet.schema.LogicalTypeAnnotation;
+import org.apache.parquet.schema.MessageType;
+import org.apache.parquet.schema.PrimitiveType;
+import org.apache.parquet.schema.Type;
+import org.locationtech.jts.geom.Geometry;
+import org.locationtech.jts.io.WKBReader;
+
+public class GeoParquetTable extends AbstractDataCollection<DataRow> implements DataTable {
+
+  private Configuration configuration;
+
+  private WKBReader wkbReader = new WKBReader();
+
+  private Map<FileStatus, FileInfo> metadata = new HashMap<>();
+
+  private Set<String> geometryColumns;
+
+  private DataRowType rowType;
+
+  private long rowCount;
+
+  record FileInfo(
+      long rowCount,
+      ParquetMetadata parquetMetadata,
+      GeoParquetMetadata geoParquetMetadata,
+      DataRowType dataRowType) {
+  }
+
+  public GeoParquetTable(String uri) {
+    this.configuration = getConfiguration();
+
+    try {
+      URI fullUri = FileStatusIterator.getFullUri(uri);
+      Path globPath = new Path(fullUri.getPath());
+
+      URI rootUri = FileStatusIterator.getRootUri(fullUri);
+      FileSystem fileSystem = FileSystem.get(rootUri, configuration);
+
+      List<FileStatus> files = Arrays.asList(fileSystem.globStatus(globPath));
+
+      for (FileStatus fileStatus : files) {
+        try (ParquetFileReader reader = ParquetFileReader
+            .open(HadoopInputFile.fromPath(fileStatus.getPath(), configuration))) {
+
+          long rowCount = reader.getRecordCount();
+          ParquetMetadata parquetMetadata = reader.getFooter();
+
+          String json = reader.getFooter().getFileMetaData().getKeyValueMetaData().get("geo");
+          GeoParquetMetadata fileMetadata = new ObjectMapper()
+              .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false)
+              .readValue(json, GeoParquetMetadata.class);
+
+          List<DataColumn> dataColumns = new ArrayList<>();
+          List<Type> types = parquetMetadata.getFileMetaData().getSchema().getFields();
+          for (Type type : types) {
+            String name = type.getName();
+            if (type.isPrimitive()) {
+              PrimitiveType primitiveType = type.asPrimitiveType();
+              DataColumn.Type columnType = switch (primitiveType.getPrimitiveTypeName()) {
+                case BINARY -> {
+                  if (fileMetadata.getColumns().containsKey(name)) {
+                    yield DataColumn.Type.GEOMETRY;
+                  } else if (primitiveType.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) {
+                    yield DataColumn.Type.STRING;
+                  } else {
+                    yield DataColumn.Type.BYTE_ARRAY;
+                  }
+                }
+                case INT64 -> DataColumn.Type.LONG;
+                case INT32 -> DataColumn.Type.INTEGER;
+                case BOOLEAN -> DataColumn.Type.BOOLEAN;
+                case FLOAT -> DataColumn.Type.FLOAT;
+                case DOUBLE -> DataColumn.Type.DOUBLE;
+                case INT96 -> DataColumn.Type.BYTE_ARRAY;
+                case FIXED_LEN_BYTE_ARRAY -> DataColumn.Type.BYTE_ARRAY;
+              };
+              dataColumns.add(new DataColumnImpl(name, columnType));
+            }
+          }
+
+          DataRowType dataRowType = new DataRowTypeImpl(uri, dataColumns);
+          this.metadata.put(fileStatus, new FileInfo(rowCount, parquetMetadata, fileMetadata, dataRowType));
+        }
+      }
+
+      for (FileInfo fileInfo : metadata.values()) {
+        rowCount += fileInfo.rowCount();
+
+        if (rowType == null) {
+          rowType = fileInfo.dataRowType();
+          geometryColumns = fileInfo.geoParquetMetadata().getColumns().keySet();
+        } else if (!rowType.equals(fileInfo.dataRowType())) {
+          throw new IllegalArgumentException("Inconsistent row types");
+        }
+      }
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public Iterator<DataRow> iterator() {
+    try {
+      return StreamSupport
+          .stream(Spliterators.spliteratorUnknownSize(new DataRowIterator(), 0),
+              false)
+          .map(values -> (DataRow) new DataRowImpl(rowType(), values))
+          .iterator();
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  @Override
+  public long sizeAsLong() {
+    return rowCount;
+  }
+
+  @Override
+  public DataRowType rowType() {
+    return rowType;
+  }
+
+  private static Configuration getConfiguration() {
+    Configuration configuration = new Configuration();
+    configuration.set("fs.s3a.aws.credentials.provider",
+        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider");
+    configuration.setBoolean("fs.s3a.path.style.access", true);
+    configuration.setBoolean(AvroReadSupport.READ_INT96_AS_FIXED, true);
+    return configuration;
+  }
+
+  private List<Object> asValues(GeoParquetMetadata geoParquetMetadata, SimpleGroup simpleGroup) {
+    List<Object> values = new ArrayList<>();
+    List<Type> fields = simpleGroup.getType().getFields();
+    for (int i = 0; i < fields.size(); i++) {
+      Type field = fields.get(i);
+      String name = field.getName();
+      if (field.isPrimitive()) {
+        PrimitiveType primitiveType = field.asPrimitiveType();
+        Object value = null;
+        try {
+          value = switch (primitiveType.getPrimitiveTypeName()) {
+            case BINARY -> {
+              if (geometryColumns.contains(name)) {
+                byte[] bytes = simpleGroup.getBinary(i, 0).getBytes();
+                Geometry geometry = wkbReader.read(bytes);
+
+                // TODO: set the SRID correctly
+                int srid = geoParquetMetadata.getColumns().get(name).getCrs().get("id").get("code").asInt(4326);
+                geometry.setSRID(srid);
+
+                yield geometry;
+              } else if (primitiveType.getLogicalTypeAnnotation() == LogicalTypeAnnotation.stringType()) {
+                yield simpleGroup.getString(i, 0);
+              } else {
+                yield simpleGroup.getBinary(i, 0).getBytes();
+              }
+            }
+            case INT64 -> simpleGroup.getLong(i, 0);
+            case INT32 -> simpleGroup.getInteger(i, 0);
+            case BOOLEAN -> simpleGroup.getBoolean(i, 0);
+            case FLOAT -> simpleGroup.getFloat(i, 0);
+            case DOUBLE -> simpleGroup.getDouble(i, 0);
+            case INT96 -> simpleGroup.getInt96(i, 0).getBytes();
+            case FIXED_LEN_BYTE_ARRAY -> simpleGroup.getBinary(i, 0).getBytes();
+          };
+        } catch (Exception e) {
+          // not found
+        }
+        values.add(value);
+      }
+    }
+
+    return values;
+  }
+
+  private class DataRowIterator implements Iterator<List<Object>> {
+
+    private Iterator<Map.Entry<FileStatus, FileInfo>> fileIterator;
+
+    private Map.Entry<FileStatus, FileInfo> currentFileStatus;
+    private Iterator<PageReadStore> pageReadStoreIterator;
+
+    private PageReadStore currentPageReadStore;
+
+    private Iterator<SimpleGroup> simpleGroupIterator;
+
+    private SimpleGroup currentSimpleGroup;
+
+    public DataRowIterator() throws IOException {
+      this.fileIterator = metadata.entrySet().iterator();
+      this.currentFileStatus = fileIterator.next();
+      this.pageReadStoreIterator = new PageReadStoreIterator(currentFileStatus);
+      this.currentPageReadStore = pageReadStoreIterator.next();
+      this.simpleGroupIterator = new SimpleGroupIterator(
+          currentFileStatus.getValue().parquetMetadata().getFileMetaData().getSchema(),
+          currentPageReadStore);
+      this.currentSimpleGroup = simpleGroupIterator.next();
+    }
+
+    @Override
+    public boolean hasNext() {
+      if (simpleGroupIterator.hasNext()) {
+        return true;
+      } else if (pageReadStoreIterator.hasNext()) {
+        currentPageReadStore = pageReadStoreIterator.next();
+        simpleGroupIterator = new SimpleGroupIterator(
+            currentFileStatus.getValue().parquetMetadata().getFileMetaData().getSchema(),
+            currentPageReadStore);
+        return hasNext();
+      } else if (fileIterator.hasNext()) {
+        currentFileStatus = fileIterator.next();
+        try {
+          pageReadStoreIterator = new PageReadStoreIterator(currentFileStatus);
+          return hasNext();
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public List<Object> next() {
+      currentSimpleGroup = simpleGroupIterator.next();
+      return asValues(currentFileStatus.getValue().geoParquetMetadata(), currentSimpleGroup);
+    }
+  }
+
+  private static class FileStatusIterator implements Iterator<FileStatus> {
+
+    private final Iterator<FileStatus> fileStatusIterator;
+
+    public FileStatusIterator(String uri, Configuration configuration)
+        throws URISyntaxException, IOException {
+      URI fullUri = getFullUri(uri);
+      Path globPath = new Path(fullUri.getPath());
+
+      URI rootUri = getRootUri(fullUri);
+      FileSystem fileSystem = FileSystem.get(rootUri, configuration);
+
+      FileStatus[] files = fileSystem.globStatus(globPath);
+      fileStatusIterator = Arrays.asList(files).iterator();
+    }
+
+    private static URI getFullUri(String uri) throws URISyntaxException {
+      return new URI(uri);
+    }
+
+    private static URI getRootUri(URI uri) throws URISyntaxException {
+      return new URI(uri.getScheme(), uri.getUserInfo(), uri.getHost(), uri.getPort(), null, null,
+          null);
+    }
+
+    @Override
+    public boolean hasNext() {
+      return fileStatusIterator.hasNext();
+    }
+
+    @Override
+    public FileStatus next() {
+      return fileStatusIterator.next();
+    }
+  }
+
+  private class PageReadStoreIterator implements Iterator<PageReadStore> {

Review Comment:
   ## Inner class could be static

   PageReadStoreIterator could be made static, since the enclosing instance is used only in its constructor.

   [Show more details](https://github.com/apache/incubator-baremaps/security/code-scanning/999)
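
   For illustration, a minimal sketch of the suggested refactoring. The only enclosing state the iterator reads is the Hadoop `Configuration`, so it can be passed in explicitly; the `(FileStatus, Configuration)` constructor signature and the dropped `messageType` field are assumptions for the sketch, not the PR's code:

   ```java
   // Declared static: no hidden reference to the enclosing GeoParquetTable is
   // captured; the Configuration becomes an explicit constructor argument.
   private static class PageReadStoreIterator implements Iterator<PageReadStore> {

     private final ParquetFileReader parquetFileReader;

     private PageReadStore next;

     PageReadStoreIterator(FileStatus fileStatus, Configuration configuration) throws IOException {
       this.parquetFileReader = ParquetFileReader
           .open(HadoopInputFile.fromPath(fileStatus.getPath(), configuration));
       this.next = parquetFileReader.readNextRowGroup();
     }

     @Override
     public boolean hasNext() {
       return next != null;
     }

     @Override
     public PageReadStore next() {
       try {
         PageReadStore current = next;
         // readNextRowGroup() returns null once all row groups are consumed.
         next = parquetFileReader.readNextRowGroup();
         if (next == null) {
           parquetFileReader.close();
         }
         return current;
       } catch (IOException e) {
         throw new RuntimeException(e);
       }
     }
   }
   ```

   Callers would then construct it as `new PageReadStoreIterator(entry.getKey(), configuration)` instead of relying on the outer instance.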
##########
baremaps-core/src/main/java/org/apache/baremaps/storage/geoparquet/GeoParquetTable.java:
##########
@@ -0,0 +1,542 @@
+  private class PageReadStoreIterator implements Iterator<PageReadStore> {
+
+    private final ParquetFileReader parquetFileReader;
+
+    private final MessageType messageType;
+
+    private PageReadStore next;
+
+    public PageReadStoreIterator(Map.Entry<FileStatus, FileInfo> fileInfo) throws IOException {
+      this.parquetFileReader = ParquetFileReader
+          .open(HadoopInputFile.fromPath(fileInfo.getKey().getPath(), configuration));
+      this.messageType = this.parquetFileReader.getFooter().getFileMetaData().getSchema();
+      try {
+        next = parquetFileReader.readNextRowGroup();
+      } catch (IOException e) {
+        parquetFileReader.close();
+        throw new RuntimeException(e);
+      }
+    }
+
+    @Override
+    public boolean hasNext() {
+      boolean hasNext = next != null;
+      if (!hasNext) {
+        try {
+          parquetFileReader.close();
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      }
+      return hasNext;
+    }
+
+    @Override
+    public PageReadStore next() {
+      try {
+        PageReadStore current = next;
+        next = parquetFileReader.readNextRowGroup();
+        if (next == null) {
+          try {
+            parquetFileReader.close();
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+        }
+        return current;
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+    }
+  }
+
+  private class SimpleGroupIterator implements Iterator<SimpleGroup> {

Review Comment:
   ## Inner class could be static

   SimpleGroupIterator should be made static, since the enclosing instance is not used.

   [Show more details](https://github.com/apache/incubator-baremaps/security/code-scanning/1000)
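
   Here the fix is smaller still, since the class never touches `GeoParquetTable.this`; a sketch, assuming the class body stays exactly as in the diff above:

   ```java
   // Only the declaration changes: adding `static` stops the compiler from
   // threading a reference to the enclosing GeoParquetTable through every
   // SimpleGroupIterator instance.
   private static class SimpleGroupIterator implements Iterator<SimpleGroup> {
     // ... body unchanged: it iterates the SimpleGroups of the PageReadStore
     // passed to its constructor ...
   }
   ```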
##########
baremaps-core/src/test/java/org/apache/baremaps/storage/geoparquet/GeoParquetTableTest.java:
##########
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.baremaps.storage.geoparquet;
+
+import static org.junit.Assert.assertEquals;
+
+import org.apache.baremaps.testing.TestFiles;
+import org.junit.jupiter.api.Test;
+
+class GeoParquetTableTest {

Review Comment:
   ## Unused classes and interfaces

   Unused class: GeoParquetTableTest is not referenced within this codebase. If not used as an external API it should be removed.

   [Show more details](https://github.com/apache/incubator-baremaps/security/code-scanning/1002)

##########
baremaps-core/src/main/java/org/apache/baremaps/storage/geoparquet/GeoParquetTable.java:
##########
@@ -0,0 +1,542 @@
+      for (FileStatus fileStatus : files) {
+        try (ParquetFileReader reader = ParquetFileReader
+            .open(HadoopInputFile.fromPath(fileStatus.getPath(), configuration))) {
+
+          long rowCount = reader.getRecordCount();

Review Comment:
   ## Possible confusion of local and field

   Confusing name: [GeoParquetTable](1) also refers to field [rowCount](2) (without qualifying it with 'this').

   [Show more details](https://github.com/apache/incubator-baremaps/security/code-scanning/1001)
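
   One way to address the warning is to rename the local so it can no longer shadow the field, and to qualify the field where it is meant; a sketch against the constructor shown above (the name `fileRowCount` is illustrative):

   ```java
   // In the per-file loop: a distinct local name makes it obvious this is the
   // per-file count, not the GeoParquetTable.rowCount field.
   long fileRowCount = reader.getRecordCount();
   // ... rest of the loop body unchanged, ending with:
   this.metadata.put(fileStatus, new FileInfo(fileRowCount, parquetMetadata, fileMetadata, dataRowType));

   // In the aggregation loop, qualify the field explicitly:
   for (FileInfo fileInfo : metadata.values()) {
     this.rowCount += fileInfo.rowCount();
     // ...
   }
   ```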
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]