github-advanced-security[bot] commented on code in PR #855:
URL: https://github.com/apache/incubator-baremaps/pull/855#discussion_r1609042961


##########
baremaps-geoparquet/src/main/java/org/apache/baremaps/geoparquet/GeoParquetReader.java:
##########
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.baremaps.geoparquet;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.*;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+import org.apache.baremaps.geoparquet.data.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.avro.AvroReadSupport;
+import org.apache.parquet.column.page.PageReadStore;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.FileMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.ColumnIOFactory;
+import org.apache.parquet.io.RecordReader;
+import org.apache.parquet.schema.MessageType;
+import org.locationtech.jts.io.WKBReader;
+
+public class GeoParquetReader {
+
+  private final URI uri;
+
+  private Configuration configuration;
+
+  private WKBReader wkbReader = new WKBReader();
+
+  private Map<FileStatus, GeoParquetFileInfo> metadata = new LinkedHashMap<>();
+
+  private long rowCount;
+
+  public GeoParquetReader(URI uri) {
+    this.uri = uri;
+    this.initialize();
+  }
+
+  public void initialize() {
+    this.rowCount = 0;
+    this.configuration = getConfiguration();
+
+    try {
+      // List all the files that match the glob pattern
+      Path globPath = new Path(uri.getPath());
+      URI rootUri = getRootUri(uri);
+      FileSystem fileSystem = FileSystem.get(rootUri, configuration);
+      List<FileStatus> files = Arrays.asList(fileSystem.globStatus(globPath));
+
+      // Read the metadata of each file
+      for (FileStatus fileStatus : files) {
+
+        // Open the Parquet file
+        try (ParquetFileReader reader = ParquetFileReader
+            .open(HadoopInputFile.fromPath(fileStatus.getPath(), 
configuration))) {
+
+          // Read the number of rows in the Parquet file
+          long rowCount = reader.getRecordCount();
+
+          // Read the metadata of the Parquet file
+          ParquetMetadata parquetMetadata = reader.getFooter();
+          FileMetaData fileMetadata = parquetMetadata.getFileMetaData();
+
+          // Read the GeoParquet metadata of the Parquet file
+          String json = fileMetadata.getKeyValueMetaData().get("geo");
+          GeoParquetMetadata geoParquetMetadata = new ObjectMapper()
+              .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, 
false)
+              .readValue(json, GeoParquetMetadata.class);
+
+          // Increment the total number of rows
+          this.rowCount += rowCount;
+
+          // Get the geometry columns of the Parquet file
+          Set<String> geometryColumns = 
geoParquetMetadata.getColumns().keySet();
+
+          // Store the metadata of the Parquet file
+          this.metadata.put(fileStatus, new GeoParquetFileInfo(rowCount, 
parquetMetadata,
+              geoParquetMetadata, geometryColumns));
+        }
+      }
+
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  public Stream<FeatureGroup> read() throws IOException {
+    return StreamSupport.stream(
+        Spliterators.spliteratorUnknownSize(new GroupIterator(), 
Spliterator.ORDERED),
+        false);
+  }
+
+  private static Configuration getConfiguration() {
+    Configuration configuration = new Configuration();
+    configuration.set("fs.s3a.aws.credentials.provider",
+        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider");
+    configuration.setBoolean("fs.s3a.path.style.access", true);
+    configuration.setBoolean(AvroReadSupport.READ_INT96_AS_FIXED, true);
+    return configuration;
+  }
+
+  private static int getSrid(GeoParquetMetadata geoParquetMetadata, String 
name) {
+    JsonNode crsId = 
geoParquetMetadata.getColumns().get(name).getCrs().get("id");
+    int srid = switch (crsId.get("authority").asText()) {
+      case "OGC" -> switch (crsId.get("code").asText()) {
+          case "CRS84" -> 4326;
+          default -> 0;
+        };
+      case "EPSG" -> crsId.get("code").asInt();
+      default -> 0;
+    };
+    return srid;
+  }
+
+  private static URI getRootUri(URI uri) throws URISyntaxException {
+    String path = uri.getPath();
+    int index = path.indexOf("*");
+    if (index != -1) {
+      path = path.substring(0, path.lastIndexOf("/", index) + 1);
+    }
+    return new URI(
+        uri.getScheme(),
+        uri.getUserInfo(),
+        uri.getHost(),
+        uri.getPort(),
+        path,
+        null,
+        null);
+  }
+
+  private class GroupIterator implements Iterator<FeatureGroup> {
+
+    private Iterator<Map.Entry<FileStatus, GeoParquetFileInfo>> fileIterator;
+
+    private Map.Entry<FileStatus, GeoParquetFileInfo> currentFileStatus;
+    private Iterator<PageReadStore> pageReadStoreIterator;
+
+    private PageReadStore currentPageReadStore;
+
+    private Iterator<FeatureGroup> simpleGroupIterator;
+
+    private FeatureGroup currentSimpleFeatureGroup;
+
+    public GroupIterator() throws IOException {
+      this.fileIterator = metadata.entrySet().iterator();
+      this.currentFileStatus = fileIterator.next();
+      this.pageReadStoreIterator = new 
PageReadStoreIterator(currentFileStatus);
+      this.currentPageReadStore = pageReadStoreIterator.next();
+      this.simpleGroupIterator = new FeatureGroupIterator(
+          currentFileStatus.getValue(),
+          currentPageReadStore);
+      this.currentSimpleFeatureGroup = simpleGroupIterator.next();
+    }
+
+    @Override
+    public boolean hasNext() {
+      if (simpleGroupIterator.hasNext()) {
+        return true;
+      } else if (pageReadStoreIterator.hasNext()) {
+        currentPageReadStore = pageReadStoreIterator.next();
+        simpleGroupIterator = new FeatureGroupIterator(
+            currentFileStatus.getValue(),
+            currentPageReadStore);
+        return hasNext();
+      } else if (fileIterator.hasNext()) {
+        currentFileStatus = fileIterator.next();
+        try {
+          pageReadStoreIterator = new PageReadStoreIterator(currentFileStatus);
+          return hasNext();
+        } catch (IOException e) {
+          throw new RuntimeException(e);
+        }
+      } else {
+        return false;
+      }
+    }
+
+    @Override
+    public FeatureGroup next() {
+      currentSimpleFeatureGroup = simpleGroupIterator.next();
+      return currentSimpleFeatureGroup;
+    }
+  }
+
+  private class PageReadStoreIterator implements Iterator<PageReadStore> {

Review Comment:
   ## Inner class could be static
   
   PageReadStoreIterator could be made static, since the enclosing instance is 
used only in its constructor.
   
   [Show more 
details](https://github.com/apache/incubator-baremaps/security/code-scanning/1406)



##########
baremaps-geoparquet/src/main/java/org/apache/baremaps/geoparquet/GeoParquetReader.java:
##########
@@ -0,0 +1,289 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.baremaps.geoparquet;
+
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.util.*;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+import org.apache.baremaps.geoparquet.data.*;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.parquet.avro.AvroReadSupport;
+import org.apache.parquet.column.page.PageReadStore;
+import org.apache.parquet.hadoop.ParquetFileReader;
+import org.apache.parquet.hadoop.metadata.FileMetaData;
+import org.apache.parquet.hadoop.metadata.ParquetMetadata;
+import org.apache.parquet.hadoop.util.HadoopInputFile;
+import org.apache.parquet.io.ColumnIOFactory;
+import org.apache.parquet.io.RecordReader;
+import org.apache.parquet.schema.MessageType;
+import org.locationtech.jts.io.WKBReader;
+
+public class GeoParquetReader {
+
+  private final URI uri;
+
+  private Configuration configuration;
+
+  private WKBReader wkbReader = new WKBReader();
+
+  private Map<FileStatus, GeoParquetFileInfo> metadata = new LinkedHashMap<>();
+
+  private long rowCount;
+
+  public GeoParquetReader(URI uri) {
+    this.uri = uri;
+    this.initialize();
+  }
+
+  public void initialize() {
+    this.rowCount = 0;
+    this.configuration = getConfiguration();
+
+    try {
+      // List all the files that match the glob pattern
+      Path globPath = new Path(uri.getPath());
+      URI rootUri = getRootUri(uri);
+      FileSystem fileSystem = FileSystem.get(rootUri, configuration);
+      List<FileStatus> files = Arrays.asList(fileSystem.globStatus(globPath));
+
+      // Read the metadata of each file
+      for (FileStatus fileStatus : files) {
+
+        // Open the Parquet file
+        try (ParquetFileReader reader = ParquetFileReader
+            .open(HadoopInputFile.fromPath(fileStatus.getPath(), 
configuration))) {
+
+          // Read the number of rows in the Parquet file
+          long rowCount = reader.getRecordCount();

Review Comment:
   ## Possible confusion of local and field
   
   Potentially confusing name: method [initialize](1) also refers to field 
[rowCount](2) (as this.rowCount).
   
   [Show more 
details](https://github.com/apache/incubator-baremaps/security/code-scanning/1407)



##########
baremaps-geoparquet/src/test/java/org/apache/baremaps/geoparquet/GeoParquetReaderTest.java:
##########
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.baremaps.geoparquet;
+
+import java.io.IOException;
+import java.net.URI;
+import java.util.List;
+import org.apache.baremaps.geoparquet.data.FeatureGroup;
+import org.apache.baremaps.testing.TestFiles;
+import org.junit.jupiter.api.Test;
+
+class GeoParquetReaderTest {

Review Comment:
   ## Unused classes and interfaces
   
   Unused class: GeoParquetReaderTest is not referenced within this codebase. 
If not used as an external API it should be removed.
   
   [Show more 
details](https://github.com/apache/incubator-baremaps/security/code-scanning/1409)



##########
baremaps-geoparquet/src/main/java/org/apache/baremaps/geoparquet/data/FeatureGroup.java:
##########
@@ -0,0 +1,303 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.baremaps.geoparquet.data;
+
+import java.util.ArrayList;
+import java.util.List;
+import org.apache.parquet.io.api.Binary;
+import org.apache.parquet.io.api.RecordConsumer;
+import org.apache.parquet.schema.GroupType;
+import org.apache.parquet.schema.Type;
+
+public class FeatureGroup {
+
+  private final GeoParquetFileInfo fileInfo;
+  private final GroupType schema;
+  private final List<Object>[] data;
+
+  @SuppressWarnings("unchecked")
+  public FeatureGroup(GeoParquetFileInfo fileInfo, GroupType schema) {
+    this.fileInfo = fileInfo;
+    this.schema = schema;
+    this.data = new List[schema.getFields().size()];
+    for (int i = 0; i < schema.getFieldCount(); i++) {
+      this.data[i] = new ArrayList<>();
+    }
+  }
+
+  @Override
+  public String toString() {
+    return toString("");
+  }
+
+  private StringBuilder appendToString(StringBuilder builder, String indent) {
+    int i = 0;
+    for (Type field : schema.getFields()) {
+      String name = field.getName();
+      List<Object> values = data[i];
+      ++i;
+      if (values != null && !values.isEmpty()) {
+        for (Object value : values) {
+          builder.append(indent).append(name);
+          if (value == null) {
+            builder.append(": NULL\n");
+          } else if (value instanceof FeatureGroup) {
+            builder.append('\n');
+            ((FeatureGroup) value).appendToString(builder, indent + "  ");
+          } else {
+            builder.append(": ").append(value.toString()).append('\n');
+          }
+        }
+      }
+    }
+    return builder;
+  }
+
+  public String toString(String indent) {
+    StringBuilder builder = new StringBuilder();
+    appendToString(builder, indent);
+    return builder.toString();
+  }
+
+  public FeatureGroup addGroup(int fieldIndex) {
+    FeatureGroup g = new FeatureGroup(fileInfo, 
schema.getType(fieldIndex).asGroupType());
+    add(fieldIndex, g);
+    return g;
+  }
+
+  public FeatureGroup getGroup(int fieldIndex, int index) {
+    return (FeatureGroup) getValue(fieldIndex, index);
+  }
+
+  private Object getValue(int fieldIndex, int index) {
+    List<Object> list;
+    try {
+      list = data[fieldIndex];
+    } catch (IndexOutOfBoundsException e) {
+      throw new RuntimeException("not found " + fieldIndex + "(" + 
schema.getFieldName(fieldIndex)
+          + ") in group:\n" + this);
+    }
+    try {
+      return list.get(index);
+    } catch (IndexOutOfBoundsException e) {
+      throw new RuntimeException("not found " + fieldIndex + "(" + 
schema.getFieldName(fieldIndex)
+          + ") element number " + index + " in group:\n" + this);
+    }
+  }
+
+  private void add(int fieldIndex, Primitive value) {
+    Type type = schema.getType(fieldIndex);
+    List<Object> list = data[fieldIndex];
+    if (!type.isRepetition(Type.Repetition.REPEATED)
+        && !list.isEmpty()) {
+      throw new IllegalStateException("field " + fieldIndex + " (" + 
type.getName()
+          + ") can not have more than one value: " + list);
+    }
+    list.add(value);
+  }
+
+  public int getFieldRepetitionCount(int fieldIndex) {
+    List<Object> list = data[fieldIndex];
+    return list == null ? 0 : list.size();
+  }
+
+  public String getValueToString(int fieldIndex, int index) {
+    return String.valueOf(getValue(fieldIndex, index));
+  }
+
+  public String getString(int fieldIndex, int index) {
+    return ((BinaryValue) getValue(fieldIndex, index)).getString();
+  }
+
+  public int getInteger(int fieldIndex, int index) {
+    return ((IntegerValue) getValue(fieldIndex, index)).getInteger();
+  }
+
+  public long getLong(int fieldIndex, int index) {
+    return ((LongValue) getValue(fieldIndex, index)).getLong();
+  }
+
+  public double getDouble(int fieldIndex, int index) {
+    return ((DoubleValue) getValue(fieldIndex, index)).getDouble();
+  }
+
+  public float getFloat(int fieldIndex, int index) {
+    return ((FloatValue) getValue(fieldIndex, index)).getFloat();
+  }
+
+  public boolean getBoolean(int fieldIndex, int index) {
+    return ((BooleanValue) getValue(fieldIndex, index)).getBoolean();
+  }
+
+  public Binary getBinary(int fieldIndex, int index) {
+    return ((BinaryValue) getValue(fieldIndex, index)).getBinary();
+  }
+
+  public NanoTime getTimeNanos(int fieldIndex, int index) {
+    return NanoTime.fromInt96((Int96Value) getValue(fieldIndex, index));
+  }
+
+  public Binary getInt96(int fieldIndex, int index) {
+    return ((Int96Value) getValue(fieldIndex, index)).getInt96();
+  }
+
+  public void add(int fieldIndex, int value) {
+    add(fieldIndex, new IntegerValue(value));
+  }
+
+  public void add(int fieldIndex, long value) {
+    add(fieldIndex, new LongValue(value));
+  }
+
+  public void add(int fieldIndex, String value) {
+    add(fieldIndex, new BinaryValue(Binary.fromString(value)));
+  }
+
+  public void add(int fieldIndex, NanoTime value) {

Review Comment:
   ## Confusing overloading of methods
   
   Method FeatureGroup.add(..) could be confused with overloaded method 
[add](1), since dispatch depends on static types.
   
   [Show more 
details](https://github.com/apache/incubator-baremaps/security/code-scanning/1408)



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to