This is an automated email from the ASF dual-hosted git repository.
bchapuis pushed a commit to branch 849-benchmarking
in repository https://gitbox.apache.org/repos/asf/incubator-baremaps.git
The following commit(s) were added to refs/heads/849-benchmarking by this push:
new 8e6ec161 Add sequential spliterator
8e6ec161 is described below
commit 8e6ec161178e1d1fd6f5b6de29b3e4209fd812ec
Author: Bertil Chapuis <[email protected]>
AuthorDate: Wed Jun 12 17:57:05 2024 +0200
Add sequential spliterator
---
baremaps-benchmarking/pom.xml | 112 ++++++++++-----------
.../benchmarking/GeoParquetReaderBenchmark.java | 78 +++++++-------
.../baremaps/benchmarking/SmallFileBenchmark.java | 72 +++++++------
.../GeoParquetSequentialSpliterator.java | 94 +++++++++++++++++
pom.xml | 2 +-
5 files changed, 234 insertions(+), 124 deletions(-)
diff --git a/baremaps-benchmarking/pom.xml b/baremaps-benchmarking/pom.xml
index 251b5ea4..ae6708b1 100644
--- a/baremaps-benchmarking/pom.xml
+++ b/baremaps-benchmarking/pom.xml
@@ -1,61 +1,61 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.baremaps</groupId>
- <artifactId>baremaps</artifactId>
- <version>0.7.4-SNAPSHOT</version>
- </parent>
- <artifactId>baremaps-benchmarking</artifactId>
- <properties>
- <jmh.version>1.37</jmh.version>
- <maven.deploy.skip>true</maven.deploy.skip>
- </properties>
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.baremaps</groupId>
+ <artifactId>baremaps</artifactId>
+ <version>0.7.4-SNAPSHOT</version>
+ </parent>
+ <artifactId>baremaps-benchmarking</artifactId>
+ <properties>
+ <jmh.version>1.37</jmh.version>
+ <maven.deploy.skip>true</maven.deploy.skip>
+ </properties>
- <dependencies>
- <dependency>
- <groupId>org.apache.baremaps</groupId>
- <artifactId>baremaps-geoparquet</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.baremaps</groupId>
- <artifactId>baremaps-testing</artifactId>
- </dependency>
- <dependency>
- <groupId>org.openjdk.jmh</groupId>
- <artifactId>jmh-core</artifactId>
- <version>${jmh.version}</version>
- </dependency>
- <dependency>
- <groupId>org.openjdk.jmh</groupId>
- <artifactId>jmh-generator-annprocess</artifactId>
- <version>${jmh.version}</version>
- </dependency>
- </dependencies>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.baremaps</groupId>
+ <artifactId>baremaps-geoparquet</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.baremaps</groupId>
+ <artifactId>baremaps-testing</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.openjdk.jmh</groupId>
+ <artifactId>jmh-core</artifactId>
+ <version>${jmh.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.openjdk.jmh</groupId>
+ <artifactId>jmh-generator-annprocess</artifactId>
+ <version>${jmh.version}</version>
+ </dependency>
+ </dependencies>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <version>3.6.0</version>
- <executions>
- <execution>
- <goals>
- <goal>shade</goal>
- </goals>
- <phase>package</phase>
- <configuration>
- <finalName>benchmarks</finalName>
- <transformers>
- <transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
- <mainClass>org.openjdk.jmh.Main</mainClass>
- </transformer>
- </transformers>
- </configuration>
- </execution>
- </executions>
- </plugin>
- </plugins>
- </build>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>3.6.0</version>
+ <executions>
+ <execution>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <phase>package</phase>
+ <configuration>
+ <finalName>benchmarks</finalName>
+ <transformers>
+ <transformer
implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
+ <mainClass>org.openjdk.jmh.Main</mainClass>
+ </transformer>
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
</project>
diff --git
a/baremaps-benchmarking/src/main/java/org/apache/baremaps/benchmarking/GeoParquetReaderBenchmark.java
b/baremaps-benchmarking/src/main/java/org/apache/baremaps/benchmarking/GeoParquetReaderBenchmark.java
index cd0f15b9..0cb83166 100644
---
a/baremaps-benchmarking/src/main/java/org/apache/baremaps/benchmarking/GeoParquetReaderBenchmark.java
+++
b/baremaps-benchmarking/src/main/java/org/apache/baremaps/benchmarking/GeoParquetReaderBenchmark.java
@@ -40,49 +40,49 @@ import
software.amazon.awssdk.services.s3.model.ListObjectsV2Request;
@Measurement(iterations = 1)
public class GeoParquetReaderBenchmark {
- private static Path directory = Path.of("baremaps-benchmarks/data");
+ private static Path directory = Path.of("baremaps-benchmarks/data");
- public static void main(String[] args) throws RunnerException {
- Options opt = new OptionsBuilder()
- .include(GeoParquetReaderBenchmark.class.getSimpleName())
- .forks(1)
- .build();
- new Runner(opt).run();
- }
+ public static void main(String[] args) throws RunnerException {
+ Options opt = new OptionsBuilder()
+ .include(GeoParquetReaderBenchmark.class.getSimpleName())
+ .forks(1)
+ .build();
+ new Runner(opt).run();
+ }
- @Setup
- public void setup() throws IOException {
- if (!Files.exists(directory)) {
- try (var client = S3Client.builder()
- .region(Region.US_EAST_1)
- .credentialsProvider(new AnonymousAWSCredentialsProvider())
- .build()) {
- var listRequest = ListObjectsV2Request.builder()
- .bucket("overturemaps-us-west-2")
-
.prefix("release/2024-03-12-alpha.0/theme=admins/type=locality_area/")
- .build();
- var objects = client.listObjectsV2(listRequest).contents();
- for (var object : objects) {
- var key = object.key();
- var name = key.substring(key.lastIndexOf("/") + 1);
- var file = directory.resolve(name);
- Files.createDirectories(file.getParent());
- if (!Files.exists(file)) {
- var getRequest = GetObjectRequest.builder()
- .bucket("overturemaps-us-west-2")
- .key(key)
- .build();
- client.getObject(getRequest, file);
- }
- }
- }
+ @Setup
+ public void setup() throws IOException {
+ if (!Files.exists(directory)) {
+ try (var client = S3Client.builder()
+ .region(Region.US_EAST_1)
+ .credentialsProvider(new AnonymousAWSCredentialsProvider())
+ .build()) {
+ var listRequest = ListObjectsV2Request.builder()
+ .bucket("overturemaps-us-west-2")
+
.prefix("release/2024-03-12-alpha.0/theme=admins/type=locality_area/")
+ .build();
+ var objects = client.listObjectsV2(listRequest).contents();
+ for (var object : objects) {
+ var key = object.key();
+ var name = key.substring(key.lastIndexOf("/") + 1);
+ var file = directory.resolve(name);
+ Files.createDirectories(file.getParent());
+ if (!Files.exists(file)) {
+ var getRequest = GetObjectRequest.builder()
+ .bucket("overturemaps-us-west-2")
+ .key(key)
+ .build();
+ client.getObject(getRequest, file);
+ }
}
+ }
}
+ }
- @Benchmark
- public void read() {
- GeoParquetReader reader = new GeoParquetReader(directory.toUri());
- reader.readParallel().count();
- }
+ @Benchmark
+ public void read() {
+ GeoParquetReader reader = new GeoParquetReader(directory.toUri());
+ reader.readParallel().count();
+ }
}
diff --git
a/baremaps-benchmarking/src/main/java/org/apache/baremaps/benchmarking/SmallFileBenchmark.java
b/baremaps-benchmarking/src/main/java/org/apache/baremaps/benchmarking/SmallFileBenchmark.java
index 91a3ac46..a9038d5f 100644
---
a/baremaps-benchmarking/src/main/java/org/apache/baremaps/benchmarking/SmallFileBenchmark.java
+++
b/baremaps-benchmarking/src/main/java/org/apache/baremaps/benchmarking/SmallFileBenchmark.java
@@ -1,5 +1,26 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
package org.apache.baremaps.benchmarking;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.concurrent.TimeUnit;
import org.apache.baremaps.geoparquet.GeoParquetReader;
import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.runner.Runner;
@@ -7,11 +28,6 @@ import org.openjdk.jmh.runner.RunnerException;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.concurrent.TimeUnit;
-
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
@State(Scope.Benchmark)
@@ -19,31 +35,31 @@ import java.util.concurrent.TimeUnit;
@Measurement(iterations = 1)
public class SmallFileBenchmark {
- private Path source =
Path.of("baremaps-testing/data/samples/example.parquet").toAbsolutePath();
- private Path directory =
Path.of("baremaps-benchmarking/small").toAbsolutePath();
+ private Path source =
Path.of("baremaps-testing/data/samples/example.parquet").toAbsolutePath();
+ private Path directory =
Path.of("baremaps-benchmarking/small").toAbsolutePath();
- public static void main(String[] args) throws RunnerException, IOException
{
- Options opt = new OptionsBuilder()
- .include(SmallFileBenchmark.class.getSimpleName())
- .forks(1)
- .build();
- new Runner(opt).run();
- }
+ public static void main(String[] args) throws RunnerException, IOException {
+ Options opt = new OptionsBuilder()
+ .include(SmallFileBenchmark.class.getSimpleName())
+ .forks(1)
+ .build();
+ new Runner(opt).run();
+ }
- @Setup
- public void setup() throws IOException {
- if (!Files.exists(directory)) {
- for (int i = 0; i < 1000; i++) {
- Path target = directory.resolve(i + ".parquet");
- Files.createDirectories(target.getParent());
- Files.copy(source, target);
- }
- }
+ @Setup
+ public void setup() throws IOException {
+ if (!Files.exists(directory)) {
+ for (int i = 0; i < 1000; i++) {
+ Path target = directory.resolve(i + ".parquet");
+ Files.createDirectories(target.getParent());
+ Files.copy(source, target);
+ }
}
+ }
- @Benchmark
- public void read() {
- GeoParquetReader reader = new GeoParquetReader(directory.toUri());
- reader.readParallel().count();
- }
+ @Benchmark
+ public void read() {
+ GeoParquetReader reader = new GeoParquetReader(directory.toUri());
+ reader.readParallel().count();
+ }
}
diff --git
a/baremaps-geoparquet/src/main/java/org/apache/baremaps/geoparquet/GeoParquetSequentialSpliterator.java
b/baremaps-geoparquet/src/main/java/org/apache/baremaps/geoparquet/GeoParquetSequentialSpliterator.java
new file mode 100644
index 00000000..02af24ca
--- /dev/null
+++
b/baremaps-geoparquet/src/main/java/org/apache/baremaps/geoparquet/GeoParquetSequentialSpliterator.java
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.baremaps.geoparquet;
+
+import java.io.IOException;
+import java.util.*;
+import java.util.function.Consumer;
+import org.apache.baremaps.geoparquet.data.GeoParquetGroup;
+import org.apache.baremaps.geoparquet.hadoop.GeoParquetGroupReadSupport;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.parquet.hadoop.ParquetReader;
+
+/**
+ * A {@link Spliterator} that reads {@link GeoParquetGroup}s one after the other from a
+ * {@link ParquetReader} opened on the path of a {@link FileStatus}.
+ *
+ * <p>
+ * The number of groups is not known in advance, so the spliterator is neither {@code SIZED} nor
+ * {@code SUBSIZED}. Limited parallelism is offered by {@link #trySplit()}, which reads a bounded
+ * batch of groups and hands it over as a sized spliterator.
+ *
+ * <p>
+ * The underlying reader is closed as soon as it is exhausted or fails; afterwards the spliterator
+ * behaves as an empty one.
+ */
+public class GeoParquetSequentialSpliterator implements Spliterator<GeoParquetGroup> {
+
+  /** Maximum number of groups handed over by a single call to {@link #trySplit()}. */
+  private static final int BATCH_SIZE = 1_000;
+
+  private final ParquetReader<GeoParquetGroup> reader;
+
+  /** Becomes {@code true} once the reader has been closed; no read is attempted afterwards. */
+  private boolean closed = false;
+
+  /**
+   * Opens a reader on the path of the given file status.
+   *
+   * @param fileStatus the status whose path is read
+   * @param configuration the Hadoop configuration passed to the reader
+   * @throws GeoParquetException if the reader cannot be created
+   */
+  GeoParquetSequentialSpliterator(FileStatus fileStatus, Configuration configuration) {
+    try {
+      this.reader = ParquetReader
+          .builder(new GeoParquetGroupReadSupport(), fileStatus.getPath())
+          .withConf(configuration)
+          .build();
+    } catch (IOException e) {
+      throw new GeoParquetException("Failed to create reader for " + fileStatus, e);
+    }
+  }
+
+  /**
+   * Reads the next group and passes it to the action.
+   *
+   * @param action the consumer of the next group
+   * @return {@code true} if a group was consumed, {@code false} if the reader is exhausted or has
+   *         already been closed
+   * @throws NullPointerException if the action is null
+   * @throws GeoParquetException if reading or closing the reader fails
+   */
+  @Override
+  public boolean tryAdvance(Consumer<? super GeoParquetGroup> action) {
+    // Fail before consuming an element, as required by the Spliterator contract
+    Objects.requireNonNull(action, "action");
+
+    // Never read from a reader that has already been closed
+    if (closed) {
+      return false;
+    }
+
+    try {
+      // Read the next group
+      GeoParquetGroup group = reader.read();
+
+      // A null group marks the end of the data: release the reader
+      if (group == null) {
+        closed = true;
+        reader.close();
+        return false;
+      }
+
+      // Accept the group
+      action.accept(group);
+
+      // Tell the caller that there may be more groups to read
+      return true;
+
+    } catch (IOException e) {
+      closeAfterFailure(e);
+      throw new GeoParquetException(
+          "IOException caught while trying to read the next file.", e);
+    }
+  }
+
+  /**
+   * Closes the reader after a failure, recording a secondary failure as suppressed.
+   *
+   * @param cause the exception that triggered the close
+   */
+  private void closeAfterFailure(IOException cause) {
+    if (closed) {
+      // The failure was raised by close() itself; do not close the reader a second time
+      return;
+    }
+    closed = true;
+    try {
+      reader.close();
+    } catch (IOException suppressed) {
+      // Keep the original exception as the primary cause, but do not lose this one
+      cause.addSuppressed(suppressed);
+    }
+  }
+
+  /**
+   * Reads up to {@value #BATCH_SIZE} groups and returns them as a sized spliterator.
+   *
+   * @return a spliterator over the groups that were read, or {@code null} if none are left
+   * @throws GeoParquetException if reading or closing the reader fails
+   */
+  @Override
+  public Spliterator<GeoParquetGroup> trySplit() {
+    List<GeoParquetGroup> batch = new ArrayList<>(BATCH_SIZE);
+    while (batch.size() < BATCH_SIZE && tryAdvance(batch::add)) {
+      // tryAdvance appends the group to the batch; nothing else to do
+    }
+    if (batch.isEmpty()) {
+      return null;
+    }
+    // The batch is fully materialized, so its size and the sizes of its splits are exact
+    return Spliterators.spliterator(batch, characteristics() | SIZED | SUBSIZED);
+  }
+
+  /**
+   * Returns {@link Long#MAX_VALUE} because the number of remaining groups is unknown.
+   */
+  @Override
+  public long estimateSize() {
+    return Long.MAX_VALUE;
+  }
+
+  /**
+   * Returns the characteristics of this spliterator.
+   *
+   * <p>
+   * {@code SUBSIZED} is deliberately not reported: it is only consistent together with
+   * {@code SIZED}, and the size of this spliterator is unknown.
+   */
+  @Override
+  public int characteristics() {
+    // The spliterator is not ordered, sorted or sized
+    // NOTE(review): DISTINCT assumes that no two groups are equal - confirm against
+    // GeoParquetGroup#equals
+    return NONNULL | DISTINCT | IMMUTABLE;
+  }
+}
diff --git a/pom.xml b/pom.xml
index 4f7233f7..3642444b 100644
--- a/pom.xml
+++ b/pom.xml
@@ -49,6 +49,7 @@ limitations under the License.
</developers>
<modules>
+ <module>baremaps-benchmarking</module>
<module>baremaps-cli</module>
<module>baremaps-core</module>
<module>baremaps-data</module>
@@ -58,7 +59,6 @@ limitations under the License.
<module>baremaps-pmtiles</module>
<module>baremaps-server</module>
<module>baremaps-testing</module>
- <module>baremaps-benchmarking</module>
</modules>
<scm>