This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new b9589b7e66 [SEDONA-731] add osm nodes parser (#1920)
b9589b7e66 is described below
commit b9589b7e66cc84ebe32af02255fbf325789883ed
Author: Paweł Tokaj <[email protected]>
AuthorDate: Mon Apr 21 18:29:37 2025 +0200
[SEDONA-731] add osm nodes parser (#1920)
* SEDONA-731 add osm nodes parser
* SEDONA-731 update docs
* SEDONA-731 update docs
---
docs/tutorial/sql.md | 2 +-
.../osmpbf/extractors/DenseNodeExtractor.java | 2 +
...{DenseNodeExtractor.java => NodeExtractor.java} | 67 +++++++--------------
.../datasources/osmpbf/iterators/BlobIterator.java | 15 ++++-
spark/common/src/test/resources/osmpbf/dense.pbf | Bin 0 -> 282 bytes
spark/common/src/test/resources/osmpbf/nodes.pbf | Bin 0 -> 242 bytes
.../org/apache/sedona/sql/OsmReaderTest.scala | 38 ++++++++++++
7 files changed, 77 insertions(+), 47 deletions(-)
diff --git a/docs/tutorial/sql.md b/docs/tutorial/sql.md
index b835084757..71e8e7d8a1 100644
--- a/docs/tutorial/sql.md
+++ b/docs/tutorial/sql.md
@@ -480,7 +480,7 @@ Since v1.7.1, Sedona supports loading OSM PBF file format
as a DataFrame.
```
OSM PBF files can contain nodes, ways, and relations. Currently Sedona supports
-DenseNodes, Ways and Relations. When you load the data you get a DataFrame
with the following schema.
+Nodes, DenseNodes, Ways and Relations. When you load the data you get a
DataFrame with the following schema.
```
root
diff --git
a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java
b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java
index f850682be8..0b9ca93748 100644
---
a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java
+++
b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java
@@ -81,6 +81,8 @@ public class DenseNodeExtractor {
keyIndex = keyIndex + 2;
}
+ keyIndex = keyIndex + 1;
+
return tags;
}
}
diff --git
a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java
b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/NodeExtractor.java
similarity index 52%
copy from
spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java
copy to
spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/NodeExtractor.java
index f850682be8..39e98537cd 100644
---
a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java
+++
b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/NodeExtractor.java
@@ -20,29 +20,18 @@ package org.apache.sedona.sql.datasources.osmpbf.extractors;
import java.util.HashMap;
import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat;
+import org.apache.sedona.sql.datasources.osmpbf.features.TagsResolver;
import org.apache.sedona.sql.datasources.osmpbf.model.OsmNode;
-public class DenseNodeExtractor {
- long latOffset;
- long lonOffset;
- long granularity;
- long firstId;
- long firstLat;
- long firstLon;
- Integer keyIndex;
+public class NodeExtractor {
- Osmformat.DenseNodes nodes;
+ Osmformat.PrimitiveGroup primitiveGroup;
+ Osmformat.PrimitiveBlock primitiveBlock;
- public DenseNodeExtractor(
- Osmformat.DenseNodes nodes, long latOffset, long lonOffset, long
granularity) {
- this.firstId = 0;
- this.firstLat = 0;
- this.firstLon = 0;
- this.latOffset = latOffset;
- this.lonOffset = lonOffset;
- this.granularity = granularity;
- this.nodes = nodes;
- this.keyIndex = 0;
+ public NodeExtractor(
+ Osmformat.PrimitiveGroup primitiveGroup, Osmformat.PrimitiveBlock
primitiveBlock) {
+ this.primitiveGroup = primitiveGroup;
+ this.primitiveBlock = primitiveBlock;
}
public OsmNode extract(int idx, Osmformat.StringTable stringTable) {
@@ -50,37 +39,25 @@ public class DenseNodeExtractor {
}
private OsmNode parse(int idx, Osmformat.StringTable stringTable) {
- long id = nodes.getId(idx) + firstId;
- long latitude = nodes.getLat(idx) + firstLat;
- long longitude = nodes.getLon(idx) + firstLon;
+ Osmformat.Node node = primitiveGroup.getNodes(idx);
+ long id = node.getId();
+ long latitude = node.getLat();
+ long longitude = node.getLon();
+
+ long latOffset = primitiveBlock.getLatOffset();
+ long lonOffset = primitiveBlock.getLonOffset();
+ long granularity = primitiveBlock.getGranularity();
+
+ // https://wiki.openstreetmap.org/wiki/PBF_Format
+ // latitude = .000000001 * (lat_offset + (granularity * lat))
+ // longitude = .000000001 * (lon_offset + (granularity * lon))
float lat = (float) (.000000001 * (latOffset + (latitude * granularity)));
float lon = (float) (.000000001 * (lonOffset + (longitude * granularity)));
- firstId = id;
- firstLat = latitude;
- firstLon = longitude;
-
- HashMap<String, String> tags = parseTags(stringTable);
+ HashMap<String, String> tags =
+ TagsResolver.resolveTags(node.getKeysCount(), node::getKeys,
node::getVals, stringTable);
return new OsmNode(id, lat, lon, tags);
}
-
- HashMap<String, String> parseTags(Osmformat.StringTable stringTable) {
- HashMap<String, String> tags = new HashMap<>();
-
- while (nodes.getKeysVals(keyIndex) != 0) {
- int key = nodes.getKeysVals(keyIndex);
- int value = nodes.getKeysVals(keyIndex + 1);
-
- String keyString = stringTable.getS(key).toStringUtf8();
- String valueString = stringTable.getS(value).toStringUtf8();
-
- tags.put(keyString, valueString);
-
- keyIndex = keyIndex + 2;
- }
-
- return tags;
- }
}
diff --git
a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java
b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java
index 89a9005771..ff7a2f88d2 100644
---
a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java
+++
b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java
@@ -27,6 +27,7 @@ import
org.apache.sedona.sql.datasources.osmpbf.DenseNodeIterator;
import org.apache.sedona.sql.datasources.osmpbf.build.Fileformat.Blob;
import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat;
import org.apache.sedona.sql.datasources.osmpbf.extractors.DenseNodeExtractor;
+import org.apache.sedona.sql.datasources.osmpbf.extractors.NodeExtractor;
import org.apache.sedona.sql.datasources.osmpbf.extractors.RelationExtractor;
import org.apache.sedona.sql.datasources.osmpbf.extractors.WaysExtractor;
import org.apache.sedona.sql.datasources.osmpbf.model.OSMEntity;
@@ -68,7 +69,7 @@ public class BlobIterator implements Iterator<OSMEntity> {
}
if (!currentPrimitiveGroup.getNodesList().isEmpty()) {
- return null;
+ return extractNodePrimitiveGroup();
}
if (!currentPrimitiveGroup.getWaysList().isEmpty()) {
@@ -86,6 +87,18 @@ public class BlobIterator implements Iterator<OSMEntity> {
return null;
}
+ private OSMEntity extractNodePrimitiveGroup() {
+ osmEntityIdx += 1;
+ if (currentPrimitiveGroup.getNodesList().size() == osmEntityIdx) {
+ nextEntity();
+ }
+
+ Osmformat.StringTable stringTable = primitiveBlock.getStringtable();
+
+ return new NodeExtractor(currentPrimitiveGroup, primitiveBlock)
+ .extract(osmEntityIdx, stringTable);
+ }
+
public OSMEntity extractDenseNodePrimitiveGroup() {
if (denseNodesIterator == null) {
denseNodesIterator =
diff --git a/spark/common/src/test/resources/osmpbf/dense.pbf
b/spark/common/src/test/resources/osmpbf/dense.pbf
new file mode 100644
index 0000000000..24082953a9
Binary files /dev/null and b/spark/common/src/test/resources/osmpbf/dense.pbf
differ
diff --git a/spark/common/src/test/resources/osmpbf/nodes.pbf
b/spark/common/src/test/resources/osmpbf/nodes.pbf
new file mode 100644
index 0000000000..c3b3a40e4c
Binary files /dev/null and b/spark/common/src/test/resources/osmpbf/nodes.pbf
differ
diff --git
a/spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala
b/spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala
index 1764e0dfaf..d3f9a52c1c 100644
--- a/spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala
+++ b/spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala
@@ -25,8 +25,12 @@ import org.testcontainers.containers.MinIOContainer
import java.io.FileInputStream
+case class Node(id: Long, latitude: Double, longitude: Double, tags:
Map[String, String])
+
class OsmReaderTest extends TestBaseScala with Matchers {
val monacoPath: String = resourceFolder + "osmpbf/monaco-latest.osm.pbf"
+ val densePath: String = resourceFolder + "osmpbf/dense.pbf"
+ val nodesPath: String = resourceFolder + "osmpbf/nodes.pbf"
import sparkSession.implicits._
@@ -44,6 +48,40 @@ class OsmReaderTest extends TestBaseScala with Matchers {
assert(cnt > 0)
}
+ it("should parse normal nodes") {
+ sparkSession.read
+ .format("osmpbf")
+ .load(nodesPath)
+ .select("id", "location.*", "tags")
+ .selectExpr(
+ "id",
+ "ROUND(latitude, 2) AS latitude",
+ "ROUND(longitude, 2) AS longitude",
+ "tags")
+ .as[Node]
+ .collect() should contain theSameElementsAs Array(
+ Node(1002, 48.86, 2.35, Map("amenity" -> "cafe", "name" -> "Cafe de
Paris")),
+ Node(1003, 30.12, 22.23, Map("amenity" -> "bakery", "name" ->
"Delicious Pastries")),
+ Node(1001, 52.52, 13.40, Map("amenity" -> "restaurant", "name" ->
"Curry 36")))
+ }
+
+ it("should parse dense nodes") {
+ sparkSession.read
+ .format("osmpbf")
+ .load(densePath)
+ .select("id", "location.*", "tags")
+ .selectExpr(
+ "id",
+ "ROUND(latitude, 2) AS latitude",
+ "ROUND(longitude, 2) AS longitude",
+ "tags")
+ .as[Node]
+ .collect() should contain theSameElementsAs Array(
+ Node(1002, 48.86, 2.35, Map("amenity" -> "cafe", "name" -> "Cafe de
Paris")),
+ Node(1003, 30.12, 22.23, Map("amenity" -> "bakery", "name" ->
"Delicious Pastries")),
+ Node(1001, 52.52, 13.40, Map("amenity" -> "restaurant", "name" ->
"Curry 36")))
+ }
+
it("should be able to read from osm file on s3") {
val container = new MinIOContainer("minio/minio:latest")