This is an automated email from the ASF dual-hosted git repository. jiayu pushed a commit to branch prepare-1.7.2 in repository https://gitbox.apache.org/repos/asf/sedona.git
commit 13f6b2e87f267a6ccad68b5b2450fc90254422d4 Author: Paweł Tokaj <[email protected]> AuthorDate: Mon Apr 21 18:29:37 2025 +0200 [SEDONA-731] add osm nodes parser (#1920) * SEDONA-731 add osm nodes parser * SEDONA-731 update docs * SEDONA-731 update docs --- docs/tutorial/sql.md | 2 +- .../osmpbf/extractors/DenseNodeExtractor.java | 2 + ...{DenseNodeExtractor.java => NodeExtractor.java} | 67 +++++++-------------- .../datasources/osmpbf/iterators/BlobIterator.java | 15 ++++- spark/common/src/test/resources/osmpbf/dense.pbf | Bin 0 -> 282 bytes spark/common/src/test/resources/osmpbf/nodes.pbf | Bin 0 -> 242 bytes .../org/apache/sedona/sql/OsmReaderTest.scala | 38 ++++++++++++ 7 files changed, 77 insertions(+), 47 deletions(-) diff --git a/docs/tutorial/sql.md b/docs/tutorial/sql.md index 821ea37d9b..eb7fbe3082 100644 --- a/docs/tutorial/sql.md +++ b/docs/tutorial/sql.md @@ -480,7 +480,7 @@ Since v1.7.1, Sedona supports loading OSM PBF file format as a DataFrame. ``` OSM PBF files can contain nodes, ways, and relations. Currently Sedona support -DenseNodes, Ways and Relations. When you load the data you get a DataFrame with the following schema. +Nodes, DenseNodes, Ways and Relations. When you load the data you get a DataFrame with the following schema. ``` root diff --git a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java index f850682be8..0b9ca93748 100644 --- a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java +++ b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java @@ -81,6 +81,8 @@ public class DenseNodeExtractor { keyIndex = keyIndex + 2; } + keyIndex = keyIndex + 1; + return tags; } } diff --git a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/NodeExtractor.java similarity index 52% copy from spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java copy to spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/NodeExtractor.java index f850682be8..39e98537cd 100644 --- a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/DenseNodeExtractor.java +++ b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/extractors/NodeExtractor.java @@ -20,29 +20,18 @@ package org.apache.sedona.sql.datasources.osmpbf.extractors; import java.util.HashMap; import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat; +import org.apache.sedona.sql.datasources.osmpbf.features.TagsResolver; import org.apache.sedona.sql.datasources.osmpbf.model.OsmNode; -public class DenseNodeExtractor { - long latOffset; - long lonOffset; - long granularity; - long firstId; - long firstLat; - long firstLon; - Integer keyIndex; +public class NodeExtractor { - Osmformat.DenseNodes nodes; + Osmformat.PrimitiveGroup primitiveGroup; + Osmformat.PrimitiveBlock primitiveBlock; - public DenseNodeExtractor( - Osmformat.DenseNodes nodes, long latOffset, long lonOffset, long granularity) { - this.firstId = 0; - this.firstLat = 0; - this.firstLon = 0; - this.latOffset = latOffset; - this.lonOffset = lonOffset; - this.granularity = granularity; - this.nodes = nodes; - this.keyIndex = 0; + public NodeExtractor( + Osmformat.PrimitiveGroup primitiveGroup, Osmformat.PrimitiveBlock primitiveBlock) { + this.primitiveGroup = primitiveGroup; + this.primitiveBlock = primitiveBlock; } public OsmNode extract(int idx, Osmformat.StringTable stringTable) { @@ -50,37 +39,25 @@ public class DenseNodeExtractor { } private OsmNode parse(int idx, Osmformat.StringTable stringTable) { - long id = nodes.getId(idx) + firstId; - long latitude = nodes.getLat(idx) + firstLat; - long longitude = nodes.getLon(idx) + firstLon; + Osmformat.Node node = primitiveGroup.getNodes(idx); + long id = node.getId(); + long latitude = node.getLat(); + long longitude = node.getLon(); + + long latOffset = primitiveBlock.getLatOffset(); + long lonOffset = primitiveBlock.getLonOffset(); + long granularity = primitiveBlock.getGranularity(); + + // https://wiki.openstreetmap.org/wiki/PBF_Format + // latitude = .000000001 * (lat_offset + (granularity * lat)) + // longitude = .000000001 * (lon_offset + (granularity * lon)) float lat = (float) (.000000001 * (latOffset + (latitude * granularity))); float lon = (float) (.000000001 * (lonOffset + (longitude * granularity))); - firstId = id; - firstLat = latitude; - firstLon = longitude; - - HashMap<String, String> tags = parseTags(stringTable); + HashMap<String, String> tags = + TagsResolver.resolveTags(node.getKeysCount(), node::getKeys, node::getVals, stringTable); return new OsmNode(id, lat, lon, tags); } - - HashMap<String, String> parseTags(Osmformat.StringTable stringTable) { - HashMap<String, String> tags = new HashMap<>(); - - while (nodes.getKeysVals(keyIndex) != 0) { - int key = nodes.getKeysVals(keyIndex); - int value = nodes.getKeysVals(keyIndex + 1); - - String keyString = stringTable.getS(key).toStringUtf8(); - String valueString = stringTable.getS(value).toStringUtf8(); - - tags.put(keyString, valueString); - - keyIndex = keyIndex + 2; - } - - return tags; - } } diff --git a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java index 89a9005771..ff7a2f88d2 100644 --- a/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java +++ b/spark/common/src/main/java/org/apache/sedona/sql/datasources/osmpbf/iterators/BlobIterator.java @@ -27,6 +27,7 @@ import org.apache.sedona.sql.datasources.osmpbf.DenseNodeIterator; import org.apache.sedona.sql.datasources.osmpbf.build.Fileformat.Blob; import org.apache.sedona.sql.datasources.osmpbf.build.Osmformat; import org.apache.sedona.sql.datasources.osmpbf.extractors.DenseNodeExtractor; +import org.apache.sedona.sql.datasources.osmpbf.extractors.NodeExtractor; import org.apache.sedona.sql.datasources.osmpbf.extractors.RelationExtractor; import org.apache.sedona.sql.datasources.osmpbf.extractors.WaysExtractor; import org.apache.sedona.sql.datasources.osmpbf.model.OSMEntity; @@ -68,7 +69,7 @@ public class BlobIterator implements Iterator<OSMEntity> { } if (!currentPrimitiveGroup.getNodesList().isEmpty()) { - return null; + return extractNodePrimitiveGroup(); } if (!currentPrimitiveGroup.getWaysList().isEmpty()) { @@ -86,6 +87,18 @@ public class BlobIterator implements Iterator<OSMEntity> { return null; } + private OSMEntity extractNodePrimitiveGroup() { + osmEntityIdx += 1; + if (currentPrimitiveGroup.getNodesList().size() == osmEntityIdx) { + nextEntity(); + } + + Osmformat.StringTable stringTable = primitiveBlock.getStringtable(); + + return new NodeExtractor(currentPrimitiveGroup, primitiveBlock) + .extract(osmEntityIdx, stringTable); + } + public OSMEntity extractDenseNodePrimitiveGroup() { if (denseNodesIterator == null) { denseNodesIterator = diff --git a/spark/common/src/test/resources/osmpbf/dense.pbf b/spark/common/src/test/resources/osmpbf/dense.pbf new file mode 100644 index 0000000000..24082953a9 Binary files /dev/null and b/spark/common/src/test/resources/osmpbf/dense.pbf differ diff --git a/spark/common/src/test/resources/osmpbf/nodes.pbf b/spark/common/src/test/resources/osmpbf/nodes.pbf new file mode 100644 index 0000000000..c3b3a40e4c Binary files /dev/null and b/spark/common/src/test/resources/osmpbf/nodes.pbf differ diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala b/spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala index 1764e0dfaf..d3f9a52c1c 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/OsmReaderTest.scala @@ -25,8 +25,12 @@ import org.testcontainers.containers.MinIOContainer import java.io.FileInputStream +case class Node(id: Long, latitude: Double, longitude: Double, tags: Map[String, String]) + class OsmReaderTest extends TestBaseScala with Matchers { val monacoPath: String = resourceFolder + "osmpbf/monaco-latest.osm.pbf" + val densePath: String = resourceFolder + "osmpbf/dense.pbf" + val nodesPath: String = resourceFolder + "osmpbf/nodes.pbf" import sparkSession.implicits._ @@ -44,6 +48,40 @@ class OsmReaderTest extends TestBaseScala with Matchers { assert(cnt > 0) } + it("should parse normal nodes") { + sparkSession.read + .format("osmpbf") + .load(nodesPath) + .select("id", "location.*", "tags") + .selectExpr( + "id", + "ROUND(latitude, 2) AS latitude", + "ROUND(longitude, 2) AS longitude", + "tags") + .as[Node] + .collect() should contain theSameElementsAs Array( + Node(1002, 48.86, 2.35, Map("amenity" -> "cafe", "name" -> "Cafe de Paris")), + Node(1003, 30.12, 22.23, Map("amenity" -> "bakery", "name" -> "Delicious Pastries")), + Node(1001, 52.52, 13.40, Map("amenity" -> "restaurant", "name" -> "Curry 36"))) + } + + it("should parse dense nodes") { + sparkSession.read + .format("osmpbf") + .load(densePath) + .select("id", "location.*", "tags") + .selectExpr( + "id", + "ROUND(latitude, 2) AS latitude", + "ROUND(longitude, 2) AS longitude", + "tags") + .as[Node] + .collect() should contain theSameElementsAs Array( + Node(1002, 48.86, 2.35, Map("amenity" -> "cafe", "name" -> "Cafe de Paris")), + Node(1003, 30.12, 22.23, Map("amenity" -> "bakery", "name" -> "Delicious Pastries")), + Node(1001, 52.52, 13.40, Map("amenity" -> "restaurant", "name" -> "Curry 36"))) + } + it("should be able to read from osm file on s3") { val container = new MinIOContainer("minio/minio:latest")
