HelloJowet commented on issue #1724:
URL: https://github.com/apache/sedona/issues/1724#issuecomment-2551266890
Thanks for looking into it.
> Can you try running the repro without Sedona but with Kryo serialization enabled?
You're right, the error appears in that case too.
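For reference, this is the shape of the Sedona-free check, as a minimal sketch: it is the working session from step 4 below with only the `spark.serializer` line added (all other catalog settings unchanged).

```py
from pyspark.sql import SparkSession

# Minimal sketch of the Sedona-free reproduction: identical to the working
# session in step 4, except Kryo serialization is enabled explicitly.
spark = (
    SparkSession.builder.master('spark://localhost:7077')
    .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    # ... same spark.jars.packages and my_catalog settings as in step 4 ...
    .getOrCreate()
)
```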
> I have not reproduced this using similar configurations.
Here are some instructions to reproduce the error:
1. Create a `docker-compose.yaml` file:
```yaml
services:
  spark_master:
    restart: always
    image: bitnami/spark:3.5
    ports:
      - 8080:8080
      - 7077:7077
    hostname: spark-master
    environment:
      - SPARK_MODE=master
      - SPARK_RPC_AUTHENTICATION_ENABLED=no
      - SPARK_RPC_ENCRYPTION_ENABLED=no
      - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
      - SPARK_SSL_ENABLED=no
      - SPARK_WORKER_WEBUI_PORT=8081
  spark_worker:
    restart: always
    image: bitnami/spark:3.5
    ports:
      - 8081:8081
    environment:
      - SPARK_MODE=worker
      - SPARK_MASTER_URL=spark://spark-master:7077
      - SPARK_WORKER_MEMORY=8G
      - SPARK_WORKER_CORES=4
      - AWS_ACCESS_KEY_ID=user
      - AWS_SECRET_ACCESS_KEY=password
      - AWS_REGION=us-east-1
    depends_on:
      - spark_master
  storage_s3:
    restart: always
    image: quay.io/minio/minio:RELEASE.2024-10-29T16-01-48Z
    ports:
      - 5560:5560
      - 5561:5561
    hostname: storage-s3
    environment:
      MINIO_ROOT_USER: admin
      MINIO_ROOT_PASSWORD: password
    command: server /data --console-address ":5560" --address=":5561"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5560/minio/health/live"]
      interval: 5s
      timeout: 5s
      retries: 5
  storage_s3_initial_setup:
    image: minio/mc:RELEASE.2024-10-29T15-34-59Z
    depends_on:
      storage_s3:
        condition: service_healthy
    volumes:
      - ./minio_docker_entrypoint.sh:/docker_entrypoint.sh:z
    entrypoint:
      - /docker_entrypoint.sh
  database_postgres:
    restart: always
    image: postgis/postgis:16-3.4
    ports:
      - 5500:5432
    environment:
      POSTGRES_USER: postgres
      POSTGRES_PASSWORD: postgres
```
2. Create a `minio_docker_entrypoint.sh` file (and make it executable, e.g. `chmod +x minio_docker_entrypoint.sh`):
```sh
#!/bin/bash
# Set up alias for MinIO
mc alias set minio http://storage-s3:5561 admin password;
# Create buckets
mc mb minio/data-lakehouse;
```
3. Start Docker Compose: `docker compose up`
4. Example that works (replace `YOUR_IP_ADDRESS` with your IP address):
```py
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder.master('spark://localhost:7077')
    .config(
        'spark.jars.packages',
        'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1,'
        'org.apache.iceberg:iceberg-aws-bundle:1.7.1,'
        'org.postgresql:postgresql:42.7.4',
    )
    # .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .config('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
    .config('spark.sql.catalog.my_catalog', 'org.apache.iceberg.spark.SparkCatalog')
    .config('spark.sql.catalog.my_catalog.type', 'jdbc')
    .config('spark.sql.catalog.my_catalog.uri', 'jdbc:postgresql://localhost:5500/postgres')
    .config('spark.sql.catalog.my_catalog.jdbc.user', 'postgres')
    .config('spark.sql.catalog.my_catalog.jdbc.password', 'postgres')
    .config('spark.sql.catalog.my_catalog.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
    .config('spark.sql.catalog.my_catalog.warehouse', 's3://data-lakehouse')
    .config('spark.sql.catalog.my_catalog.s3.region', 'us-east-1')
    .config('spark.sql.catalog.my_catalog.s3.endpoint', 'http://YOUR_IP_ADDRESS:5561')
    .config('spark.sql.catalog.my_catalog.s3.access-key-id', 'admin')
    .config('spark.sql.catalog.my_catalog.s3.secret-access-key', 'password')
    .getOrCreate()
)

spark.sql('CREATE TABLE my_catalog.table10 (name string) USING iceberg;')
spark.sql("INSERT INTO my_catalog.table10 VALUES ('Alex'), ('Dipankar'), ('Jason')")
```
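(Side note: to confirm the write really succeeded in the working case, a quick read-back with the same `spark` session is enough.)

```py
# Sanity check, assuming the `spark` session from the example above:
# the three inserted rows should come back.
spark.sql('SELECT * FROM my_catalog.table10').show()
```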
5. Example that doesn't work:
```py
from sedona.spark import SedonaContext
spark = (
    SedonaContext.builder()
    .master('spark://localhost:7077')
    .config(
        'spark.jars.packages',
        # sedona
        'org.apache.sedona:sedona-spark-3.5_2.12:1.7.0,'
        'org.datasyslab:geotools-wrapper:1.7.0-28.5,'
        # iceberg
        'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1,'
        'org.apache.iceberg:iceberg-aws-bundle:1.7.1,'
        'org.postgresql:postgresql:42.7.4',
    )
    # .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    .config('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
    .config('spark.sql.catalog.my_catalog', 'org.apache.iceberg.spark.SparkCatalog')
    .config('spark.sql.catalog.my_catalog.type', 'jdbc')
    .config('spark.sql.catalog.my_catalog.uri', 'jdbc:postgresql://localhost:5500/postgres')
    .config('spark.sql.catalog.my_catalog.jdbc.user', 'postgres')
    .config('spark.sql.catalog.my_catalog.jdbc.password', 'postgres')
    .config('spark.sql.catalog.my_catalog.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
    .config('spark.sql.catalog.my_catalog.warehouse', 's3://data-lakehouse')
    .config('spark.sql.catalog.my_catalog.s3.region', 'us-east-1')
    .config('spark.sql.catalog.my_catalog.s3.endpoint', 'http://YOUR_IP_ADDRESS:5561')
    .config('spark.sql.catalog.my_catalog.s3.access-key-id', 'admin')
    .config('spark.sql.catalog.my_catalog.s3.secret-access-key', 'password')
    .config('spark.sql.catalog.my_catalog.s3.path-style-access', 'true')
    .getOrCreate()
)

spark.sql('CREATE TABLE my_catalog.table8 (name string) USING iceberg;')
spark.sql("INSERT INTO my_catalog.table8 VALUES ('Alex'), ('Dipankar'), ('Jason')")
```
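As far as I understand, `SedonaContext.builder()` enables Kryo serialization by default, which would match the Kryo-only reproduction above. An untested sketch of a diagnostic workaround is to override the serializer back after the Sedona defaults (this may break Sedona features that depend on Kryo):

```py
from sedona.spark import SedonaContext

# Untested sketch: a later .config() call overrides an earlier one, so this
# should undo the Kryo serializer that SedonaContext.builder() sets.
# Caveat: Sedona's spatial serialization may rely on Kryo.
spark = (
    SedonaContext.builder()
    .master('spark://localhost:7077')
    # ... same packages and my_catalog settings as in the failing example ...
    .config('spark.serializer', 'org.apache.spark.serializer.JavaSerializer')
    .getOrCreate()
)
```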