Hi,
I try to implement a simple distance based clustering in PostgreSQL.
It's working the following way:
1. loop as long as there are non clustered points
2. took one point that is not clustered
3. define this as a new cluster
4. get all surrounding points in a defined distance and add them into the cluster
5. calculate the centroid for each cluster and count cluster points
The calculation speed depends mostly on the count of points and the scale used. In the moment i'am using a python script and it took around 1sec for 1000 points for TMS zoomlevel 5 (distance = 4891.96981025m * 40 pixel) and around 3sec at zoomlevel 15 (distance = 4.78m * 40 pixel). Implementing it with pgsql doesn't speeds it up.
Is it possible to do the work completely with SQL and maybe with just one query? I'am not sure if I understood the WITH RECURSIVE concept completely...but shouldn't it by possible to replace the iteration(2.-4.) with a WITH RECURSIVE statement? Are there any other suggestion to speed it up or a better solution?
Greetings
CREATE TABLE points
(
id_point serial PRIMARY KEY,
geom geometry(Point,3857),
cluster_nr integer
);
INSERT INTO points (geom)
SELECT ST_SetSrid(ST_MakePoint(12.5 + random()*5, 51 + random()*5), 3857)
FROM generate_series(0, 1000);
(
id_point serial PRIMARY KEY,
geom geometry(Point,3857),
cluster_nr integer
);
INSERT INTO points (geom)
SELECT ST_SetSrid(ST_MakePoint(12.5 + random()*5, 51 + random()*5), 3857)
FROM generate_series(0, 1000);
CREATE TABLE pg_clusters
(
gid integer,
cluster_feature_ids integer[],
count integer,
geom geometry
);
(
gid integer,
cluster_feature_ids integer[],
count integer,
geom geometry
);
-- select first point, which isn't clustered yet
SELECT id_point
FROM points
WHERE cluster_nr IS NULL
LIMIT 1;
SELECT id_point
FROM points
WHERE cluster_nr IS NULL
LIMIT 1;
-- use first point and get all surrounding points in defined distance including the point itself
-- use them as first cluster
WITH first_cluster AS (
SELECT a.id_point AS cluster_nr,
b.id_point,
b.geom AS geom
FROM points a,
points b
WHERE ST_Distance(a.geom, b.geom) <= 4891.96981025 * 40
AND a.id_point = 185
AND a.cluster_nr IS NULL
)
UPDATE points AS a
SET cluster_nr = b.cluster_nr
FROM first_cluster AS b
WHERE a.id_point = b.id_point;
WITH first_cluster AS (
SELECT a.id_point AS cluster_nr,
b.id_point,
b.geom AS geom
FROM points a,
points b
WHERE ST_Distance(a.geom, b.geom) <= 4891.96981025 * 40
AND a.id_point = 185
AND a.cluster_nr IS NULL
)
UPDATE points AS a
SET cluster_nr = b.cluster_nr
FROM first_cluster AS b
WHERE a.id_point = b.id_point;
--> choose next id
--> calculate next cluster
etc.
--> calculate next cluster
etc.
-- calculate the centroid of a cluster and count cluster points
DELETE FROM pg_clusters;
WITH clusters AS (
SELECT cluster_nr, array_agg(id_point) AS feature_ids, count(id_point), ST_Centroid(ST_Collect(geom)) AS geom
FROM points
GROUP BY cluster_nr
)
INSERT INTO pg_clusters(gid, cluster_feature_ids, count, geom)
SELECT * FROM clusters;
DELETE FROM pg_clusters;
WITH clusters AS (
SELECT cluster_nr, array_agg(id_point) AS feature_ids, count(id_point), ST_Centroid(ST_Collect(geom)) AS geom
FROM points
GROUP BY cluster_nr
)
INSERT INTO pg_clusters(gid, cluster_feature_ids, count, geom)
SELECT * FROM clusters;
_______________________________________________ postgis-users mailing list [email protected] http://lists.osgeo.org/cgi-bin/mailman/listinfo/postgis-users
