d3cf06c2e0
function analogues to ST_ClusterIntersecting and ST_ClusterWithin. New functions return a cluster id within the current window partition, which should be easier to work with than the large collections returned by the older functions.
547 lines
20 KiB
XML
547 lines
20 KiB
XML
<?xml version="1.0" encoding="UTF-8"?>
|
|
<sect1 id="Clustering_Functions">
|
|
<sect1info>
|
|
<abstract>
|
|
<para>These functions implement clustering algorithms for sets of geometries.</para>
|
|
</abstract>
|
|
</sect1info>
|
|
<title>Clustering Functions</title>
|
|
|
|
<refentry id="ST_ClusterDBSCAN">
|
|
<refnamediv>
|
|
<refname>ST_ClusterDBSCAN</refname>
|
|
|
|
<refpurpose>Window function that returns a cluster id for each input geometry using the DBSCAN algorithm.</refpurpose>
|
|
</refnamediv>
|
|
|
|
<refsynopsisdiv>
|
|
<funcsynopsis>
|
|
<funcprototype>
|
|
<funcdef>integer <function>ST_ClusterDBSCAN</function></funcdef>
|
|
|
|
<paramdef><type>geometry winset </type>
|
|
<parameter>geom</parameter></paramdef>
|
|
|
|
<paramdef><type>float8 </type>
|
|
<parameter>eps</parameter></paramdef>
|
|
|
|
<paramdef><type>integer </type>
|
|
<parameter>minpoints</parameter></paramdef>
|
|
</funcprototype>
|
|
</funcsynopsis>
|
|
</refsynopsisdiv>
|
|
|
|
<refsection>
|
|
<title>Description</title>
|
|
|
|
<para>
|
|
Returns a cluster number for each input geometry, based on a 2D implementation of the
|
|
<ulink url="https://en.wikipedia.org/wiki/DBSCAN">Density-based spatial clustering of applications with noise (DBSCAN)</ulink>
|
|
algorithm. Unlike <xref linkend="ST_ClusterKMeans" />, it does not require the number of clusters to be specified, but instead
|
|
uses the desired <link linkend="ST_Distance">distance</link> (<varname>eps</varname>) and density (<varname>minpoints</varname>) parameters to construct each cluster.
|
|
</para>
|
|
|
|
<para>
|
|
An input geometry will be added to a cluster if it is either:
|
|
<itemizedlist>
|
|
<listitem>
|
|
<para>
|
|
A "core" geometry, that is within <varname>eps</varname> <link linkend="ST_Distance">distance</link> of at least <varname>minpoints</varname> input geometries (including itself) or
|
|
</para>
|
|
</listitem>
|
|
<listitem>
|
|
<para>
|
|
A "border" geometry, that is within <varname>eps</varname> <link linkend="ST_Distance">distance</link> of a core geometry.
|
|
</para>
|
|
</listitem>
|
|
</itemizedlist>
|
|
</para>
|
|
|
|
<para>
|
|
Note that border geometries may be within <varname>eps</varname> distance of core geometries in more than one cluster; in this
|
|
case, either assignment would be correct, and the border geometry will be arbitrarily assigned to one of the available clusters.
|
|
In these cases, it is possible for a correct cluster to be generated with fewer than <varname>minpoints</varname> geometries.
|
|
When assignment of a border geometry is ambiguous, repeated calls to ST_ClusterDBSCAN will produce identical results if an ORDER BY
|
|
clause is included in the window definition, but cluster assignments may differ from other implementations of the same algorithm.
|
|
</para>
|
|
|
|
<note><para>
|
|
Input geometries that do not meet the criteria to join any cluster will be assigned a cluster number of NULL.
|
|
</para></note>
|
|
|
|
<para>Availability: 2.3.0</para>
|
|
<para>&curve_support;</para>
|
|
</refsection>
|
|
|
|
<refsection>
|
|
<title>Examples</title>
|
|
<para>
|
|
Assigning a cluster number to polygons that are within 50 meters of each other, requiring at least 2 polygons per cluster.
|
|
</para>
|
|
<informaltable>
|
|
<tgroup cols="2">
|
|
<tbody>
|
|
<row>
|
|
<entry><para><informalfigure>
|
|
<mediaobject>
|
|
<imageobject>
|
|
<imagedata fileref="images/st_clusterdbscan01.png" />
|
|
</imageobject>
|
|
<caption><para>Within 50 meters, at least 2 per cluster. Singletons have NULL for cid.</para></caption>
|
|
</mediaobject>
|
|
</informalfigure>
|
|
<programlisting>SELECT name, ST_ClusterDBSCAN(geom, eps := 50, minpoints := 2) over () AS cid
|
|
FROM boston_polys
|
|
WHERE name > '' AND building > ''
|
|
AND ST_DWithin(geom,
|
|
ST_Transform(
|
|
ST_GeomFromText('POINT(-71.04054 42.35141)', 4326), 26986),
|
|
500);</programlisting>
|
|
</para></entry>
|
|
|
|
<entry><para><screen><![CDATA[ name | cid
|
|
-------------------------------------+--------
|
|
Manulife Tower | 0
|
|
Park Lane Seaport I | 0
|
|
Park Lane Seaport II | 0
|
|
Renaissance Boston Waterfront Hotel | 0
|
|
Seaport Boston Hotel | 0
|
|
Seaport Hotel & World Trade Center | 0
|
|
Waterside Place | 0
|
|
World Trade Center East | 0
|
|
100 Northern Avenue | 1
|
|
100 Pier 4 | 1
|
|
The Institute of Contemporary Art | 1
|
|
101 Seaport | 2
|
|
District Hall | 2
|
|
One Marina Park Drive | 2
|
|
Twenty Two Liberty | 2
|
|
Vertex | 2
|
|
Vertex | 2
|
|
Watermark Seaport | 2
|
|
Blue Hills Bank Pavilion | NULL
|
|
World Trade Center West | NULL
|
|
(20 rows)]]></screen></para>
|
|
</entry>
|
|
</row>
|
|
</tbody>
|
|
</tgroup>
|
|
</informaltable>
|
|
|
|
|
|
<para>
|
|
Combining parcels with the same cluster number into a single geometry. This uses named argument calling.
|
|
</para>
|
|
<programlisting>
|
|
SELECT cid, ST_Collect(geom) AS cluster_geom, array_agg(parcel_id) AS ids_in_cluster FROM (
|
|
SELECT parcel_id, ST_ClusterDBSCAN(geom, eps := 0.5, minpoints := 5) over () AS cid, geom
|
|
FROM parcels) sq
|
|
GROUP BY cid;
|
|
</programlisting>
|
|
</refsection>
|
|
|
|
<refsection>
|
|
<title>See Also</title>
|
|
<para><xref linkend="ST_DWithin"/>,
|
|
<xref linkend="ST_ClusterKMeans"/>,
|
|
<xref linkend="ST_ClusterIntersecting"/>,
|
|
<xref linkend="ST_ClusterWithin"/>,
|
|
<xref linkend="ST_ClusterIntersectingWin"/>
|
|
</para>
|
|
</refsection>
|
|
|
|
</refentry>
|
|
|
|
<refentry id="ST_ClusterIntersectingWin">
|
|
<refnamediv>
|
|
<refname>ST_ClusterIntersectingWin</refname>
|
|
|
|
<refpurpose>Window function that returns a cluster id for each input geometry, clustering input geometries into connected sets.</refpurpose>
|
|
</refnamediv>
|
|
|
|
<refsynopsisdiv>
|
|
<funcsynopsis>
|
|
<funcprototype>
|
|
<funcdef>integer <function>ST_ClusterIntersectingWin</function></funcdef>
|
|
<paramdef><type>geometry winset </type> <parameter>geom</parameter></paramdef>
|
|
</funcprototype>
|
|
</funcsynopsis>
|
|
</refsynopsisdiv>
|
|
|
|
<refsection>
|
|
<title>Description</title>
|
|
|
|
<para>ST_ClusterIntersectingWin is a windowing function that builds inter-connecting clusters of geometries that intersect. It is possible to traverse all geometries in a cluster without leaving the cluster. The return value is the cluster number that the geometry argument participates in, or null for null inputs.</para>
|
|
|
|
<para>Availability: 3.4.0</para>
|
|
</refsection>
|
|
|
|
<refsection>
|
|
<title>Examples</title>
|
|
<programlisting>
|
|
WITH testdata AS (
|
|
SELECT id, geom::geometry FROM (
|
|
VALUES (1, 'LINESTRING (0 0, 1 1)'),
|
|
(2, 'LINESTRING (5 5, 4 4)'),
|
|
(3, 'LINESTRING (6 6, 7 7)'),
|
|
(4, 'LINESTRING (0 0, -1 -1)'),
|
|
(5, 'POLYGON ((0 0, 4 0, 4 4, 0 4, 0 0))')) AS t(id, geom)
|
|
)
|
|
SELECT id,
|
|
ST_AsText(geom),
|
|
ST_ClusterIntersectingWin(geom) OVER () AS cluster
|
|
FROM testdata;
|
|
|
|
id | st_astext | cluster
|
|
----+--------------------------------+---------
|
|
1 | LINESTRING(0 0,1 1) | 0
|
|
2 | LINESTRING(5 5,4 4) | 0
|
|
3 | LINESTRING(6 6,7 7) | 1
|
|
4 | LINESTRING(0 0,-1 -1) | 0
|
|
5 | POLYGON((0 0,4 0,4 4,0 4,0 0)) | 0
|
|
|
|
</programlisting>
|
|
</refsection>
|
|
<refsection>
|
|
<title>See Also</title>
|
|
<para>
|
|
<xref linkend="ST_ClusterDBSCAN" />,
|
|
<xref linkend="ST_ClusterKMeans" />,
|
|
<xref linkend="ST_ClusterWithinWin" />,
|
|
<xref linkend="ST_ClusterWithin" />,
|
|
<xref linkend="ST_ClusterIntersecting" />
|
|
</para>
|
|
</refsection>
|
|
|
|
</refentry>
|
|
|
|
<refentry id="ST_ClusterIntersecting">
|
|
<refnamediv>
|
|
<refname>ST_ClusterIntersecting</refname>
|
|
|
|
<refpurpose>Aggregate function that clusters input geometries into connected sets.</refpurpose>
|
|
</refnamediv>
|
|
|
|
<refsynopsisdiv>
|
|
<funcsynopsis>
|
|
<funcprototype>
|
|
<funcdef>geometry[] <function>ST_ClusterIntersecting</function></funcdef>
|
|
<paramdef><type>geometry set</type> <parameter>g</parameter></paramdef>
|
|
</funcprototype>
|
|
</funcsynopsis>
|
|
</refsynopsisdiv>
|
|
|
|
<refsection>
|
|
<title>Description</title>
|
|
|
|
<para>ST_ClusterIntersecting is an aggregate function that returns an array of GeometryCollections, where each GeometryCollection represents an interconnected set of geometries.</para>
|
|
|
|
<para>Availability: 2.2.0</para>
|
|
</refsection>
|
|
|
|
<refsection>
|
|
<title>Examples</title>
|
|
<programlisting>
|
|
WITH testdata AS
|
|
(SELECT unnest(ARRAY['LINESTRING (0 0, 1 1)'::geometry,
|
|
'LINESTRING (5 5, 4 4)'::geometry,
|
|
'LINESTRING (6 6, 7 7)'::geometry,
|
|
'LINESTRING (0 0, -1 -1)'::geometry,
|
|
'POLYGON ((0 0, 4 0, 4 4, 0 4, 0 0))'::geometry]) AS geom)
|
|
|
|
SELECT ST_AsText(unnest(ST_ClusterIntersecting(geom))) FROM testdata;
|
|
|
|
--result
|
|
|
|
st_astext
|
|
---------
|
|
GEOMETRYCOLLECTION(LINESTRING(0 0,1 1),LINESTRING(5 5,4 4),LINESTRING(0 0,-1 -1),POLYGON((0 0,4 0,4 4,0 4,0 0)))
|
|
GEOMETRYCOLLECTION(LINESTRING(6 6,7 7))
|
|
</programlisting>
|
|
</refsection>
|
|
<refsection>
|
|
<title>See Also</title>
|
|
<para>
|
|
<xref linkend="ST_ClusterDBSCAN" />,
|
|
<xref linkend="ST_ClusterKMeans" />,
|
|
<xref linkend="ST_ClusterWithinWin" />,
|
|
<xref linkend="ST_ClusterWithin" />
|
|
</para>
|
|
</refsection>
|
|
|
|
</refentry>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<refentry id="ST_ClusterKMeans">
|
|
<refnamediv>
|
|
<refname>ST_ClusterKMeans</refname>
|
|
|
|
<refpurpose>Window function that returns a cluster id for each input geometry using the K-means algorithm.</refpurpose>
|
|
</refnamediv>
|
|
|
|
<refsynopsisdiv>
|
|
<funcsynopsis>
|
|
<funcprototype>
|
|
<funcdef>integer <function>ST_ClusterKMeans</function></funcdef>
|
|
|
|
<paramdef><type>geometry winset </type>
|
|
<parameter>geom</parameter></paramdef>
|
|
|
|
<paramdef><type>integer </type>
|
|
<parameter>number_of_clusters</parameter></paramdef>
|
|
|
|
<paramdef><type>float </type>
|
|
<parameter>max_radius</parameter></paramdef>
|
|
</funcprototype>
|
|
</funcsynopsis>
|
|
</refsynopsisdiv>
|
|
|
|
<refsection>
|
|
<title>Description</title>
|
|
|
|
<para>Returns <ulink url="https://en.wikipedia.org/wiki/K-means_clustering">K-means</ulink>
|
|
cluster number for each input geometry. The distance used for clustering is the
|
|
distance between the centroids for 2D geometries, and distance between bounding box centers for 3D geometries.
|
|
For POINT inputs, the M coordinate will be treated as the weight of the input and has to be larger than 0.
|
|
</para>
|
|
<para><varname>max_radius</varname>, if set, will cause ST_ClusterKMeans to generate more clusters than
|
|
<varname>number_of_clusters</varname>, ensuring that no cluster in the output has a radius larger than <varname>max_radius</varname>.
|
|
This is useful in reachability analysis. </para>
|
|
<para>Enhanced: 3.2.0 Support for <varname>max_radius</varname></para>
|
|
<para>Enhanced: 3.1.0 Support for 3D geometries and weights</para>
|
|
<para>Availability: 2.3.0</para>
|
|
</refsection>
|
|
|
|
<refsection>
|
|
<title>Examples</title>
|
|
<para>Generate dummy set of parcels for examples:</para>
|
|
<programlisting>CREATE TABLE parcels AS
|
|
SELECT lpad((row_number() over())::text,3,'0') As parcel_id, geom,
|
|
('{residential, commercial}'::text[])[1 + mod(row_number()OVER(),2)] As type
|
|
FROM
|
|
ST_Subdivide(ST_Buffer('SRID=3857;LINESTRING(40 100, 98 100, 100 150, 60 90)'::geometry,
|
|
40, 'endcap=square'),12) As geom;
|
|
</programlisting>
|
|
|
|
<para><informalfigure>
|
|
<mediaobject>
|
|
<imageobject>
|
|
<imagedata fileref="images/st_clusterkmeans02.png" />
|
|
</imageobject>
|
|
<caption><para>Parcels color-coded by cluster number (cid)</para></caption>
|
|
</mediaobject>
|
|
</informalfigure>
|
|
<programlisting>
|
|
SELECT ST_ClusterKMeans(geom, 3) OVER() AS cid, parcel_id, geom
|
|
FROM parcels;</programlisting>
|
|
<screen> cid | parcel_id | geom
|
|
-----+-----------+---------------
|
|
0 | 001 | 0103000000...
|
|
0 | 002 | 0103000000...
|
|
1 | 003 | 0103000000...
|
|
0 | 004 | 0103000000...
|
|
1 | 005 | 0103000000...
|
|
2 | 006 | 0103000000...
|
|
2 | 007 | 0103000000...
|
|
</screen>
|
|
</para>
|
|
|
|
<para>Partitioning parcel clusters by type:</para>
|
|
<programlisting>
|
|
SELECT ST_ClusterKMeans(geom, 3) over (PARTITION BY type) AS cid, parcel_id, type
|
|
FROM parcels;</programlisting>
|
|
<screen> cid | parcel_id | type
|
|
-----+-----------+-------------
|
|
1 | 005 | commercial
|
|
1 | 003 | commercial
|
|
2 | 007 | commercial
|
|
0 | 001 | commercial
|
|
1 | 004 | residential
|
|
0 | 002 | residential
|
|
2 | 006 | residential
|
|
</screen>
|
|
|
|
<para>Example: Clustering a preaggregated planetary-scale population dataset
|
|
using 3D clustering and weighting.
|
|
Identify at least 20 regions based on
|
|
<ulink url="https://data.humdata.org/dataset/kontur-population-dataset">Kontur Population Data</ulink>
|
|
that do not span more than 3000 km from their center:</para>
|
|
<programlisting>create table kontur_population_3000km_clusters as
|
|
select
|
|
geom,
|
|
ST_ClusterKMeans(
|
|
ST_Force4D(
|
|
ST_Transform(ST_Force3D(geom), 4978), -- cluster in 3D XYZ CRS
|
|
mvalue := population -- set clustering to be weighed by population
|
|
),
|
|
20, -- aim to generate at least 20 clusters
|
|
max_radius := 3000000 -- but generate more to make each under 3000 km radius
|
|
) over () as cid
|
|
from
|
|
kontur_population;
|
|
</programlisting>
|
|
<para><informalfigure>
|
|
<mediaobject>
|
|
<imageobject>
|
|
<imagedata fileref="images/st_clusterkmeans03.png" />
|
|
</imageobject>
|
|
<caption><para>World population clustered to above specs produces 46 clusters.
|
|
Clusters are centered at well-populated regions (New York, Moscow).
|
|
Greenland is one cluster.
|
|
There are island clusters that span across the antimeridian.
|
|
Cluster edges follow Earth's curvature.</para></caption>
|
|
</mediaobject>
|
|
</informalfigure>
|
|
</para>
|
|
|
|
</refsection>
|
|
|
|
<refsection>
|
|
<title>See Also</title>
|
|
<para>
|
|
<xref linkend="ST_ClusterDBSCAN"/>,
|
|
<xref linkend="ST_ClusterIntersectingWin"/>,
|
|
<xref linkend="ST_ClusterWithinWin" />,
|
|
<xref linkend="ST_ClusterIntersecting" />,
|
|
<xref linkend="ST_ClusterWithin" />,
|
|
<xref linkend="ST_Subdivide" />,
|
|
<xref linkend="ST_Force_3D" />,
|
|
<xref linkend="ST_Force_4D" />
|
|
</para>
|
|
</refsection>
|
|
</refentry>
|
|
|
|
<refentry id="ST_ClusterWithin">
|
|
<refnamediv>
|
|
<refname>ST_ClusterWithin</refname>
|
|
|
|
<refpurpose>Aggregate function that clusters input geometries by separation distance.</refpurpose>
|
|
</refnamediv>
|
|
|
|
<refsynopsisdiv>
|
|
<funcsynopsis>
|
|
<funcprototype>
|
|
<funcdef>geometry[] <function>ST_ClusterWithin</function></funcdef>
|
|
<paramdef><type>geometry set </type> <parameter>g</parameter></paramdef>
|
|
<paramdef><type>float8 </type> <parameter>distance</parameter></paramdef>
|
|
</funcprototype>
|
|
</funcsynopsis>
|
|
</refsynopsisdiv>
|
|
|
|
<refsection>
|
|
<title>Description</title>
|
|
|
|
<para>ST_ClusterWithin is an aggregate function that returns an array of GeometryCollections, where each GeometryCollection represents a set of geometries separated by no more than the specified distance. (Distances are Cartesian distances in the units of the SRID.)</para>
|
|
|
|
<para>Availability: 2.2.0</para>
|
|
<para>&curve_support;</para>
|
|
</refsection>
|
|
|
|
<refsection>
|
|
<title>Examples</title>
|
|
<programlisting>
|
|
WITH testdata AS
|
|
(SELECT unnest(ARRAY['LINESTRING (0 0, 1 1)'::geometry,
|
|
'LINESTRING (5 5, 4 4)'::geometry,
|
|
'LINESTRING (6 6, 7 7)'::geometry,
|
|
'LINESTRING (0 0, -1 -1)'::geometry,
|
|
'POLYGON ((0 0, 4 0, 4 4, 0 4, 0 0))'::geometry]) AS geom)
|
|
|
|
SELECT ST_AsText(unnest(ST_ClusterWithin(geom, 1.4))) FROM testdata;
|
|
|
|
--result
|
|
|
|
st_astext
|
|
---------
|
|
GEOMETRYCOLLECTION(LINESTRING(0 0,1 1),LINESTRING(5 5,4 4),LINESTRING(0 0,-1 -1),POLYGON((0 0,4 0,4 4,0 4,0 0)))
|
|
GEOMETRYCOLLECTION(LINESTRING(6 6,7 7))
|
|
</programlisting>
|
|
</refsection>
|
|
<refsection>
|
|
<title>See Also</title>
|
|
<para>
|
|
<xref linkend="ST_ClusterDBSCAN" />,
|
|
<xref linkend="ST_ClusterKMeans" />,
|
|
<xref linkend="ST_ClusterIntersectingWin"/>,
|
|
<xref linkend="ST_ClusterWithinWin"/>,
|
|
<xref linkend="ST_ClusterIntersecting"/>
|
|
</para>
|
|
</refsection>
|
|
|
|
</refentry>
|
|
|
|
<refentry id="ST_ClusterWithinWin">
|
|
<refnamediv>
|
|
<refname>ST_ClusterWithinWin</refname>
|
|
|
|
<refpurpose>Window function that returns a cluster id for each input geometry, clustering using separation distance.</refpurpose>
|
|
</refnamediv>
|
|
|
|
<refsynopsisdiv>
|
|
<funcsynopsis>
|
|
<funcprototype>
|
|
<funcdef>integer <function>ST_ClusterWithinWin</function></funcdef>
|
|
<paramdef><type>geometry winset </type> <parameter>geom</parameter></paramdef>
|
|
<paramdef><type>float8 </type> <parameter>distance</parameter></paramdef>
|
|
</funcprototype>
|
|
</funcsynopsis>
|
|
</refsynopsisdiv>
|
|
|
|
<refsection>
|
|
<title>Description</title>
|
|
|
|
<para>ST_ClusterWithinWin is a window function that returns an integer for each input geometry, where the integer is the cluster number the geometry is a member of. Clusters represent a set of input geometries separated by no more than the specified distance. (Distances are Cartesian distances in the units of the SRID.)
|
|
</para>
|
|
<para>ST_ClusterWithinWin is equivalent to running <xref linkend="ST_ClusterDBSCAN" /> with a <varname>minpoints</varname> of zero.</para>
|
|
<para>Availability: 3.4.0</para>
|
|
<para>&curve_support;</para>
|
|
|
|
</refsection>
|
|
|
|
<refsection>
|
|
<title>Examples</title>
|
|
<programlisting>
|
|
WITH testdata AS (
|
|
SELECT id, geom::geometry FROM (
|
|
VALUES (1, 'LINESTRING (0 0, 1 1)'),
|
|
(2, 'LINESTRING (5 5, 4 4)'),
|
|
(3, 'LINESTRING (6 6, 7 7)'),
|
|
(4, 'LINESTRING (0 0, -1 -1)'),
|
|
(5, 'POLYGON ((0 0, 4 0, 4 4, 0 4, 0 0))')) AS t(id, geom)
|
|
)
|
|
SELECT id,
|
|
ST_AsText(geom),
|
|
ST_ClusterWithinWin(geom, 1.4) OVER () AS cluster
|
|
FROM testdata;
|
|
|
|
|
|
id | st_astext | cluster
|
|
----+--------------------------------+---------
|
|
1 | LINESTRING(0 0,1 1) | 0
|
|
2 | LINESTRING(5 5,4 4) | 0
|
|
3 | LINESTRING(6 6,7 7) | 1
|
|
4 | LINESTRING(0 0,-1 -1) | 0
|
|
5 | POLYGON((0 0,4 0,4 4,0 4,0 0)) | 0
|
|
|
|
</programlisting>
|
|
</refsection>
|
|
<refsection>
|
|
<title>See Also</title>
|
|
<para>
|
|
<xref linkend="ST_ClusterDBSCAN" />,
|
|
<xref linkend="ST_ClusterKMeans" />,
|
|
<xref linkend="ST_ClusterIntersectingWin"/>,
|
|
<xref linkend="ST_ClusterWithin"/>,
|
|
<xref linkend="ST_ClusterIntersecting"/>
|
|
</para>
|
|
</refsection>
|
|
|
|
</refentry>
|
|
|
|
</sect1>
|