View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *      https://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19   * This is not the original file distributed by the Apache Software Foundation
20   * It has been modified by the Hipparchus project
21   */
22  package org.hipparchus.clustering;
23  
24  import java.util.ArrayList;
25  import java.util.Collection;
26  import java.util.HashMap;
27  import java.util.HashSet;
28  import java.util.List;
29  import java.util.Map;
30  import java.util.Set;
31  
32  import org.hipparchus.clustering.distance.DistanceMeasure;
33  import org.hipparchus.clustering.distance.EuclideanDistance;
34  import org.hipparchus.exception.LocalizedCoreFormats;
35  import org.hipparchus.exception.MathIllegalArgumentException;
36  import org.hipparchus.exception.NullArgumentException;
37  import org.hipparchus.util.MathUtils;
38  
39  /**
40   * DBSCAN (density-based spatial clustering of applications with noise) algorithm.
41   * <p>
42   * The DBSCAN algorithm forms clusters based on the idea of density connectivity, i.e.
43   * a point p is density connected to another point q, if there exists a chain of
44   * points p<sub>i</sub>, with i = 1 .. n and p<sub>1</sub> = p and p<sub>n</sub> = q,
45   * such that each pair &lt;p<sub>i</sub>, p<sub>i+1</sub>&gt; is directly density-reachable.
46   * A point q is directly density-reachable from point p if it is in the &epsilon;-neighborhood
47   * of this point.
48   * <p>
49   * Any point that is not density-reachable from a formed cluster is treated as noise, and
50   * will thus not be present in the result.
51   * <p>
52   * The algorithm requires two parameters:
53   * <ul>
54   *   <li>eps: the distance that defines the &epsilon;-neighborhood of a point
55   *   <li>minPoints: the minimum number of density-connected points required to form a cluster
56   * </ul>
57   *
58   * @param <T> type of the points to cluster
59   * @see <a href="http://en.wikipedia.org/wiki/DBSCAN">DBSCAN (wikipedia)</a>
60   * @see <a href="http://www.dbs.ifi.lmu.de/Publikationen/Papers/KDD-96.final.frame.pdf">
61   * A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise</a>
62   */
63  public class DBSCANClusterer<T extends Clusterable> extends Clusterer<T> {
64  
65      /** Maximum radius of the neighborhood to be considered. */
66      private final double              eps;
67  
68      /** Minimum number of points needed for a cluster. */
69      private final int                 minPts;
70  
71      /** Status of a point during the clustering process. */
72      private enum PointStatus {
73          /** The point has is considered to be noise. */
74          NOISE,
75          /** The point is already part of a cluster. */
76          PART_OF_CLUSTER
77      }
78  
79      /**
80       * Creates a new instance of a DBSCANClusterer.
81       * <p>
82       * The euclidean distance will be used as default distance measure.
83       *
84       * @param eps maximum radius of the neighborhood to be considered
85       * @param minPts minimum number of points needed for a cluster
86       * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0}
87       */
88      public DBSCANClusterer(final double eps, final int minPts)
89          throws MathIllegalArgumentException {
90          this(eps, minPts, new EuclideanDistance());
91      }
92  
93      /**
94       * Creates a new instance of a DBSCANClusterer.
95       *
96       * @param eps maximum radius of the neighborhood to be considered
97       * @param minPts minimum number of points needed for a cluster
98       * @param measure the distance measure to use
99       * @throws MathIllegalArgumentException if {@code eps < 0.0} or {@code minPts < 0}
100      */
101     public DBSCANClusterer(final double eps, final int minPts, final DistanceMeasure measure)
102         throws MathIllegalArgumentException {
103         super(measure);
104 
105         if (eps < 0.0d) {
106             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, eps, 0);
107         }
108         if (minPts < 0) {
109             throw new MathIllegalArgumentException(LocalizedCoreFormats.NUMBER_TOO_SMALL, minPts, 0);
110         }
111         this.eps = eps;
112         this.minPts = minPts;
113     }
114 
115     /**
116      * Returns the maximum radius of the neighborhood to be considered.
117      * @return maximum radius of the neighborhood
118      */
119     public double getEps() {
120         return eps;
121     }
122 
123     /**
124      * Returns the minimum number of points needed for a cluster.
125      * @return minimum number of points needed for a cluster
126      */
127     public int getMinPts() {
128         return minPts;
129     }
130 
131     /**
132      * Performs DBSCAN cluster analysis.
133      *
134      * @param points the points to cluster
135      * @return the list of clusters
136      * @throws NullArgumentException if the data points are null
137      */
138     @Override
139     public List<Cluster<T>> cluster(final Collection<T> points) throws NullArgumentException {
140 
141         // sanity checks
142         MathUtils.checkNotNull(points);
143 
144         final List<Cluster<T>> clusters = new ArrayList<>();
145         final Map<Clusterable, PointStatus> visited = new HashMap<>();
146 
147         for (final T point : points) {
148             if (visited.get(point) != null) {
149                 continue;
150             }
151             final List<T> neighbors = getNeighbors(point, points);
152             if (neighbors.size() >= minPts) {
153                 // DBSCAN does not care about center points
154                 final Cluster<T> cluster = new Cluster<>();
155                 clusters.add(expandCluster(cluster, point, neighbors, points, visited));
156             } else {
157                 visited.put(point, PointStatus.NOISE);
158             }
159         }
160 
161         return clusters;
162     }
163 
164     /**
165      * Expands the cluster to include density-reachable items.
166      *
167      * @param cluster Cluster to expand
168      * @param point Point to add to cluster
169      * @param neighbors List of neighbors
170      * @param points the data set
171      * @param visited the set of already visited points
172      * @return the expanded cluster
173      */
174     private Cluster<T> expandCluster(final Cluster<T> cluster,
175                                      final T point,
176                                      final List<T> neighbors,
177                                      final Collection<T> points,
178                                      final Map<Clusterable, PointStatus> visited) {
179         cluster.addPoint(point);
180         visited.put(point, PointStatus.PART_OF_CLUSTER);
181 
182         List<T> seeds = new ArrayList<>(neighbors);
183         int index = 0;
184         while (index < seeds.size()) {
185             final T current = seeds.get(index);
186             PointStatus pStatus = visited.get(current);
187             // only check non-visited points
188             if (pStatus == null) {
189                 final List<T> currentNeighbors = getNeighbors(current, points);
190                 if (currentNeighbors.size() >= minPts) {
191                     seeds = merge(seeds, currentNeighbors);
192                 }
193             }
194 
195             if (pStatus != PointStatus.PART_OF_CLUSTER) {
196                 visited.put(current, PointStatus.PART_OF_CLUSTER);
197                 cluster.addPoint(current);
198             }
199 
200             index++;
201         }
202         return cluster;
203     }
204 
205     /**
206      * Returns a list of density-reachable neighbors of a {@code point}.
207      *
208      * @param point the point to look for
209      * @param points possible neighbors
210      * @return the List of neighbors
211      */
212     private List<T> getNeighbors(final T point, final Collection<T> points) {
213         final List<T> neighbors = new ArrayList<>();
214         for (final T neighbor : points) {
215             if (point != neighbor && distance(neighbor, point) <= eps) {
216                 neighbors.add(neighbor);
217             }
218         }
219         return neighbors;
220     }
221 
222     /**
223      * Merges two lists together.
224      *
225      * @param one first list
226      * @param two second list
227      * @return merged lists
228      */
229     private List<T> merge(final List<T> one, final List<T> two) {
230         final Set<T> oneSet = new HashSet<>(one);
231         for (T item : two) {
232             if (!oneSet.contains(item)) {
233                 one.add(item);
234             }
235         }
236         return one;
237     }
238 }