|
| 1 | +# -------------------------------------------------------------------------- |
| 2 | +# Source file provided under Apache License, Version 2.0, January 2004, |
| 3 | +# http://www.apache.org/licenses/ |
| 4 | +# (c) Copyright IBM Corp. 2021, 2022 |
| 5 | +# -------------------------------------------------------------------------- |
| 6 | + |
| 7 | +""" |
| 8 | +K-means is a way of clustering points in a multi-dimensional space |
| 9 | +where the set of points to be clustered are partitioned into k subsets. |
| 10 | +The idea is to minimize the inter-point distances inside a cluster in |
| 11 | +order to produce clusters which group together close points. |
| 12 | +
|
| 13 | +See https://en.wikipedia.org/wiki/K-means_clustering |
| 14 | +""" |
| 15 | + |
| 16 | + |
| 17 | +import numpy as np |
| 18 | +from docplex.cp.model import CpoModel |
| 19 | +import docplex.cp.solver.solver as solver |
| 20 | +from docplex.cp.utils import compare_natural |
| 21 | + |
| 22 | +def make_model(coords, k, trust_numerics=True): |
| 23 | + """ |
| 24 | + Build a K-means model from a set of coordinate vectors (points), |
| 25 | + and a given number of clusters k. |
| 26 | +
|
| 27 | + We assign each point to a cluster and minimize the objective which |
| 28 | + is the sum of the squares of the distances of each point to |
| 29 | + the centre of gravity of the cluster to which it belongs. |
| 30 | +
|
| 31 | + Here, there are two ways of building the objective function. One |
| 32 | + uses the sum of squares of the coordinates of points in a cluster |
| 33 | + minus the size of the cluster times the center value. This is akin |
| 34 | + to the calculation of variance vi E[X^2] - E[X]^2. This is the most |
| 35 | + efficient but can be numerically unstable due to massive cancellation. |
| 36 | +
|
| 37 | + The more numerically stable (but less efficient) way to calculate the |
| 38 | + objective is the analog of the variance calculation (sum_i(X_i - mu_i)^2)/n |
| 39 | + """ |
| 40 | + # Sizes and ranges |
| 41 | + n, d = coords.shape |
| 42 | + N, D, K = range(n), range(d), range(k) |
| 43 | + |
| 44 | + # Model, and decision variables. x[c] = cluster to which node c belongs |
| 45 | + mdl = CpoModel() |
| 46 | + x = [mdl.integer_var(0, k-1, "C_{}".format(i)) for i in N] |
| 47 | + |
| 48 | + # Size (number of nodes) in each cluster. If this is zero, we make |
| 49 | + # it 1 to avoid division by zero later (if a particular cluster is |
| 50 | + # not used). |
| 51 | + csize = [mdl.max(1, mdl.count(x, c)) for c in K] |
| 52 | + |
| 53 | + # Calculate total distance squared |
| 54 | + total_dist2 = 0 |
| 55 | + for c in K: # For each cluster |
| 56 | + # Boolean vector saying which points are in this cluster |
| 57 | + included = [x[i] == c for i in N] |
| 58 | + for dim in D: # For each dimension |
| 59 | + # Points for each point in the given dimension (x, y, z, ...) |
| 60 | + point = coords[:, dim] |
| 61 | + |
| 62 | + # Calculate the cluster centre for this dimension |
| 63 | + centre = mdl.scal_prod(included, point) / csize[c] |
| 64 | + |
| 65 | + # Calculate the total distance^2 for this cluster & dimension |
| 66 | + if trust_numerics: |
| 67 | + sum_of_x2 = mdl.scal_prod(included, (p**2 for p in point)) |
| 68 | + dist2 = sum_of_x2 - centre**2 * csize[c] |
| 69 | + else: |
| 70 | + all_dist2 = ((centre - p)**2 for p in point) |
| 71 | + dist2 = mdl.scal_prod(included, all_dist2) |
| 72 | + |
| 73 | + # Keep the total distance squared in a sum |
| 74 | + total_dist2 += dist2 |
| 75 | + |
| 76 | + # Minimize the total distance squared |
| 77 | + mdl.minimize(total_dist2) |
| 78 | + return mdl |
| 79 | + |
| 80 | + |
| 81 | +if __name__ == "__main__": |
| 82 | + import sys |
| 83 | + # Default values |
| 84 | + n, d, k, sd = 500, 2, 5, 1234 |
| 85 | + |
| 86 | + # Accept number of points, number of dimensions, number of clusters, seed |
| 87 | + if len(sys.argv) > 1: |
| 88 | + n = int(sys.argv[1]) |
| 89 | + if len(sys.argv) > 2: |
| 90 | + d = int(sys.argv[2]) |
| 91 | + if len(sys.argv) > 3: |
| 92 | + k = int(sys.argv[3]) |
| 93 | + if len(sys.argv) > 4: |
| 94 | + sd = int(sys.argv[4]) |
| 95 | + |
| 96 | + # Message |
| 97 | + print("Generating with N = {}, D = {}, K = {}".format(n, d, k)) |
| 98 | + |
| 99 | + # Seed and generate coordinates on the unit hypercube |
| 100 | + np.random.seed(sd) |
| 101 | + coords = np.random.uniform(0, 1, size=(n, d)) |
| 102 | + |
| 103 | + # Build model |
| 104 | + mdl = make_model(coords, k) |
| 105 | + |
| 106 | + # Solve using constraint programming |
| 107 | + mdl.solve(SearchType="Restart", TimeLimit=10, LogPeriod=50000) |
| 108 | + |
| 109 | + if compare_natural(solver.get_solver_version(), '22.1') >= 0: |
| 110 | + # Solve using neighborhood search |
| 111 | + mdl.solve(SearchType="Neighborhood", TimeLimit=10, LogPeriod=50000) |
0 commit comments