def kmeans(X, k, maxiter, seed=None):
    """
    Cluster the rows of X into k groups with Lloyd's k-means algorithm.

    Parameters
    ----------
    X : 2-d numpy array, shape (n_row, n_col)
        Data to cluster, one sample per row.
    k : int
        Number of clusters; must satisfy 1 <= k <= n_row.
    maxiter : int
        Maximum number of centroid updates to perform. With a value of 0
        (or less) the randomly drawn initial centroids are returned as is.
    seed : int, optional
        When given, it is passed to ``np.random.seed`` before the initial
        centroids are drawn so that runs are reproducible.

    Returns
    -------
    centroids : numpy array, shape (k, n_col)
        Final cluster centers.
    cluster_assignment : numpy array, shape (n_row,)
        For every row of X, the index of the closest returned centroid.

    Raises
    ------
    ValueError
        If k is not between 1 and the number of rows of X.
    """
    # unpacking into two names also rejects input that is not 2-d
    n_row, _ = X.shape
    if not 1 <= k <= n_row:
        raise ValueError(
            'k must be between 1 and the number of rows in X '
            '({}), got {}'.format(n_row, k)
        )

    if seed is not None:
        np.random.seed(seed)

    # randomly choose k *distinct* data points as initial centroids;
    # sampling with replacement could pick the same row twice, which
    # leaves one of the duplicate clusters empty and its mean NaN
    rand_indices = np.random.choice(n_row, size=k, replace=False)
    centroids = X[rand_indices]

    # assign each data point to its closest centroid; doing this once
    # before the loop keeps the result defined even when maxiter is 0
    distances_to_centroids = pairwise_distances(X, centroids, metric='euclidean')
    cluster_assignment = np.argmin(distances_to_centroids, axis=1)

    for _ in range(maxiter):
        # the new centroid of cluster i is the per-feature mean of its
        # members; a cluster that lost all of its members keeps its old
        # centroid instead of turning into NaN
        updated = []
        for i in range(k):
            members = cluster_assignment == i
            if members.any():
                updated.append(X[members].mean(axis=0))
            else:
                updated.append(centroids[i])
        new_centroids = np.array(updated)

        # if the updated centroids are still the same,
        # then the algorithm converged
        if np.array_equal(centroids, new_centroids):
            break

        centroids = new_centroids

        # re-assign against the updated centroids so that the returned
        # labels always correspond to the returned centroids
        distances_to_centroids = pairwise_distances(X, centroids, metric='euclidean')
        cluster_assignment = np.argmin(distances_to_centroids, axis=1)

    return centroids, cluster_assignment
34