# Cluster a small hand-written set of 2-D points into three groups.
import numpy as np
from sklearn.cluster import KMeans

# Each row is one (x, y) sample.
df = np.array([
    [1, 4], [2, 2], [2, 5], [3, 3], [3, 4],
    [4, 7], [5, 6], [6, 4], [6, 7], [7, 6],
    [7, 9], [8, 7], [8, 9], [9, 4], [9, 8],
])

# k-means++ seeding, 10 restarts, at most 300 iterations per restart.
# NOTE: no random_state is passed, so the labelling is not guaranteed to be
# identical from one run to the next.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10)

# Fit the model and obtain the cluster index assigned to each row of df.
y_pred = kmeans.fit_predict(df)
from sklearn.cluster import KMeans

# Parameter notes
# ---------------
# init controls the initialization technique. The standard version of the
# k-means algorithm is implemented by setting init to "random". Setting this
# to "k-means++" employs an advanced trick to speed up convergence, which
# you'll use later.
#
# n_clusters sets k for the clustering step. This is the most important
# parameter for k-means.
#
# n_init sets the number of initializations to perform. This is important
# because two runs can converge on different cluster assignments. The result
# returned is the run with the lowest SSE. Older scikit-learn releases
# defaulted to ten runs; newer releases default to n_init="auto" (ten runs
# for init="random", one for init="k-means++"), so it is passed explicitly
# here.
#
# max_iter sets the number of maximum iterations for each initialization of
# the k-means algorithm.
kmeans = KMeans(
    init="random",
    n_clusters=3,
    n_init=10,
    max_iter=300,
    random_state=42,
)
kmeans.fit(x_train)  # Replace x_train with your own training dataset

# The lowest SSE value
print(kmeans.inertia_)

# Final locations of the centroid
print(kmeans.cluster_centers_)

# The number of iterations required to converge
print(kmeans.n_iter_)

# First five predicted labels
print(kmeans.labels_[:5])
# NOTE(review): in plotly 4 and later the online `plotly.plotly` module was
# moved to the separate `chart_studio` package (`chart_studio.plotly`), so
# this import is expected to fail there - confirm the installed plotly version.
import plotly.plotly as py
import pandas as pd

# Load the sample point cloud; the 'x', 'y' and 'z' columns are used below.
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/alpha_shape.csv')
# Returns the first rows for inspection; the result is not stored or printed.
df.head()

# Trace 1: the raw points drawn as small 3-D markers.
scatter = dict(
    mode="markers",
    name="y",
    type="scatter3d",
    x=df['x'], y=df['y'], z=df['z'],
    marker=dict(size=2, color="rgb(23, 190, 207)"),
)

# Trace 2: a translucent mesh built from the same points. A positive
# alphahull asks plotly to derive the surface with its alpha-shape algorithm.
clusters = dict(
    alphahull=7,
    name="y",
    opacity=0.1,
    type="mesh3d",
    x=df['x'], y=df['y'], z=df['z'],
)

# Scene layout: hide the zero line on all three axes.
layout = dict(
    title='3d point clustering',
    scene=dict(
        xaxis=dict(zeroline=False),
        yaxis=dict(zeroline=False),
        zaxis=dict(zeroline=False),
    ),
)

fig = dict(data=[scatter, clusters], layout=layout)

# Use py.iplot() for IPython notebook
py.iplot(fig, filename='3d point clustering')
# Function: K Means
# -------------
# K-Means is an algorithm that takes in a dataset and a constant
# k and returns k centroids (which define clusters of data in the
# dataset which are similar to one another).
def kmeans(dataSet, k):
    """Run the k-means loop on ``dataSet`` and return the final centroids.

    Relies on four helpers that are not defined in this block:
    ``getRandomCentroids``, ``shouldStop``, ``getLabels`` and
    ``getCentroids``.

    Args:
        dataSet: Object exposing ``getNumFeatures()``; it is also passed
            unchanged to ``getLabels`` and ``getCentroids``.
        k: Number of centroids, forwarded to ``getRandomCentroids`` and
            ``getCentroids``.

    Returns:
        The value of ``centroids`` at the moment ``shouldStop`` first
        returns a truthy value.
    """
    # Initialize centroids randomly
    numFeatures = dataSet.getNumFeatures()
    centroids = getRandomCentroids(numFeatures, k)

    # Initialize book keeping vars.
    iterations = 0
    oldCentroids = None

    # Run the main k-means algorithm
    while not shouldStop(oldCentroids, centroids, iterations):
        # Save old centroids for convergence test. Book keeping.
        # NOTE(review): this keeps a reference, not a copy - presumably
        # getCentroids returns a new object rather than mutating in place;
        # confirm, otherwise old and new would always compare equal.
        oldCentroids = centroids
        iterations += 1

        # Assign labels to each datapoint based on centroids
        labels = getLabels(dataSet, centroids)

        # Assign centroids based on datapoint labels
        centroids = getCentroids(dataSet, labels, k)

    # We can get the labels too by calling getLabels(dataSet, centroids)
    return centroids