1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98
| from __future__ import division import random import numpy as np from scipy.spatial.distance import cdist from scipy.sparse import issparse
def kmeans( X, centres, delta=.001, maxiter=10, metric="euclidean", p=2, verbose=1 ): if not issparse(X): X = np.asanyarray(X) centres = centres.todense() if issparse(centres) else centres.copy() N, dim = X.shape k, cdim = centres.shape if dim != cdim: raise ValueError( "kmeans: X %s and centres %s must have the same number of columns" % ( X.shape, centres.shape )) if verbose: print "kmeans: X %s centres %s delta=%.2g maxiter=%d metric=%s" % ( X.shape, centres.shape, delta, maxiter, metric) allx = np.arange(N) prevdist = 0 for jiter in range( 1, maxiter+1 ): D = cdist_sparse( X, centres, metric=metric, p=p ) xtoc = D.argmin(axis=1) distances = D[allx,xtoc] avdist = distances.mean() if verbose >= 2: print "kmeans: av |X - nearest centre| = %.4g" % avdist if (1 - delta) * prevdist <= avdist <= prevdist or jiter == maxiter: break prevdist = avdist for jc in range(k): c = np.where( xtoc == jc )[0] if len(c) > 0: centres[jc] = X[c].mean( axis=0 ) if verbose: print "kmeans: %d iterations cluster sizes:" % jiter, np.bincount(xtoc) if verbose >= 2: r50 = np.zeros(k) r90 = np.zeros(k) for j in range(k): dist = distances[ xtoc == j ] if len(dist) > 0: r50[j], r90[j] = np.percentile( dist, (50, 90) ) print "kmeans: cluster 50 % radius", r50.astype(int) print "kmeans: cluster 90 % radius", r90.astype(int) return centres, xtoc, distances
def kmeanssample( X, k, nsample=0, **kwargs ): N, dim = X.shape if nsample == 0: nsample = max( 2*np.sqrt(N), 10*k ) Xsample = randomsample( X, int(nsample) ) pass1centres = randomsample( X, int(k) ) samplecentres = kmeans( Xsample, pass1centres, **kwargs )[0] return kmeans( X, samplecentres, **kwargs )
def cdist_sparse( X, Y, **kwargs ): sxy = 2*issparse(X) + issparse(Y) if sxy == 0: return cdist( X, Y, **kwargs ) d = np.empty( (X.shape[0], Y.shape[0]), np.float64 ) if sxy == 2: for j, x in enumerate(X): d[j] = cdist( x.todense(), Y, **kwargs ) [0] elif sxy == 1: for k, y in enumerate(Y): d[:,k] = cdist( X, y.todense(), **kwargs ) [0] else: for j, x in enumerate(X): for k, y in enumerate(Y): d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0] return d
def randomsample( X, n ): sampleix = random.sample( xrange( X.shape[0] ), int(n) ) return X[sampleix]
def nearestcentres( X, centres, metric="euclidean", p=2 ): D = cdist( X, centres, metric=metric, p=p ) return D.argmin(axis=1)
def Lqmetric( x, y=None, q=.5 ): return (np.abs(x - y) ** q) .mean() if y is not None else (np.abs(x) ** q) .mean()
class Kmeans: def __init__( self, X, k=0, centres=None, nsample=0, **kwargs ): self.X = X if centres is None: self.centres, self.Xtocentre, self.distances = kmeanssample( X, k=k, nsample=nsample, **kwargs ) else: self.centres, self.Xtocentre, self.distances = kmeans( X, centres, **kwargs )
def __iter__(self): for jc in range(len(self.centres)): yield jc, (self.Xtocentre == jc)
|