# Copyright 2011 Tom SF Haines
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
import sys
import math
import random

import numpy
import numpy.random
import scipy.special

from gcp import * # Provides GaussianPrior, GaussianInc and StudentT, which are used below.

class DPGMM:
"""A Dirichlet process Gaussian mixture model, implimented using the mean-field variational method, with the stick tying rather than capping method such that incrimental usage works. For those unfamiliar with Dirichlet processes the key thing to realise is that each stick corresponds to a mixture component for density estimation or a cluster for clustering, so the stick cap is the maximum number of these entities (Though it can choose to use less sticks than supplied.). As each stick has a computational cost standard practise is to start with a low number of sticks and then increase the count, updating the model each time, until no improvement to the model is obtained with further sticks. Note that because the model is fully Bayesian the output is in fact a probability distribution over the probability distribution from which the data is drawn, i.e. instead of the point estimate of a Gaussian mixture model you get back a probability distribution from which you can draw a Gaussian mixture model, though shortcut methods are provided to get the probability/component membership probability of features. Regarding the model it actually has an infinite number of sticks, but as computers don't have an infinite amount of memory or computation the sticks count is capped. The infinite number of sticks past the cap are still modeled in a limited manor, such that you can get the probability in clustering of a sample/feature belonging to an unknown cluster (i.e. one of the infinite number past the stick cap.). It also means that there is no explicit cluster count, as it is modelling a probability distribution over the number of clusters - if you need a cluster count the best choice is to threshold on the component weights, as returned by sampleMixture(...) or intMixture(...), i.e. keep only thise that are higher than a percentage of the highest weight."""
  def __init__(self, dims, stickCap = 1):
    """You initialise with the number of dimensions and the cap on the number of sticks to have. Note that the stick cap should be high enough to represent enough components, but not so high that you run out of memory. The better option is to set the stick cap to 1 (the default) and use the solve-grow methods, which in effect find the right number of sticks. Alternatively, if only given one parameter of the same type it acts as a copy constructor."""
    if isinstance(dims, DPGMM):
      self.dims = dims.dims
      self.stickCap = dims.stickCap

      self.data = map(lambda x: x.copy(), dims.data)
      self.prior = GaussianPrior(dims.prior)
      self.priorT = StudentT(dims.priorT) if dims.priorT is not None else None

      self.n = map(lambda x: GaussianPrior(x), dims.n)
      self.beta = dims.beta.copy()
      self.alpha = dims.alpha.copy()
      self.v = dims.v.copy()
      self.z = None if dims.z is None else dims.z.copy()

      self.skip = dims.skip
      self.epsilon = dims.epsilon

      self.nT = map(lambda x: None if x is None else StudentT(x), dims.nT)
      self.vExpLog = dims.vExpLog.copy()
      self.vExpNegLog = dims.vExpNegLog.copy()
    else:
      self.dims = dims
      self.stickCap = stickCap

      self.data = [] # A list containing data matrices - used to collate all the samples ready for processing. Before processing they are all appended into a single data matrix, such that this list is of length 1.

      self.prior = GaussianPrior(self.dims) # The prior over the mixture components.
      self.priorT = None

      self.n = map(lambda _: GaussianPrior(self.dims), xrange(self.stickCap)) # The mixture components, one for each stick.
      self.beta = numpy.ones(2, dtype=numpy.float32) # The two parameters (typically named alpha & beta) of the Gamma distribution prior over the concentration.
      self.alpha = numpy.ones(2, dtype=numpy.float32) # The parameters of the Gamma distribution that represents the current distribution over the concentration - basically beta updated with the current stick configuration.
      self.v = numpy.ones((self.stickCap,2), dtype=numpy.float32) # Each [i,:] of this array holds the two parameters of the beta distribution over the stick breaking weight for the relevant mixture component.
      self.z = None # The matrix of multinomials over stick assignment for each sample, aligned with the data matrix. In the case of incremental use it will not necessarily be complete.

      self.skip = 0 # Number of samples at the start of the data matrix to not bother updating - useful to speed things up with incremental learning.
      self.epsilon = 1e-4 # Amount of change below which it stops iterating.

      # The cache of stuff kept around for speed...
      self.nT = [None]*self.stickCap # The student-t distribution associated with each Gaussian.
      self.vExpLog = numpy.empty(self.stickCap, dtype=numpy.float32) # The expected value of the logarithm of each v.
      self.vExpNegLog = numpy.empty(self.stickCap, dtype=numpy.float32) # The expected value of the logarithm of 1 minus each v.

      self.vExpLog[:] = -1.0 # These need to always be correct - this matches the initialisation of v.
      self.vExpNegLog[:] = -1.0 # As above.

  def incStickCap(self, inc = 1):
    """Increases the stick cap by the given number of entries. Can be used in collaboration with nllData to increase the number of sticks until there is insufficient improvement, indicating the right number has been found."""
    self.stickCap += inc

    self.n += map(lambda _: GaussianPrior(self.dims), xrange(inc))
    self.v = numpy.append(self.v, numpy.ones((inc,2), dtype=numpy.float32), 0)

    if self.z is not None:
      self.z = numpy.append(self.z, numpy.random.mtrand.dirichlet(32.0*numpy.ones(inc), size=self.z.shape[0]), 1)
      weight = numpy.random.mtrand.dirichlet(numpy.ones(2), size=self.z.shape[0])
      self.z[:,:self.stickCap-inc] *= weight[:,0].reshape((self.z.shape[0],1))
      self.z[:,self.stickCap-inc:] *= weight[:,1].reshape((self.z.shape[0],1))

    self.nT += [None] * inc

    self.vExpLog = numpy.append(self.vExpLog, -1.0*numpy.ones(inc, dtype=numpy.float32))
    self.vExpNegLog = numpy.append(self.vExpNegLog, -1.0*numpy.ones(inc, dtype=numpy.float32))

  def getStickCap(self):
    """Returns the current stick cap."""
    return self.stickCap

  def setPrior(self, mean = None, covar = None, weight = None, scale = 1.0):
    """Sets a prior for the mixture components - basically a pass-through for the addPrior method of the GaussianPrior class. If None (the default) is provided for the mean or the covar then it calculates these values from the currently contained sample set and uses them. Note that the prior defaults to nothing - this method must be called before fitting the model, and if mean/covar are not provided then there must be enough data points to avoid problems. weight defaults to the number of dimensions if not specified. If covar is not given then scale is a multiplier for the covariance matrix - setting it high will soften the prior and make it consider broader solutions when given less data. Returns True on success, False on failure - failure can happen if there is not enough data contained for automatic calculation (think singular covariance matrix). This must be called before any solve methods are called."""
    # Handle mean/covar being None...
    if mean is None or covar is None:
      inc = GaussianInc(self.dims)
      dm = self.getDM()
      for i in xrange(dm.shape[0]): inc.add(dm[i,:])
      ggd = inc.fetch()

      if mean is None: mean = ggd.getMean()
      if covar is None: covar = ggd.getCovariance() * scale
      if numpy.linalg.det(covar)<1e-12: return False

    # Update the prior...
    self.prior.reset()
    self.prior.addPrior(mean, covar, weight)
    self.priorT = self.prior.intProb()

    return True

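  # An illustrative explicit-prior call - the zero mean and the 10x identity covariance are
  # arbitrary example values chosen for this sketch, not defaults of the method:
  #
  #   model.setPrior(mean = numpy.zeros(model.dims), covar = 10.0*numpy.identity(model.dims))
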
  def setConcGamma(self, alpha, beta):
    """Sets the parameters for the Gamma prior over the concentration. Note that whilst alpha and beta are used as the parameter names, in accordance with standard terminology for Gamma distributions, they are not related to the model variable names. Default values are (1,1). The concentration parameter controls how much information the model requires to justify using a stick, such that lower values result in fewer sticks and higher values in more sticks. The concentration parameter is learnt from the data, under the Gamma distribution prior set with this method, but this prior can still have a strong effect. If your data is not producing as many clusters as you expect then adjust this parameter accordingly (e.g. increase alpha or decrease beta, or both), but don't go too high or it will start hallucinating patterns where none exist!"""
    self.beta[0] = alpha
    self.beta[1] = beta

  def setThreshold(self, epsilon):
    """Sets the threshold for parameter change below which the model is considered to have converged, and iteration stops."""
    self.epsilon = epsilon

  def add(self, sample):
    """Adds either a single sample or several samples - either give a single sample as a 1D array or several as a 2D data matrix, where each sample is [i,:]. (Sample = feature. They are referred to as samples as that more effectively matches the concept of modelling the probability distribution from which the features are drawn.)"""
    sample = numpy.asarray(sample, dtype=numpy.float32)
    if len(sample.shape)==1:
      self.data.append(numpy.reshape(sample, (1,self.dims)))
    else:
      assert(len(sample.shape)==2)
      assert(sample.shape[1]==self.dims)
      self.data.append(sample)

  def getDM(self):
    """Returns a data matrix containing all the samples that have been added."""
    if len(self.data)==1: return self.data[0]
    if len(self.data)==0: return None

    self.data = [numpy.vstack(self.data)]
    return self.data[0]

  def size(self):
    """Returns the number of samples that have been added."""
    dm = self.getDM()
    if dm is not None: return dm.shape[0]
    else: return 0

  def lock(self, num=0):
    """Prevents the algorithm from updating the component weighting for the first num samples in the database - potentially useful for incremental learning if in a rush. If set to 0, the default, everything is updated."""
    self.skip = num

  def solve(self, iterCap=None):
    """Iterates updating the parameters until the model has converged. Note that the system is designed such that you can converge, add more samples, and converge again, i.e. incremental learning. Alternatively you can converge, add more sticks, and then converge again without issue, which makes finding the correct number of sticks computationally reasonable. Returns the number of iterations required to achieve convergence. You can optionally provide a cap on the number of iterations it will perform."""
    # Deal with the z array being incomplete - enlarge/create as needed. Random initialisation is used...
    dm = self.getDM()
    if self.z is None or self.z.shape[0]<dm.shape[0]:
      newZ = numpy.empty((dm.shape[0],self.stickCap), dtype=numpy.float32)
      if self.z is None: offset = 0
      else:
        offset = self.z.shape[0]
        newZ[:offset,:] = self.z
      self.z = newZ
      self.z[offset:,:] = numpy.random.mtrand.dirichlet(32.0*numpy.ones(self.stickCap), size=self.z.shape[0]-offset) # 32 is to avoid extreme values, which can lock it in place, without the distribution being so flat as to cause problems converging.

    # Iterate until convergence...
    prev = self.z.copy()
    iters = 0
    sys.stderr.write("components: %2d\n"%(self.stickCap))
    while True:
      iters += 1

      # Update the concentration parameter...
      self.alpha[0] = self.beta[0] + self.stickCap
      self.alpha[1] = self.beta[1] - self.vExpNegLog.sum()

      # Record the expected values of a stick given the prior alone - needed to normalise the z values...
      expLogStick = -scipy.special.psi(1.0 + self.alpha[0]/self.alpha[1])
      expNegLogStick = expLogStick
      expLogStick += scipy.special.psi(1.0)
      expNegLogStick += scipy.special.psi(self.alpha[0]/self.alpha[1])

      # Update the stick breaking weights...
      self.v[:,0] = 1.0
      self.v[:,1] = self.alpha[0]/self.alpha[1]

      sums = self.z.sum(axis=0)
      self.v[:,0] += sums
      self.v[:,1] += self.z.shape[0]
      self.v[:,1] -= numpy.cumsum(sums)

      # Calculate the log expectations of the stick breaking weights...
      self.vExpLog[:] = -scipy.special.psi(self.v.sum(axis=1))
      self.vExpNegLog[:] = self.vExpLog
      self.vExpLog[:] += scipy.special.psi(self.v[:,0])
      self.vExpNegLog[:] += scipy.special.psi(self.v[:,1])

      # Update the Gaussian conjugate priors, extracting the student-t distributions as well...
      for k in xrange(self.stickCap):
        self.n[k].reset()
        self.n[k].addGP(self.prior)
        self.n[k].addSamples(dm, self.z[:,k])
        self.nT[k] = self.n[k].intProb()

      # Update the z values...
      prev[self.skip:,:] = self.z[self.skip:,:]

      vExpNegLogCum = self.vExpNegLog.cumsum()
      base = self.vExpLog.copy()
      base[1:] += vExpNegLogCum[:-1]
      self.z[self.skip:,:] = numpy.exp(base).reshape((1,self.stickCap))

      for k in xrange(self.stickCap):
        self.z[self.skip:,k] *= self.nT[k].batchProb(dm[self.skip:,:])

      norm = self.priorT.batchProb(dm[self.skip:,:])
      norm *= math.exp(expLogStick + vExpNegLogCum[-1]) / (1.0 - math.exp(expNegLogStick))

      self.z[self.skip:,:] /= (self.z[self.skip:,:].sum(axis=1) + norm).reshape((self.z.shape[0]-self.skip,1))

      # Check for convergence...
      change = numpy.abs(prev[self.skip:,:]-self.z[self.skip:,:]).sum(axis=1).max()
      if change<self.epsilon:
        break
      if iters==iterCap:
        break

    # Return the number of iterations that were required to achieve convergence...
    return iters

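  # Informal summary of the mean-field updates that solve() cycles through, reconstructed
  # from the code above rather than quoted from a reference - treat it as a sketch. With
  # K = stickCap, psi = the digamma function and r_nk = self.z[n,k]:
  #
  #   q(concentration) has parameters alpha[0] = beta[0] + K and
  #     alpha[1] = beta[1] - sum_k E[log(1-v_k)], with mean alpha[0]/alpha[1].
  #
  #   q(v_k) is Beta(v[k,0], v[k,1]) with
  #     v[k,0] = 1 + sum_n r_nk
  #     v[k,1] = E[concentration] + sum_n sum_{j>k} r_nj
  #   and the cached expectations are
  #     E[log v_k]    = psi(v[k,0]) - psi(v[k,0]+v[k,1])
  #     E[log(1-v_k)] = psi(v[k,1]) - psi(v[k,0]+v[k,1])
  #
  #   q(Gaussian_k) is the conjugate prior self.n[k], i.e. the global prior updated with the
  #   data weighted by r_nk, and self.nT[k] is its integrated-out student-t density.
  #
  #   r_nk is then proportional to exp(E[log v_k] + sum_{j<k} E[log(1-v_j)]) * nT_k(x_n),
  #   normalised together with an extra term covering all the sticks past the cap.
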
  def solveGrow(self, iterCap=None, maxStick=None):
    """This method works by solving for the current stick cap, and then it keeps increasing the stick cap until there is no longer an improvement in the model quality. If using this method you should probably initialise with a stick cap of 1. By using this method the stick cap parameter no longer matters, and you do not have to guess a good value for it."""
    it = 0
    prev = None

    while True:
      sys.stderr.write("number of sticks:"+str(self.getStickCap())+"\n")
      it += self.solve(iterCap)
      value = self.nllData()
      sys.stderr.write("logL(data):"+str(value)+"\n")

      if maxStick is not None:
        if self.getStickCap()>=maxStick:
          return it

      if prev is None or value>prev:
        prev = value
        self.incStickCap()
      else: return it

  def sampleMixture(self):
    """Once solve has been called and a distribution over models determined this allows you to draw a specific model. Returns a 2-tuple, where the first entry is an array of weights and the second entry a list of Gaussian distributions - they line up, to give a specific Gaussian mixture model. For density estimation the probability of a specific point is then the sum of each weight multiplied by the probability of it coming from the associated Gaussian. For clustering the probability of a specific point belonging to a cluster is the weight multiplied by the probability of it coming from the specific Gaussian, normalised over all clusters. Note that this includes an additional term to cover the infinite number of sticks that follow, which is really an approximation, but tends to be such a small amount as to not matter. Be warned that when clustering a point could be assigned to this 'null cluster', indicating that the model thinks the point belongs to an unknown cluster (i.e. one that it doesn't have enough information, or possibly sticks, to instantiate)."""
    weight = numpy.empty(self.stickCap+1, dtype=numpy.float32)
    stick = 1.0
    for i in xrange(self.stickCap):
      val = random.betavariate(self.v[i,0], self.v[i,1])
      weight[i] = stick * val
      stick *= 1.0 - val
    weight[-1] = stick

    gauss = map(lambda x: x.sample(), self.n)
    gauss.append(self.prior.sample())

    return (weight,gauss)

  def intMixture(self):
    """Returns the details needed to calculate the probability of a point given the model (density estimation), or its probability of belonging to each stick (clustering), but with the actual draw of a mixture model from the model integrated out. It is an approximation, though not a bad one. Basically you get a 2-tuple - the first entry is an array of weights, the second a list of student-t distributions. The weights and distributions align, such that for density estimation the probability of a point is the sum over all entries of the weight multiplied by the probability of the sample coming from the student-t distribution. The prob method of this class calculates this for a sample directly. For clustering the probability of belonging to each cluster is calculated as the weight multiplied by the probability of coming from the associated student-t, noting that you will need to normalise. stickProb gives this assessment directly. Do not edit the returned value; also, it will not persist if solve is called again. This must only be called after solve has been called at least once. Note that an extra element is included to cover the remainder of the infinite number of elements - be warned that a sample could have the highest probability of belonging to this dummy element, indicating that it probably belongs to something for which there is not enough data to infer a reasonable model."""
    weights = numpy.empty(self.stickCap+1, dtype=numpy.float32)
    stick = 1.0
    for i in xrange(self.stickCap):
      ev = self.v[i,0] / self.v[i,:].sum()
      weights[i] = stick * ev
      stick *= 1.0 - ev
    weights[-1] = stick

    return (weights, self.nT + [self.priorT])

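  # A sketch of the cluster-count heuristic mentioned in the class docstring - the name
  # 'model' and the 5% threshold are illustrative choices, not part of the API:
  #
  #   weights, studentTs = model.intMixture()
  #   count = (weights[:-1] > 0.05*weights.max()).sum()
  #
  # i.e. keep only the explicitly modelled sticks whose expected weight is a reasonable
  # fraction of the largest, and treat that as the effective number of clusters.
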
  def prob(self, x):
    """Given a sample this returns its probability, with the actual draw from the model integrated out. Must not be called until after solve has been called. This is the density estimate if using this model for density estimation. Will also accept a data matrix, in which case it will return a 1D array of probabilities aligning with the input data matrix."""
    x = numpy.asarray(x)

    if len(x.shape)==1:
      ret = 0.0
      stick = 1.0
      for i in xrange(self.stickCap):
        bp = self.nT[i].prob(x)
        ev = self.v[i,0] / self.v[i,:].sum()
        ret += bp * stick * ev
        stick *= 1.0 - ev
      bp = self.priorT.prob(x)
      ret += bp * stick
      return ret
    else:
      ret = numpy.zeros(x.shape[0])
      stick = 1.0
      for i in xrange(self.stickCap):
        bp = self.nT[i].batchProb(x)
        ev = self.v[i,0] / self.v[i,:].sum()
        ret += bp * stick * ev
        stick *= 1.0 - ev
      bp = self.priorT.batchProb(x)
      ret += bp * stick
      return ret

  def stickProb(self, x):
    """Given a sample this returns its probability of belonging to each of the components, as a 1D array, including a dummy element at the end to cover the infinite number of sticks not being explicitly modeled. This is the probability of belonging to each cluster if using the model for clustering. Must not be called until after solve has been called. Will also accept a data matrix, in which case it will return a matrix with a row for each vector in the input data matrix."""
    x = numpy.asarray(x)

    if len(x.shape)==1:
      ret = numpy.empty(self.stickCap+1, dtype=numpy.float32)

      stick = 1.0
      for i in xrange(self.stickCap):
        bp = self.nT[i].prob(x)
        ev = self.v[i,0] / self.v[i,:].sum()
        ret[i] = bp * stick * ev
        stick *= 1.0 - ev
      bp = self.priorT.prob(x)
      ret[self.stickCap] = bp * stick

      ret /= ret.sum()
      return ret
    else:
      ret = numpy.empty((x.shape[0],self.stickCap+1), dtype=numpy.float32)

      stick = 1.0
      for i in xrange(self.stickCap):
        bp = self.nT[i].batchProb(x)
        ev = self.v[i,0] / self.v[i,:].sum()
        ret[:,i] = bp * stick * ev
        stick *= 1.0 - ev
      bp = self.priorT.batchProb(x)
      ret[:,self.stickCap] = bp * stick

      ret /= ret.sum(axis=1).reshape((-1,1))
      return ret

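  # For a hard clustering you can take the most probable stick per sample - an illustrative
  # one-liner using the matrix form of stickProb on the added data:
  #
  #   labels = model.stickProb(model.getDM()).argmax(axis=1)
  #
  # where a label equal to the stick cap means the sample was assigned to the dummy
  # 'unknown cluster' element.
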
  def reset(self, alphaParam = True, vParam = True, zParam = True):
    """Allows you to reset the parameters associated with each variable - good for doing a restart if you're doing multiple restarts, or if you suspect it has got stuck in a local minimum whilst working incrementally."""
    if alphaParam:
      self.alpha[:] = 1.0

    if vParam:
      self.v[:,:] = 1.0
      self.vExpLog[:] = -1.0
      self.vExpNegLog[:] = -1.0

    if zParam:
      self.z = None

  def nllData(self):
    """Returns the negative log likelihood of the data given the current distribution over models, with the model integrated out - good for comparing multiple restarts/different numbers of sticks to find which is the best."""
    dm = self.getDM()
    model = self.intMixture()

    probs = numpy.empty((dm.shape[0],model[0].shape[0]), dtype=numpy.float32)
    for i, st in enumerate(model[1]):
      probs[:,i] = st.batchLogProb(dm) + math.log(model[0][i])

    # Log-sum-exp over the components for each sample, summed over the samples...
    offsets = probs.max(axis=1)
    probs -= offsets.reshape((-1,1))

    ret = offsets.sum()
    ret += numpy.log(numpy.exp(probs).sum(axis=1)).sum()
    return ret

  def multiSolve(self, runs, testIter=256):
    """Clones this object a number of times, given by the runs parameter, and then runs each clone for testIter iterations, to give them time to converge a bit. It then selects the one with the best nllData() score and runs that to convergence, before returning that specific clone. This is basically a simple way of avoiding getting stuck in a really bad local minimum, though chances are you will end up in another one, just not a terrible one. Obviously testIter limits the effectiveness of this, but as it tends to converge faster when closer to the correct answer hopefully not by much. (To be honest I have not found this method to be of much use - in testing, when this technique converges to the wrong answer it does so consistently, indicating that there is insufficient data regardless of initialisation.)"""
    best = None
    bestNLL = None

    for _ in xrange(runs):
      clone = DPGMM(self)
      clone.solve(testIter)
      score = clone.nllData()
      if bestNLL is None or score<bestNLL:
        best = clone
        bestNLL = score

    best.solve()
    return best

  def multiGrowSolve(self, runs):
    """This effectively does multiple runs of solveGrow, as indicated by runs, and returns a clone of this object that has converged to the best solution found. This is without a doubt the best solving technique provided by this class, just remember to use the default stickCap of 1 when setting up the object. Also be warned - this can take an awfully long time to produce its awesomeness. Can return self if the starting number of sticks is the best. (Note that self will always be converged after this method returns.)"""
    self.solve()
    best = self
    bestNLL = self.nllData()

    for _ in xrange(runs):
      current = self
      lastScore = None

      while True:
        current = DPGMM(current)
        current.incStickCap()
        current.solve()

        score = current.nllData()
        if score<bestNLL:
          best = current
          bestNLL = score

        if lastScore is not None and score>lastScore: break
        lastScore = score

    return best
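

# A minimal usage sketch: fits the model to synthetic data drawn from two Gaussians, then
# prints a density estimate, soft cluster memberships and an effective cluster count. The
# synthetic data, the dimension count and the 5% weight threshold are illustrative choices
# for this sketch; it assumes the gcp module imported above is available on the path.
if __name__ == '__main__':
  numpy.random.seed(0)

  # Two well separated 2D Gaussian clusters...
  a = numpy.random.multivariate_normal([0.0, 0.0], numpy.identity(2), 100)
  b = numpy.random.multivariate_normal([6.0, 6.0], numpy.identity(2), 100)

  model = DPGMM(2) # 2 dimensions, default stick cap of 1.
  model.add(a)
  model.add(b)

  model.setPrior() # Data-driven prior - requires the samples to already be added.
  model.setConcGamma(1.0, 1.0)

  model.solveGrow() # Grows the stick cap until the model stops improving.

  # Density estimate at a point, and soft stick membership (last entry = 'unknown cluster')...
  print('p([0,0]) = %f' % model.prob(numpy.array([0.0, 0.0])))
  print('stickProb([6,6]) = %s' % str(model.stickProb(numpy.array([6.0, 6.0]))))

  # Effective cluster count by thresholding the expected mixture weights...
  weights, _ = model.intMixture()
  print('clusters = %i' % (weights[:-1] > 0.05*weights.max()).sum())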