GMM in python and EM

第一部分

1
2
3
4
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import numpy as np
1
2
3
4
5
6
7
8
9
10
from sklearn.datasets.samples_generator import make_blobs
X, y_true = make_blobs(n_samples=400, centers=4,
cluster_std=0.60, random_state=0)
X = X[:, ::-1] # flip axes for better plotting

# Plot the data with K Means Labels
from sklearn.cluster import KMeans
kmeans = KMeans(4, random_state=0)
labels = kmeans.fit(X).predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');

png

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

def plot_kmeans(kmeans, X, n_clusters=4, rseed=0, ax=None):
labels = kmeans.fit_predict(X)

# plot the input data
ax = ax or plt.gca()
ax.axis('equal')
ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)

# plot the representation of the KMeans model
centers = kmeans.cluster_centers_
radii = [cdist(X[labels == i], [center]).max()
for i, center in enumerate(centers)]
for c, r in zip(centers, radii):
ax.add_patch(plt.Circle(c, r, fc='#CCCCCC', lw=3, alpha=0.5, zorder=1))
1
2
kmeans = KMeans(n_clusters=4, random_state=0)
plot_kmeans(kmeans, X)

png

1
2
3
4
5
rng = np.random.RandomState(13)
X_stretched = np.dot(X, rng.randn(2, 2))

kmeans = KMeans(n_clusters=4, random_state=0)
plot_kmeans(kmeans, X_stretched)

png

使用GMM

1
2
3
4
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=4).fit(X)
labels = gmm.predict(X)
plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis');

png

1
2
probs = gmm.predict_proba(X)
print(probs[:5].round(3))
[[0.    0.463 0.    0.537]
 [0.    0.    1.    0.   ]
 [0.    0.    1.    0.   ]
 [0.    0.    0.    1.   ]
 [0.    0.    1.    0.   ]]
1
2
size = 50 * probs.max(1) ** 2  # square emphasizes differences
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis', s=size);

png

mnist数据及生成

1
from sklearn.mixture import GaussianMixture
1
2
3
from sklearn.datasets import load_digits
digits = load_digits()
digits.data.shape
(1797, 64)
1
2
3
4
5
6
7
8
def plot_digits(data):
fig, ax = plt.subplots(10, 10, figsize=(8, 8),
subplot_kw=dict(xticks=[], yticks=[]))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
for i, axi in enumerate(ax.flat):
im = axi.imshow(data[i].reshape(8, 8), cmap='binary')
im.set_clim(0, 16)
plot_digits(digits.data)

png

1
2
3
4
from sklearn.decomposition import PCA
pca = PCA(0.99, whiten=True)
data = pca.fit_transform(digits.data)
data.shape
(1797, 41)
1
2
3
4
5
n_components = np.arange(50, 210, 10)
models = [GaussianMixture(n, covariance_type='full', random_state=0)
for n in n_components]
aics = [model.fit(data).aic(data) for model in models]
plt.plot(n_components, aics);

png

1
2
3
gmm = GaussianMixture(110, covariance_type='full', random_state=0)
gmm.fit(data)
print(gmm.converged_)
True
1
2
data_new = gmm.sample(100)
print(len(data_new))
2
1
data_new = data_new[0]
1
2
digits_new = pca.inverse_transform(data_new)
plot_digits(digits_new)

png

第二部分:使用python从头开始实现gmm

初始化

  1. Decide how many sources/clusters (c) you want to fit to your data

  2. Initialize the parameters mean$\mu_c$, covariance$\sigma_c$, and fraction_per_class $\pi_c$ per cluster c

E-step

  1. Calculate for each datapoint $xi$ the probability $r{ic}$ that datapoint $x_i$ belongs to cluster c with:

where $N(x|\mu,\Sigma)$describes the mulitvariate Gaussian with:

$r_{ic}$ gives us for each datapoint $x_i$ the measure of:

$\frac{\mathrm{probability\ that\ xi\ belongs\ to\ class\ c}}{\mathrm{probability\ of\ x_i\ over\ all\ classes}}$ ,hence if $x_i$ is very close to one gaussian c, it will get a high $r{ic}$ value for this gaussian and relatively low values otherwise.

M-step

for each cluster c: Calculate the total weight $mc$(loosely speaking the fraction of points allocated to cluster c) and update $\pi_c$,$\mu_c$and $\Sigma_c$ using $r{ic}$with:

注意:最好一个式子所使用的$\mu_c$是前面更新后的$\mu_c$。

Iteratively repeat the E and M step until the log-likelihood function of our model converges where the log likelihood is computed with:

E-Step

  1. Decide how many sources/clusters(c) you want to fit to your data —> Mind that each cluster c is represented by gaussian g
  2. Initialize the parameters mean $\mu_c$,covariance $\Sigma_c$,and fraction_per_class $\pi_c$ per cluster c
  3. calculate for each datapoint $xi$ the probability $r{ic}$ that datapoint $x_i$ belongs to cluster c with:

where $N(x|\mu,\Sigma)$describes the mulitvariate Gaussian with:

$r_{ic}$ gives us for each datapoint $x_i$ the measure of:

$\frac{\mathrm{probability\ that\ xi\ belongs\ to\ class\ c}}{\mathrm{probability\ of\ x_i\ over\ all\ classes}}$ ,hence if $x_i$ is very close to one gaussian c, it will get a high $r{ic}$ value for this gaussian and relatively low values otherwise.

1
2
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import numpy as np
from scipy.stats import norm
np.random.seed(0)

X = np.linspace(-5,5,num=20)
X0 = X*np.random.rand(len(X))+10 #create data cluster 1
X1 = X*np.random.rand(len(X))-10 # Create data cluster 2
X2 = X*np.random.rand(len(X)) # Create data cluster 3
X_tot = np.stack((X0,X1,X2)).flatten()

"""
E-Step
"""

"""Create the array r with dimensionality nxK"""
r = np.zeros((len(X_tot),3))
print('Dimensionality','=',np.shape(r))

"""Instantiate the random gaussians"""
gauss_1 = norm(loc=-5,scale=5)
gauss_2 = norm(loc=8,scale=3)
gauss_3 = norm(loc=1.5,scale=1)

"""Instantiate the random pi_c,m_c"""
m_c = np.array([1/3,1/3,1/3]) #wwe expect to have three clusters
pi_c = m_c/np.sum(m_c)

"""Probability for each datapoint x_i to belong to gaussian g """
for c,g,p in zip(range(3),[gauss_1,gauss_2,gauss_3],pi_c):
r[:,c] = p*g.pdf(X_tot) # Write the probability that x belongs to gaussian c in column c.

"""Normalize the probabilities such that each row of r sums to 1"""
for i in range(len(r)):
r[i] = r[i]/np.sum(r,axis=1)[i]

print(r)
print(np.sum(r,axis=1)) # As we can see, as result each row sums up to one, just as we want it.
Dimensionality = (60, 3)
[[2.97644006e-02 9.70235407e-01 1.91912550e-07]
 [3.85713024e-02 9.61426220e-01 2.47747304e-06]
 [2.44002651e-02 9.75599713e-01 2.16252823e-08]
 [1.86909096e-02 9.81309090e-01 8.07574590e-10]
 [1.37640773e-02 9.86235923e-01 9.93606589e-12]
 [1.58674083e-02 9.84132592e-01 8.42447356e-11]
 [1.14191259e-02 9.88580874e-01 4.48947365e-13]
 [1.34349421e-02 9.86565058e-01 6.78305927e-12]
 [1.11995848e-02 9.88800415e-01 3.18533028e-13]
 [8.57645259e-03 9.91423547e-01 1.74498648e-15]
 [7.64696969e-03 9.92353030e-01 1.33051021e-16]
 [7.10275112e-03 9.92897249e-01 2.22285146e-17]
 [6.36154765e-03 9.93638452e-01 1.22221112e-18]
 [4.82376290e-03 9.95176237e-01 1.55549544e-22]
 [7.75866904e-03 9.92241331e-01 1.86665135e-16]
 [7.52759691e-03 9.92472403e-01 9.17205413e-17]
 [8.04550643e-03 9.91954494e-01 4.28205323e-16]
 [3.51864573e-03 9.96481354e-01 9.60903037e-30]
 [3.42631418e-03 9.96573686e-01 1.06921949e-30]
 [3.14390460e-03 9.96856095e-01 3.91217273e-35]
 [1.00000000e+00 2.67245688e-12 1.56443629e-57]
 [1.00000000e+00 4.26082753e-11 9.73970426e-49]
 [9.99999999e-01 1.40098281e-09 3.68939866e-38]
 [1.00000000e+00 2.65579518e-10 4.05324196e-43]
 [9.99999977e-01 2.25030673e-08 3.11711096e-30]
 [9.99999997e-01 2.52018974e-09 1.91287930e-36]
 [9.99999974e-01 2.59528826e-08 7.72534540e-30]
 [9.99999996e-01 4.22823192e-09 5.97494463e-35]
 [9.99999980e-01 1.98158593e-08 1.38414545e-30]
 [9.99999966e-01 3.43722391e-08 4.57504394e-29]
 [9.99999953e-01 4.74290492e-08 3.45975850e-28]
 [9.99999876e-01 1.24093364e-07 1.31878573e-25]
 [9.99999878e-01 1.21709730e-07 1.17161878e-25]
 [9.99999735e-01 2.65048706e-07 1.28402556e-23]
 [9.99999955e-01 4.53370639e-08 2.60841891e-28]
 [9.99999067e-01 9.33220139e-07 2.02379180e-20]
 [9.99998448e-01 1.55216175e-06 3.63693167e-19]
 [9.99997285e-01 2.71542629e-06 8.18923788e-18]
 [9.99955648e-01 4.43516655e-05 1.59283752e-11]
 [9.99987200e-01 1.28004505e-05 3.20565446e-14]
 [9.64689131e-01 9.53405294e-03 2.57768163e-02]
 [9.77001731e-01 7.96383733e-03 1.50344317e-02]
 [9.96373670e-01 2.97775078e-03 6.48579562e-04]
 [3.43634425e-01 2.15201653e-02 6.34845409e-01]
 [9.75390877e-01 8.19866977e-03 1.64104537e-02]
 [9.37822997e-01 1.19363656e-02 5.02406373e-02]
 [4.27396946e-01 2.18816340e-02 5.50721420e-01]
 [3.28570544e-01 2.14190231e-02 6.50010433e-01]
 [3.62198108e-01 2.16303800e-02 6.16171512e-01]
 [2.99837196e-01 2.11991858e-02 6.78963618e-01]
 [2.21768797e-01 2.04809383e-02 7.57750265e-01]
 [1.76497129e-01 2.01127714e-02 8.03390100e-01]
 [8.23252013e-02 2.50758227e-02 8.92598976e-01]
 [2.11943183e-01 2.03894641e-02 7.67667353e-01]
 [1.50351209e-01 2.00499057e-02 8.29598885e-01]
 [1.54779991e-01 2.00449518e-02 8.25175057e-01]
 [7.92109803e-02 5.93118654e-02 8.61477154e-01]
 [9.71905134e-02 2.18698473e-02 8.80939639e-01]
 [7.60625670e-02 4.95831879e-02 8.74354245e-01]
 [8.53513721e-02 2.40396004e-02 8.90609028e-01]]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
1
2
3
4
5
6
7
8
9
10
11
12
"""Plot the data"""
fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
for i in range(len(r)):
ax0.scatter(X_tot[i],0,c=np.array([r[i][0],r[i][1],r[i][2]]),s=100) # We have defined the first column as red, the second as
# green and the third as blue
"""plot the gaussians"""
for g,c in zip([gauss_1.pdf(np.linspace(-15,15)),gauss_2.pdf(np.linspace(-15,15)),gauss_3.pdf(np.linspace(-15,15))],['r','g','b']):
ax0.plot(np.linspace(-15,15),g,c=c,zorder=0)


plt.show()

png

M-step

如何利用E-step中得到的信息?

So why did this help us? Well, we now know the probability for each point to belong to each gaussian.
What can we do with this information? Well, with this information we can calculate a new mean as well as a new variance (in 1D) or covariance matrix in > 1D datasets. What will be the result of that? Well, this would change the location of each gaussian in the direction of the “real” mean and would re-shape each gaussian using a value for the variance which is closer to the “real” variance. This procedure is called the Maximization step of the EM algorithm. The Maximization step looks as follows:

for each cluster c: Calculate the total weight $mc$(loosely speaking the fraction of points allocated to cluster c) and update $\pi_c$,$\mu_c$and $\Sigma_c$ using $r{ic}$with:

注意:最好一个式子所使用的$\mu_c$是前面更新后的$\mu_c$。

so let’s look at out plot if we do the above updates,that is run the first EM iteration.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
"""
M-step
"""

"""calculate m_c"""
m_c = []
for c in range(len(r[0])):
m = np.sum(r[:,c])
m_c.append(m) #for each cluster c, calculate the m_c and add it to the list m_c

"""calculate pi_c"""
pi_c = []
for m in m_c:
pi_c.append(m/np.sum(m_c)) #For each cluster c, calculate the fraction of points pi_c which belongs to cluster c

"""calculate mu_c"""
mu_c = np.sum(X_tot.reshape(len(X_tot),1)*r, axis=0)/m_c

"""calculate var_c"""
var_c = []
for c in range(len(r[0])):
var_c.append((1/m_c[c])*np.dot(((np.array(r[:,c]).reshape(60,1))*(
X_tot.reshape(len(X_tot),1)-mu_c[c])).T, (X_tot.reshape(len(X_tot),1)-mu_c[c])))

"""update the gaussians"""
gauss_1 = norm(loc=mu_c[0],scale=var_c[0])
gauss_2 = norm(loc=mu_c[1],scale=var_c[1])
gauss_3 = norm(loc=mu_c[2],scale=var_c[2])
1
2
3
4
5
6
7
8
9
10
11
12
#可视化更新后的高斯分布
fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
for i in range(len(r)):
ax0.scatter(X_tot[i],0,c=np.array([r[i][0],r[i][1],r[i][2]]),s=100)

"""plot the gaussians"""
for g,c in zip([gauss_1.pdf(np.sort(X_tot).reshape(60,1)),gauss_2.pdf(np.sort(X_tot).reshape(60,1))\
,gauss_3.pdf(np.sort(X_tot).reshape(60,1))],['r','g','b']):
ax0.plot(np.sort(X_tot),g,c=c)

plt.show()

png

so as you can see the occurence of our gaussians changed dramatically after the first EM iteration.Let’s update r and illustrate the coloring of the points.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
"""update r"""
#mind that we use the new pi_c here
for c,g,p in zip(range(3),[gauss_1,gauss_2,gauss_3],pi_c):
r[:,c] = p*g.pdf(X_tot)

"""
Normalize the probabilities such that each row of r sums to 1 and
weight it by mu_c==the fraction of points belonging to cluster c
"""

for i in range(len(r)):
r[i] = r[i]/(np.sum(pi_c)*np.sum(r,axis=1)[i])

"""plot the data"""
fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
for i in range(len(r)):
ax0.scatter(X_tot[i],0,c=np.array([r[i][0],r[i][1],r[i][2]]),s=100) #we have defined the first column as red,the second as
#green and the third as blue
"""plot the gaussians"""
for g,c in zip([gauss_1.pdf(np.sort(X_tot).reshape(60,1)),gauss_2.pdf(
np.sort(X_tot).reshape(60,1)),gauss_3.pdf(np.sort(X_tot).reshape(60,1))],['r','g','b']):
ax0.plot(np.sort(X_tot),g,c=c)

plt.show()

png

迭代10次EM过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
import numpy as np
from scipy.stats import norm
np.random.seed(0)
X = np.linspace(-5,5,num=20)
X0 = X*np.random.rand(len(X))+15 # Create data cluster 1
X1 = X*np.random.rand(len(X))-15 # Create data cluster 2
X2 = X*np.random.rand(len(X)) # Create data cluster 3
X_tot = np.stack((X0,X1,X2)).flatten() # Combine the clusters to get the random datapoints from above
class GM1D:
def __init__(self,X,iterations):
self.iterations = iterations
self.X = X
self.mu_c = None
self.pi_c = None
self.var_c = None

def run(self):

"""
Instantiate the random mu, pi and var
"""
self.mu_c = [-8,8,5]
self.pi_c = [1/3,1/3,1/3]
self.var_c = [5,3,1]



"""
E-Step
"""

for iter in range(self.iterations):
"""Create the array r with dimensionality nxK"""
r = np.zeros((len(X_tot),3))

"""
Probability for each datapoint x_i to belong to gaussian g
"""
for c,g,p in zip(range(3),[norm(loc=self.mu_c[0],scale=self.var_c[0]),
norm(loc=self.mu_c[1],scale=self.var_c[1]),
norm(loc=self.mu_c[2],scale=self.var_c[2])],self.pi_c):
r[:,c] = p*g.pdf(X_tot) # Write the probability that x belongs to gaussian c in column c.
# Therewith we get a 60x3 array filled with the probability that each x_i belongs to one of the gaussians
"""
Normalize the probabilities such that each row of r sums to 1 and weight it by mu_c == the fraction of points belonging to
cluster c
"""
for i in range(len(r)):
r[i] = r[i]/(np.sum(pi_c)*np.sum(r,axis=1)[i])
"""Plot the data"""
fig = plt.figure(figsize=(5,5))
ax0 = fig.add_subplot(111)
for i in range(len(r)):
ax0.scatter(self.X[i],0,c=np.array([r[i][0],r[i][1],r[i][2]]),s=100)
"""Plot the gaussians"""
for g,c in zip([norm(loc=self.mu_c[0],scale=self.var_c[0]).pdf(np.linspace(-20,20,num=60)),
norm(loc=self.mu_c[1],scale=self.var_c[1]).pdf(np.linspace(-20,20,num=60)),
norm(loc=self.mu_c[2],scale=self.var_c[2]).pdf(np.linspace(-20,20,num=60))],['r','g','b']):
ax0.plot(np.linspace(-20,20,num=60),g,c=c)



"""M-Step"""

"""calculate m_c"""
m_c = []
for c in range(len(r[0])):
m = np.sum(r[:,c])
m_c.append(m) # For each cluster c, calculate the m_c and add it to the list m_c
"""calculate pi_c"""
for k in range(len(m_c)):
self.pi_c[k] = (m_c[k]/np.sum(m_c)) # For each cluster c, calculate the fraction of points pi_c which belongs to cluster c
"""calculate mu_c"""
self.mu_c = np.sum(self.X.reshape(len(self.X),1)*r,axis=0)/m_c
"""calculate var_c"""
var_c = []
for c in range(len(r[0])):
var_c.append((1/m_c[c])*np.dot(((np.array(r[:,c]).reshape(60,1))*(self.X.reshape(len(self.X),1)-self.mu_c[c])).T,(self.X.reshape(len(self.X),1)-self.mu_c[c])))
plt.show()
1
2
GM1D = GM1D(X_tot,10)
GM1D.run()

png

png

png

png

png

png

png

png

png

png

multi dimensional case

1
2
3
4
5
6
import matplotlib.pyplot as plt
from matplotlib import style
style.use('fivethirtyeight')
from sklearn.datasets.samples_generator import make_blobs
import numpy as np
from scipy.stats import multivariate_normal
1
2
3
4
#0. Create dataset
X,Y = make_blobs(cluster_std=1.5, random_state=20,n_samples=500,centers=3)
#stratch dataset to get ellipsoid data
X = np.dot(X, np.random.RandomState(0).randn(2,2))
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
class GMM(object):
def __init__(self, X, number_of_sources, iterations):
self.iterations = iterations
self.number_of_sources = number_of_sources
self.X = X
self.mu = None
self.pi = None
self.cov = None
self.XY = None

def run(self):
'''define a function which runs for iterations, iterations'''
self.reg_cov = 1e-6*np.identity(len(self.X[0]))
x,y = np.meshgrid(np.sort(self.X[:,0]),np.sort(self.X[:,1]))
self.XY = np.array([x.flatten(),y.flatten()]).T

"""
1. set the initial mu, covariance and pi values
this is a nxm matrix,since we assume n sources(nGaussians) where each has m dimensions
"""
self.mu = np.random.randint(min(self.X[:,0]), max(self.X[:,0]), size=(self.number_of_sources, len(self.X[0])))
"""
we need a nxmxm covariance matrix for each source since we have m features-->we create symmetric
covariance matrices with ones on the digonal
"""
self.cov = np.zeros((self.number_of_sources, len(X[0]),len(X[0])))

for dim in range(len(self.cov)):
np.fill_diagonal(self.cov[dim],5)

self.pi = np.ones(self.number_of_sources)/self.number_of_sources #Are 'fractions'
log_likelihoods = [] #In this list we store the log likehoods per iteration and plot them in the end to check if we have converged

"""plot the initial state"""
fig = plt.figure(figsize=(10,10))
ax0 = fig.add_subplot(111)
ax0.scatter(self.X[:,0],self.X[:,1])
ax0.set_title('Initial state')
for m,c in zip(self.mu, self.cov):
c+=self.reg_cov
multi_normal = multivariate_normal(mean=m,cov=c)
ax0.contour(np.sort(self.X[:,0]),np.sort(self.X[:,1]),multi_normal.pdf(self.XY).reshape(len(self.X),len(self.X)),
colors='black',alpha=0.3)
ax0.scatter(m[0],m[1],c='grey',zorder=10,s=100)

for i in range(self.iterations):
'''E step'''
r_ic = np.zeros((len(self.X),len(self.cov)))
for m,co,p,r in zip(self.mu, self.cov, self.pi, range(len(r_ic[0]))):
co+=self.reg_cov
mn = multivariate_normal(mean=m, cov=co)
r_ic[:,r] = p*mn.pdf(self.X)/np.sum([pi_c*multivariate_normal(mean=mu_c,cov=cov_c).pdf(X)
for pi_c,mu_c,cov_c in zip(self.pi,self.mu,self.cov+self.reg_cov)],axis=0)
"""
the above calculation of r_ic is not that obvious why I want to quickly derive what we have done above.

First of all the nominator:
we calculate for each source c which is defined by m,co and p for every instance x_i, the multivatiate_normal.pdf()value.

for each loop this gives us a 100x1 matrix(this value divided by the denominator is then assigned to
r_ic[:,r] which is in the end a 100x3 matrix).

Second the denominator:

What we do here is, we calculate the multivariate_normal.pdf() for every instance x_i for every source c
which is defined by pi_c,mu_c,and cov_c and write this into a list. This gives us a 3x100 matrix where we
have 100 entrances per source c.

Now the formula wants us to add up the pdf() values given by the 3 sources for each x_i. Hence we sum up
this list over axis=0.

this gives us then a list with 100 entries.

What we have now is FOR EACH LOOP a list with 100 entries in the nominator and a list with 100 entries
in the denominator where each element is the pdf per class c for each instance x_i(nominator) respectively
the summed pdf's of classes c for each instance x_i.

Consequently we can now divide the nominator by the denominator and have as result a list with 100 elements
which we can then assign to r_ic[:,r] --> One row r per source c. In the end after we have done this for all
three sources (three loops) and run from r==0 to r==2 we get a matrix with dimensionallity 100x3 which is
exactly what we want.

If we check the entries of r_ic we see that there mostly one element which is much larger than the other two.
This is because every instance x_i is much closer to one of the three gaussians (that is, much more likely to
come from this gaussian) than it is to the other two. That is practically speaing, r_ic gives us the fraction
of the probability that x_i belongs to class c over the probability that x_i belonges to any of the classes c
(Probability that x_i occurs given the 3 Gaussians).
"""

"""
M-Step

Calculate the new mean vector and new covariance matrices, based on the probable membership of the single x_i to
classes c-->r_ic
"""
self.mu = []
self.cov = []
self.pi = []
log_likelihood = []
for c in range(len(r_ic[0])):
m_c = np.sum(r_ic[:,c],axis=0)
mu_c = (1/m_c)*np.sum(self.X*r_ic[:,c].reshape(len(self.X),1),axis=0)
self.mu.append(mu_c)
#Calculate the covariance matrix per source based on the new mean
self.cov.append(((1/m_c)*np.dot((np.array(r_ic[:,c]).reshape(len(self.X),1)*(self.X-mu_c)).T,(self.X-mu_c)))+self.reg_cov)

#Calculate pi_new which is the 'fraction of points' reprectively the fraction of the probability assigned to each source
"""
Here np.sum(r_ic) gives as result the number of instances. This is logical since we know that the columns of each row of r_ic adds up to 1.
Since we add up all elements, we sum up all columns per row which gives 1 and then all rows which gives then the number of instances (rows)
in X --> Since pi_new contains the fractions of datapoints, assigned to the sources c,The elements in pi_new must add up to 1
"""
self.pi.append(m_c/np.sum(r_ic))

"""Log likelihood"""
log_likelihoods.append(np.log(np.sum([k*multivariate_normal(self.mu[i],self.cov[j]).pdf(X) for k,i,j in
zip(self.pi,range(len(self.mu)),range(len(self.cov)))])))

"""
This process of E step followed by a M step is now iterated a number of n times. In the second step for instance,
we use the calculated pi_new, mu_new and cov_new to calculate the new r_ic which are then used in the second M step
to calculat the mu_new2 and cov_new2 and so on....
"""
fig2 = plt.figure(figsize=(10,10))
ax1 = fig2.add_subplot(111)
ax1.set_title('Log-Likelihood')
ax1.plot(range(0,self.iterations,1),log_likelihoods)
#plt.show()

def predict(self,Y):
'''predict the membership of an unseen,new datapoint'''
#Plot the point onto the fittet gaussians
fig3 = plt.figure(figsize=(10,10))
ax2 = fig3.add_subplot(111)
ax2.scatter(self.X[:,0],self.X[:,1])
for m,c in zip(self.mu, self.cov):
multi_normal = multivariate_normal(mean=m,cov=c)
ax2.contour(np.sort(self.X[:,0]),np.sort(self.X[:,1]),multi_normal.pdf(self.XY).reshape(len(self.X),len(self.X)),colors='black',alpha=0.3)
ax2.scatter(m[0],m[1],c='grey',zorder=10,s=100)
ax2.set_title('Final state')
for y in Y:
ax2.scatter(y[0],y[1],c='orange',zorder=10,s=100)

prediction = []
for m,c in zip(self.mu, self.cov):
prediction.append(multivariate_normal(mean=m,cov=c).pdf(Y)/np.sum([multivariate_normal(mean=mean,cov=cov).pdf(Y) for mean,cov in zip(self.mu,self.cov)]))

plt.show()
return prediction
1
2
3
GMM = GMM(X,3,50)
GMM.run()
GMM.predict([[0.5,0.5]])

png

png

png

[0.8429320897452095, 0.15706464234352321, 3.2679112672282063e-06]

参考资料