# Fit mixture of two gaussian/normal distributions to a histogram from one set of data, python

Posted on

Problem :

I have one set of data in python. I am plotting this as a histogram, this plot shows a bimodal distribution, therefore I am trying to plot two gaussian profiles over each peak in the bimodality.

If i use the code below is requires me to have two datasets with the same size. however I just have one dataset, and this cannot be divided equally. How can I fit these two gaussians

``````from sklearn import mixture
import matplotlib.pyplot
import matplotlib.mlab
import numpy as np
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(yourdata)
m1, m2 = clf.means_
w1, w2 = clf.weights_
c1, c2 = clf.covars_
histdist = matplotlib.pyplot.hist(yourdata, 100, normed=True)
plotgauss1 = lambda x: plot(x,w1*matplotlib.mlab.normpdf(x,m1,np.sqrt(c1)), linewidth=3)
plotgauss2 = lambda x: plot(x,w2*matplotlib.mlab.normpdf(x,m2,np.sqrt(c2)), linewidth=3)
plotgauss1(histdist)
plotgauss2(histdist)
``````

Solution :

Here a simulation with scipy tools :

``````from pylab import *
from scipy.optimize import curve_fit

data=concatenate((normal(1,.2,5000),normal(2,.2,2500)))
y,x,_=hist(data,100,alpha=.3,label='data')

x=(x[1:]+x[:-1])/2 # for len(x)==len(y)

def gauss(x,mu,sigma,A):
return A*exp(-(x-mu)**2/2/sigma**2)

def bimodal(x,mu1,sigma1,A1,mu2,sigma2,A2):
return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)

expected=(1,.2,250,2,.2,125)
params,cov=curve_fit(bimodal,x,y,expected)
sigma=sqrt(diag(cov))
plot(x,bimodal(x,*params),color='red',lw=3,label='model')
legend()
print(params,'n',sigma)
``````

The data is the superposition of two normal samples, the model a sum of Gaussian curves. we obtain : And the estimate parameters are :

``````# via pandas :
# pd.DataFrame(data={'params':params,'sigma':sigma},index=bimodal.__code__.co_varnames[1:])
params     sigma
mu1       0.999447  0.002683
sigma1    0.202465  0.002696
A1      226.296279  2.597628
mu2       2.003028  0.005036
sigma2    0.193235  0.005058
A2      117.823706  2.658789
``````

This can be achieved in a clean and simple way using `sklearn` Python library:

``````import numpy as np
from sklearn.mixture import GaussianMixture
from pylab import concatenate, normal

# First normal distribution parameters
mu1 = 1
sigma1 = 0.1

# Second normal distribution parameters
mu2 = 2
sigma2 = 0.2

w1 = 2/3 # Proportion of samples from first distribution
w2 = 1/3 # Proportion of samples from second distribution

n = 7500 # Total number of samples

n1 = int(n*w1) # Number of samples from first distribution
n2 = int(n*w2) # Number of samples from second distribution

# Generate n1 samples from the first normal distribution and n2 samples from the second normal distribution
X = concatenate((normal(mu1, sigma1, n1), normal(mu2, sigma2, n2))).reshape(-1, 1)

# Determine parameters mu1, mu2, sigma1, sigma2, w1 and w2
gm = GaussianMixture(n_components=2, random_state=0).fit(X)
print(f'mu1={gm.means_}, mu2={gm.means_}')
print(f'sigma1={np.sqrt(gm.covariances_)}, sigma2={np.sqrt(gm.covariances_)}')
print(f'w1={gm.weights_}, w2={gm.weights_}')
print(f'n1={int(n * gm.weights_)} n2={int(n * gm.weights_)}')
``````

As the use of pylab is now strongly discouraged by matplotlib because of namespace cluttering, I have rewritten the script by B.M. and added a plot of the two components:

``````import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit

#data generation
np.random.seed(123)
data=np.concatenate((np.random.normal(1, .2, 5000), np.random.normal(1.6, .3, 2500)))
y,x,_=plt.hist(data, 100, alpha=.3, label='data')
x=(x[1:]+x[:-1])/2 # for len(x)==len(y)

#x, y inputs can be lists or 1D numpy arrays

def gauss(x, mu, sigma, A):
return A*np.exp(-(x-mu)**2/2/sigma**2)

def bimodal(x, mu1, sigma1, A1, mu2, sigma2, A2):
return gauss(x,mu1,sigma1,A1)+gauss(x,mu2,sigma2,A2)

expected = (1, .2, 250, 2, .2, 125)
params, cov = curve_fit(bimodal, x, y, expected)
sigma=np.sqrt(np.diag(cov))
x_fit = np.linspace(x.min(), x.max(), 500)
#plot combined...
plt.plot(x_fit, bimodal(x_fit, *params), color='red', lw=3, label='model')
#...and individual Gauss curves
plt.plot(x_fit, gauss(x_fit, *params[:3]), color='red', lw=1, ls="--", label='distribution 1')
plt.plot(x_fit, gauss(x_fit, *params[3:]), color='red', lw=1, ls=":", label='distribution 2')
#and the original data points if no histogram has been created before
#plt.scatter(x, y, marker="X", color="black", label="original data")
plt.legend()
print(pd.DataFrame(data={'params': params, 'sigma': sigma}, index=bimodal.__code__.co_varnames[1:]))
plt.show()
`````` ``````            params     sigma
mu1       1.014589  0.005273
sigma1    0.203826  0.004067
A1      230.654585  3.667409
mu2       1.635225  0.022423
sigma2    0.282690  0.019070
A2       72.621856  2.148459
``````

This answer extensively uses B. M. code but has been modified to
get the bimodal distribution parameters so as to have a statistical distribution that integrates to 1. Detailed comments explaining every step and potential pitfalls

``````"""
This function will return the number of items at given x values
mu1, sigma1 and m2, sigma2 are the sought for normal distributions parameters
p is the proportion of draws from the first distribution, 1-p is the proportion
of draws from the second distribution. A is a scaling factor that is multiplied
by the density gives a number of items in a bin.
"""
def bimodal(x,mu1,sigma1,mu2,sigma2, p, A):
return p * A * norm.pdf(x,mu1,sigma1)+ (1-p) * A * norm.pdf(x,mu2,sigma2)

"""
Determine the best combination of normal distributions parameters, proportion of
of first distribution and scaling factor so that we match the number of draw in
each bin.
It should be noted that the initial guess has to be close enough, otherwise
curve_fit will not converge. As such, under the assumption that the
means and variances are unknown, a good guess could be gleaned by looking
at the histogram.
"""
initial_guess=(0.5, 0.3, 2.5, 0.1, 0.5, 500)
params,cov=curve_fit(bimodal, x, y, initial_guess)

"""
Plot the expected number of items using the parameterized model on top of the
histogram.
"""
sigma=sqrt(diag(cov))
plt.plot(x,bimodal(x,*params),color='red',lw=3,label='model')
plt.legend()
print(params,'n',sigma)

"""
Normalize y values by dividing by the scaling factor and show the bimodal
distribution plot.
"""
y = y / params[-1] # Normalize data
params[-1]=1;      # Get rid of the scaling factor
plt.figure()       # Create a new figure
plt.scatter(x, y); # Plot normalized data
# Plot the bimodal distribution that integrates to 1
plt.plot(np.linspace(0.5, 3, 1000), bimodal(np.linspace(0.5, 3, 1000), *params), color='red')
``````