Download as pdf or txt
Download as pdf or txt
You are on page 1 of 6

Week-08 Central Limit Theorem

[6]: import pandas as pd


import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random
# Seed Python's global RNG so the random.sample() draws below are reproducible.
# This must be a call: assigning `random.seed = 42` would merely replace the
# seeding function with an int and leave the generator unseeded.
random.seed(42)
import warnings
warnings.filterwarnings("ignore")

import plotly.offline as offline


import plotly.graph_objs as go
offline.init_notebook_mode()
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from prettytable import PrettyTable
from IPython.display import HTML, display
plt.xkcd()

[6]: <contextlib.ExitStack at 0x1e6cbd0d790>

# Cell [7]
# Source data: the Black Friday purchases file (train.csv), read from the
# current working directory.
df = pd.read_csv('train.csv')

# Report the frame's shape; pandas gives it as a (rows, columns) tuple.
print("number of data points in our popultion:", df.shape)
#print("% of missing values",df['Purchase'].isnull().sum() * 100 / len(df))

# Preview the first ten records.
df.head(10)

number of data points in our popultion: (537577, 12)


[7]: User_ID Product_ID Gender Age Occupation City_Category \
0 1000001 P00069042 F 0-17 10 A
1 1000001 P00248942 F 0-17 10 A

1
2 1000001 P00087842 F 0-17 10 A
3 1000001 P00085442 F 0-17 10 A
4 1000002 P00285442 M 55+ 16 C
5 1000003 P00193542 M 26-35 15 A
6 1000004 P00184942 M 46-50 7 B
7 1000004 P00346142 M 46-50 7 B
8 1000004 P0097242 M 46-50 7 B
9 1000005 P00274942 M 26-35 20 A

Stay_In_Current_City_Years Marital_Status Product_Category_1 \


0 2 0 3
1 2 0 1
2 2 0 12
3 2 0 12
4 4+ 0 8
5 3 0 1
6 2 1 1
7 2 1 1
8 2 1 1
9 1 1 8

Product_Category_2 Product_Category_3 Purchase


0 NaN NaN 8370
1 6.0 14.0 15200
2 NaN NaN 1422
3 14.0 NaN 1057
4 NaN NaN 7969
5 2.0 NaN 15227
6 8.0 17.0 19215
7 15.0 NaN 15854
8 16.0 NaN 15686
9 NaN NaN 7871

# Cell [8]
# Every recorded purchase amount is treated as the full population.
data = np.array(df['Purchase'].values)

# Population parameters, rounded to three decimals. ndarray.std() is called
# with its defaults here.
population_mean = np.round(np.mean(data), 3)
population_std = np.round(np.std(data), 3)

print("Number of samples in our data: ", len(data))

# Distribution of the raw purchase amounts.
sns.distplot(data, color='g')
plt.show()

Number of samples in our data: 537577

2
# Cell [9]
def get_means_of_n_samples_with_m_size(data, n, m, rng=None):
    """Draw ``n`` random samples of ``m`` elements each and return their means.

    Each sample is drawn without replacement (via ``sample(range(len), m)``),
    and samples are drawn independently of one another.

    Parameters
    ----------
    data : array-like
        One-dimensional collection of numeric values. It is converted with
        ``np.asarray`` so plain lists work as well as NumPy arrays.
    n : int
        Number of samples to draw (i.e. number of means returned).
    m : int
        Number of elements in each sample.
    rng : random.Random, optional
        Source of randomness exposing ``sample(population, k)``. When omitted,
        the module-level ``random`` generator is used, exactly as before.

    Returns
    -------
    list
        ``n`` sample means, in the order the samples were drawn.

    Raises
    ------
    ValueError
        Propagated from ``sample`` when ``m`` exceeds the number of elements
        in ``data`` (or is negative).
    """
    values = np.asarray(data)
    population_size = values.shape[0]
    source = random if rng is None else rng
    return [
        # Pick m distinct positions, then average the values found there.
        values[source.sample(range(population_size), m)].mean()
        for _ in range(n)
    ]

# Cell [10]
def central_limit_theorem(data, population_mean , i, j, color, key, ax=None):
    """Plot the distribution of sample means on one panel of a subplot grid.

    Draws the distribution of ``data`` and marks two vertical reference lines:
    the population mean (red, dashed, label ``p_mean``) and the mean of
    ``data`` itself (blue, dash-dot, label ``s_mean``).

    Parameters
    ----------
    data : array-like
        Sample means to plot.
    population_mean : float
        Value at which the population-mean reference line is drawn.
    i, j : int
        Row and column of the target panel. Only used when ``ax`` is None.
    color : str
        Colour passed to the distribution plot.
    key : str
        Panel title.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the function falls back to the
        module-level ``axs[i, j]`` grid, which must already exist at call
        time (the original behaviour).
    """
    means = np.asarray(data)
    if ax is None:
        # Backward-compatible path: rely on the notebook-level subplot grid.
        ax = axs[i, j]

    # NOTE(review): seaborn documents distplot as deprecated in favour of
    # histplot/displot; kept here so the figure looks the same as before.
    sns.distplot(means, color=color, ax=ax)
    ax.axvline(population_mean, linestyle="--", color='r', label="p_mean")
    ax.axvline(means.mean(), linestyle="-.", color='b', label="s_mean")
    ax.set_title(key)
    ax.legend()

3
# Cell [11]
# Maps "<n>samples_<m>ele" -> list of n sample means, each from m elements.
sample_means = {}

# Cell [12]
# Fill the dict one configuration at a time, in this fixed order, so the
# random draws are consumed in the same sequence as six explicit calls.
for n_draws, draw_size in [(100, 50), (1000, 50),
                           (100, 100), (1000, 100),
                           (100, 1000), (1000, 1000)]:
    config_key = '{}samples_{}ele'.format(n_draws, draw_size)
    sample_means[config_key] = get_means_of_n_samples_with_m_size(data, n_draws, draw_size)

# Cell [13]
# Single-letter matplotlib colour codes: red, green, blue, yellow, cyan,
# magenta, black. One colour per panel (the seventh is spare).
colrs = ['r','g','b','y', 'c', 'm', 'k']

# (row, col) position of each panel in the 3x2 subplot grid.
plt_grid = [(0,0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]

# (number of samples, elements per sample) for each entry of sample_means,
# in insertion order. The final entry is (1000, 1000): it was previously a
# duplicate of (100, 1000) and did not match the '1000samples_1000ele' key.
sample_sizes = [(100,50), (1000, 50), (100, 100), (1000, 100), (100, 1000),
                (1000, 1000)]

# 3x2 grid: one panel per sampling configuration.
fig, axs = plt.subplots(nrows=3, ncols=2, figsize=(10, 10))

# Walk the configurations in insertion order; the position in that order
# selects both the grid cell and the colour.
for idx, (key, means) in enumerate(sample_means.items()):
    grid_row, grid_col = plt_grid[idx]
    central_limit_theorem(means, population_mean, grid_row, grid_col, colrs[idx], key)

plt.show()

4
# Cell [14]
# Summary table comparing population parameters with the statistics of the
# sample means. \u2248 is the "almost equal" sign and \u221A the square-root sign.
# (A throw-away empty PrettyTable() that was immediately overwritten has been
# removed.)
x = PrettyTable([
    "#samples_name",
    "P_Mean",
    "Sampel mean",
    "P_Std",
    "Sample Std",
    "mu_x\u2248mu",
    "std_x\u2248std/\u221An",
])

5
# One table row per sampling configuration, in insertion order.
for i, (key, means) in enumerate(sample_means.items()):
    means_arr = np.array(means)
    sample_mean = np.round(means_arr.mean(), 3)
    sample_std = np.round(means_arr.std(), 3)
    # Expected spread of the sample means: population std / sqrt(elements per sample).
    elements_per_sample = sample_sizes[i][1]
    population_std_est = np.round(population_std/np.sqrt(elements_per_sample), 3)

    x.add_row([
        key,
        population_mean,
        sample_mean,
        population_std,
        sample_std,
        str(sample_mean) + "\u2248" + str(population_mean),
        str(sample_std) + "\u2248" + str(population_std_est),
    ])

# Print the finished table once, after all rows are added.
print(x)

+---------------------+---------+-------------+----------+------------+---------
---------+-----------------+
| #samples_name | P_Mean | Sampel mean | P_Std | Sample Std |
mu_x≈mu | std_x≈std/√n |
+---------------------+---------+-------------+----------+------------+---------
---------+-----------------+
| 100samples_50ele | 9333.86 | 9293.414 | 4981.017 | 642.308 |
9293.414≈9333.86 | 642.308≈704.422 |
| 1000samples_50ele | 9333.86 | 9304.882 | 4981.017 | 711.299 |
9304.882≈9333.86 | 711.299≈704.422 |
| 100samples_100ele | 9333.86 | 9343.436 | 4981.017 | 529.173 |
9343.436≈9333.86 | 529.173≈498.102 |
| 1000samples_100ele | 9333.86 | 9336.437 | 4981.017 | 508.684 |
9336.437≈9333.86 | 508.684≈498.102 |
| 100samples_1000ele | 9333.86 | 9334.247 | 4981.017 | 147.713 |
9334.247≈9333.86 | 147.713≈157.514 |
| 1000samples_1000ele | 9333.86 | 9331.619 | 4981.017 | 158.095 |
9331.619≈9333.86 | 158.095≈157.514 |
+---------------------+---------+-------------+----------+------------+---------
---------+-----------------+

You might also like